From ede99dcd33bfc6389605034eb8853dca736cfe1a Mon Sep 17 00:00:00 2001 From: Jackson Harper Date: Sat, 30 Mar 2024 10:42:01 +0800 Subject: [PATCH] Better matching for youtube video URLs --- packages/api/package.json | 5 +- .../api/src/jobs/process-youtube-video.ts | 3 +- packages/api/src/pubsub.ts | 17 +--- packages/api/src/utils/youtube.ts | 61 +++++++++++++ packages/api/test/utils/youtube.test.ts | 86 +++++++++++++++++++ 5 files changed, 155 insertions(+), 17 deletions(-) create mode 100644 packages/api/src/utils/youtube.ts create mode 100644 packages/api/test/utils/youtube.test.ts diff --git a/packages/api/package.json b/packages/api/package.json index f6db9603c..23d509a8f 100644 --- a/packages/api/package.json +++ b/packages/api/package.json @@ -78,7 +78,6 @@ "image-size": "^1.0.2", "intercom-client": "^3.1.4", "ioredis": "^5.3.2", - "redis": "^4.6.13", "jsonwebtoken": "^8.5.1", "jwks-rsa": "^2.0.3", "langchain": "^0.1.21", @@ -95,6 +94,7 @@ "posthog-node": "^3.6.3", "private-ip": "^2.3.3", "prom-client": "^15.1.0", + "redis": "^4.6.13", "rss-parser": "^3.13.0", "sanitize-html": "^2.3.2", "sax": "^1.3.0", @@ -106,6 +106,7 @@ "typeorm": "^0.3.4", "typeorm-naming-strategies": "^4.1.0", "underscore": "^1.13.6", + "url-pattern": "^1.0.3", "urlsafe-base64": "^1.0.0", "uuid": "^8.3.1", "voca": "^1.4.0", @@ -167,4 +168,4 @@ "volta": { "extends": "../../package.json" } -} \ No newline at end of file +} diff --git a/packages/api/src/jobs/process-youtube-video.ts b/packages/api/src/jobs/process-youtube-video.ts index d5e80b1aa..7369aedb6 100644 --- a/packages/api/src/jobs/process-youtube-video.ts +++ b/packages/api/src/jobs/process-youtube-video.ts @@ -14,6 +14,7 @@ import { enqueueProcessYouTubeTranscript } from '../utils/createTask' import { stringToHash } from '../utils/helpers' import { logger } from '../utils/logger' import { parsePreparedContent } from '../utils/parser' +import { videoIdFromYouTubeUrl } from '../utils/youtube' export interface ProcessYouTubeVideoJobData { userId: string @@ -334,7 +335,7 @@ export const processYouTubeVideo = async ( } videoURL = new URL(libraryItem.originalUrl) - const videoId = videoURL.searchParams.get('v') + const videoId = videoIdFromYouTubeUrl(libraryItem.originalUrl) if (!videoId) { logger.warning('no video id for supplied youtube url', { diff --git a/packages/api/src/pubsub.ts b/packages/api/src/pubsub.ts index 2214d787b..4401cfad9 100644 --- a/packages/api/src/pubsub.ts +++ b/packages/api/src/pubsub.ts @@ -10,23 +10,12 @@ import { enqueueWebhookJob, } from './utils/createTask' import { buildLogger } from './utils/logger' +import { isYouTubeVideoURL } from './utils/youtube' const logger = buildLogger('pubsub') const client = new PubSub() -const isYouTubeVideoURL = (url: string | undefined): boolean => { - if (!url) { - return false - } - const u = new URL(url) - if (!u.host.endsWith('youtube.com') && !u.host.endsWith('youtu.be')) { - return false - } - const videoId = u.searchParams.get('v') - return videoId != null -} - export const createPubSubClient = (): PubsubClient => { const publish = (topicName: string, msg: Buffer): Promise => { if (env.dev.isLocal) { @@ -93,11 +82,11 @@ export const createPubSubClient = (): PubsubClient => { // }) // } - const isYoutubeVideo = (data: any): data is { originalUrl: string } => { + const isItemWithURL = (data: any): data is { originalUrl: string } => { return 'originalUrl' in data } - if (isYoutubeVideo(data) && isYouTubeVideoURL(data['originalUrl'])) { + if (isItemWithURL(data) && isYouTubeVideoURL(data['originalUrl'])) { await enqueueProcessYouTubeVideo({ userId, libraryItemId, diff --git a/packages/api/src/utils/youtube.ts b/packages/api/src/utils/youtube.ts new file mode 100644 index 000000000..8a224e188 --- /dev/null +++ b/packages/api/src/utils/youtube.ts @@ -0,0 +1,61 @@ +import UrlPattern from 'url-pattern' + +export const videoIdFromYouTubeUrl = ( + urlString: string | undefined +): string | undefined => { + if (!urlString) { + return undefined + } + + const url = new URL(urlString) + if ( + !url.hostname.endsWith('youtube.com') && + !url.hostname.endsWith('youtu.be') + ) { + return undefined + } + + const videoId = url.searchParams.get('v') + if (videoId) { + return videoId || undefined + } + + const parsed = (() => { + const parsedUrl = new URL(url) + parsedUrl.search = '' + return parsedUrl.toString() + })() + + const shortVideo = new UrlPattern('http(s)\\://(www.)youtu.be/:videoId') + const directVideo = new UrlPattern( + '(http(s)\\://)(www.)youtube.com/v/:videoId' + ) + const embedVideo = new UrlPattern( + '(http(s)\\://)(www.)youtube.com/embed/:videoId' + ) + + let params = shortVideo.match(parsed) as Record + if (params && params.videoId) { + return params.videoId + } + + params = directVideo.match(parsed) as Record + if (params && params.videoId) { + return params.videoId + } + + params = embedVideo.match(parsed) as Record + if (params && params.videoId) { + return params.videoId + } + + return undefined +} + +export const isYouTubeVideoURL = (url: string | undefined): boolean => { + if (!url) { + return false + } + const videoId = videoIdFromYouTubeUrl(url) + return videoId != null +} diff --git a/packages/api/test/utils/youtube.test.ts b/packages/api/test/utils/youtube.test.ts new file mode 100644 index 000000000..7804a15b7 --- /dev/null +++ b/packages/api/test/utils/youtube.test.ts @@ -0,0 +1,86 @@ +import 'mocha' +import { expect } from 'chai' +import { + isYouTubeVideoURL, + videoIdFromYouTubeUrl, +} from '../../src/utils/youtube' + +describe('videoIdFromYouTubeUrl', () => { + it('Returns video id for video with playlist id', () => { + const result = videoIdFromYouTubeUrl( + 'https://www.youtube.com/watch?v=kfchvCyHmsc&list=PLDyKn8uKYtRalFdBWtv_EjDtUo2UEbu-a' + ) + expect(result).to.eq('kfchvCyHmsc') + }) + + it('Returns video id for direct url', () => { + const result = videoIdFromYouTubeUrl( + 'https://www.youtube.com/v/vLfAtCbE_Jc' + ) + expect(result).to.eq('vLfAtCbE_Jc') + }) + + it('Returns video id for standard url', () => { + const result = videoIdFromYouTubeUrl( + 'https://www.youtube.com/watch?v=vLfAtCbE_Jc' + ) + expect(result).to.eq('vLfAtCbE_Jc') + }) + + it('Returns video id for short url', () => { + const result = videoIdFromYouTubeUrl('https://youtu.be/vLfAtCbE_Jc') + expect(result).to.eq('vLfAtCbE_Jc') + }) + + it('Returns video id for short url with share id', () => { + const result = videoIdFromYouTubeUrl( + 'https://youtu.be/iZxR7rPdvuQ?si=ad73DTmmXL_lbn31' + ) + expect(result).to.eq('iZxR7rPdvuQ') + }) + + it('Returns video id for embed url', () => { + const result = videoIdFromYouTubeUrl( + 'https://www.youtube.com/embed/vLfAtCbE_Jc' + ) + expect(result).to.eq('vLfAtCbE_Jc') + }) + + it('Returns undefined for non-youtube url', () => { + const result = videoIdFromYouTubeUrl( + 'https://omnivore.app/iZxR7rPdvuQ?si=ad73DTmmXL_lbn31' + ) + expect(result).to.eq(undefined) + }) + + it('Returns undefined for non-youtube short url', () => { + const result = videoIdFromYouTubeUrl('https://omnivore.app/?v=iZxR7rPdvuQ') + expect(result).to.eq(undefined) + }) + + it('Returns video id when port is added', () => { + const result = videoIdFromYouTubeUrl( + 'https://www.youtube.com:443/watch?v=kfchvCyHmsc' + ) + expect(result).to.eq('kfchvCyHmsc') + }) +}) + +describe('isYouTubeVideoURL', () => { + it('Returns false for a shorts URL', () => { + const result = isYouTubeVideoURL( + 'https://www.youtube.com/shorts/ZsQKYwXbo4s' + ) + expect(result).to.eq(false) + }) + it('Returns false for a non-youtube URL', () => { + const result = isYouTubeVideoURL('https://omnivore.app/about') + expect(result).to.eq(false) + }) + it('Returns true for a video URL', () => { + const result = isYouTubeVideoURL( + 'https://www.youtube.com/watch?v=p4YOXmm839c' + ) + expect(result).to.eq(true) + }) +})