Better matching for YouTube video URLs

This commit is contained in:
Jackson Harper
2024-03-30 10:42:01 +08:00
parent bfc1be6546
commit ede99dcd33
5 changed files with 155 additions and 17 deletions

View File

@ -78,7 +78,6 @@
"image-size": "^1.0.2",
"intercom-client": "^3.1.4",
"ioredis": "^5.3.2",
"redis": "^4.6.13",
"jsonwebtoken": "^8.5.1",
"jwks-rsa": "^2.0.3",
"langchain": "^0.1.21",
@ -95,6 +94,7 @@
"posthog-node": "^3.6.3",
"private-ip": "^2.3.3",
"prom-client": "^15.1.0",
"redis": "^4.6.13",
"rss-parser": "^3.13.0",
"sanitize-html": "^2.3.2",
"sax": "^1.3.0",
@ -106,6 +106,7 @@
"typeorm": "^0.3.4",
"typeorm-naming-strategies": "^4.1.0",
"underscore": "^1.13.6",
"url-pattern": "^1.0.3",
"urlsafe-base64": "^1.0.0",
"uuid": "^8.3.1",
"voca": "^1.4.0",
@ -167,4 +168,4 @@
"volta": {
"extends": "../../package.json"
}
}
}

View File

@ -14,6 +14,7 @@ import { enqueueProcessYouTubeTranscript } from '../utils/createTask'
import { stringToHash } from '../utils/helpers'
import { logger } from '../utils/logger'
import { parsePreparedContent } from '../utils/parser'
import { videoIdFromYouTubeUrl } from '../utils/youtube'
export interface ProcessYouTubeVideoJobData {
userId: string
@ -334,7 +335,7 @@ export const processYouTubeVideo = async (
}
videoURL = new URL(libraryItem.originalUrl)
const videoId = videoURL.searchParams.get('v')
const videoId = videoIdFromYouTubeUrl(libraryItem.originalUrl)
if (!videoId) {
logger.warning('no video id for supplied youtube url', {

View File

@ -10,23 +10,12 @@ import {
enqueueWebhookJob,
} from './utils/createTask'
import { buildLogger } from './utils/logger'
import { isYouTubeVideoURL } from './utils/youtube'
const logger = buildLogger('pubsub')
const client = new PubSub()
/**
 * Reports whether `url` is a YouTube link that carries a `v` query parameter.
 *
 * @param url - Candidate URL string; `undefined`, empty, or unparseable values
 *   yield `false` rather than throwing.
 * @returns `true` when the hostname is `youtube.com` / `youtu.be` (or one of
 *   their subdomains) and a `v` query parameter is present.
 */
const isYouTubeVideoURL = (url: string | undefined): boolean => {
  if (!url) {
    return false
  }

  let parsed: URL
  try {
    parsed = new URL(url)
  } catch {
    // A predicate should answer "no" for malformed input, not throw.
    return false
  }

  // Compare against `hostname` (no port) and require a label boundary so that
  // look-alike hosts such as `notyoutube.com` are rejected.
  const hostname = parsed.hostname
  const isYouTubeHost = ['youtube.com', 'youtu.be'].some(
    (domain) => hostname === domain || hostname.endsWith(`.${domain}`)
  )
  if (!isYouTubeHost) {
    return false
  }

  return parsed.searchParams.get('v') != null
}
export const createPubSubClient = (): PubsubClient => {
const publish = (topicName: string, msg: Buffer): Promise<void> => {
if (env.dev.isLocal) {
@ -93,11 +82,11 @@ export const createPubSubClient = (): PubsubClient => {
// })
// }
const isYoutubeVideo = (data: any): data is { originalUrl: string } => {
const isItemWithURL = (data: any): data is { originalUrl: string } => {
return 'originalUrl' in data
}
if (isYoutubeVideo(data) && isYouTubeVideoURL(data['originalUrl'])) {
if (isItemWithURL(data) && isYouTubeVideoURL(data['originalUrl'])) {
await enqueueProcessYouTubeVideo({
userId,
libraryItemId,

View File

@ -0,0 +1,61 @@
import UrlPattern from 'url-pattern'
// Path shape of a short link on youtu.be: `/<videoId>` (optional trailing slash).
const YOUTUBE_SHORT_LINK_PATH = /^\/([A-Za-z0-9_-]+)\/?$/

// Path shape of a direct or embed link on youtube.com: `/v/<videoId>` or
// `/embed/<videoId>` (optional trailing slash). Other paths such as
// `/shorts/<id>` are intentionally not matched.
const YOUTUBE_DIRECT_OR_EMBED_PATH = /^\/(?:v|embed)\/([A-Za-z0-9_-]+)\/?$/

// True when `hostname` is exactly `domain` or one of its subdomains. The
// leading dot prevents look-alikes such as `notyoutube.com` from matching.
const isDomainOrSubdomain = (hostname: string, domain: string): boolean =>
  hostname === domain || hostname.endsWith(`.${domain}`)

/**
 * Extracts the video id from a YouTube URL.
 *
 * Recognised forms:
 *  - any youtube.com / youtu.be URL with a non-empty `v` query parameter
 *  - `youtu.be/<id>`
 *  - `youtube.com/v/<id>` and `youtube.com/embed/<id>`
 *
 * Matching is done on the parsed hostname and pathname only, so the port,
 * query string, fragment, and subdomain (`www.`, `m.`, ...) do not affect
 * the result. Only the built-in URL parser is used.
 *
 * @param urlString - Candidate URL; may be `undefined` or empty.
 * @returns The video id, or `undefined` when the input is missing, cannot be
 *   parsed as a URL, is not on a YouTube host, or has no recognisable id.
 */
export const videoIdFromYouTubeUrl = (
  urlString: string | undefined
): string | undefined => {
  if (!urlString) {
    return undefined
  }

  let url: URL
  try {
    url = new URL(urlString)
  } catch {
    // Malformed input is simply "not a YouTube video URL".
    return undefined
  }

  const isShortHost = isDomainOrSubdomain(url.hostname, 'youtu.be')
  const isLongHost = isDomainOrSubdomain(url.hostname, 'youtube.com')
  if (!isShortHost && !isLongHost) {
    return undefined
  }

  // The query parameter wins when present and non-empty.
  const queryId = url.searchParams.get('v')
  if (queryId) {
    return queryId
  }

  const pathPattern = isShortHost
    ? YOUTUBE_SHORT_LINK_PATH
    : YOUTUBE_DIRECT_OR_EMBED_PATH
  return pathPattern.exec(url.pathname)?.[1]
}
/**
 * Tells whether a video id can be extracted from `url`.
 *
 * @param url - Candidate URL string; a missing or empty value gives `false`
 *   without attempting any parsing.
 * @returns `true` when `videoIdFromYouTubeUrl` yields an id for `url`.
 */
export const isYouTubeVideoURL = (url: string | undefined): boolean =>
  Boolean(url) && videoIdFromYouTubeUrl(url) != null

View File

@ -0,0 +1,86 @@
import 'mocha'
import { expect } from 'chai'
import {
isYouTubeVideoURL,
videoIdFromYouTubeUrl,
} from '../../src/utils/youtube'
describe('videoIdFromYouTubeUrl', () => {
  // Each row is [test title, input URL, expected video id]. The rows are
  // registered in order, one mocha test per row.
  const scenarios: ReadonlyArray<
    readonly [title: string, input: string, expected: string | undefined]
  > = [
    [
      'Returns video id for video with playlist id',
      'https://www.youtube.com/watch?v=kfchvCyHmsc&list=PLDyKn8uKYtRalFdBWtv_EjDtUo2UEbu-a',
      'kfchvCyHmsc',
    ],
    [
      'Returns video id for direct url',
      'https://www.youtube.com/v/vLfAtCbE_Jc',
      'vLfAtCbE_Jc',
    ],
    [
      'Returns video id for standard url',
      'https://www.youtube.com/watch?v=vLfAtCbE_Jc',
      'vLfAtCbE_Jc',
    ],
    [
      'Returns video id for short url',
      'https://youtu.be/vLfAtCbE_Jc',
      'vLfAtCbE_Jc',
    ],
    [
      'Returns video id for short url with share id',
      'https://youtu.be/iZxR7rPdvuQ?si=ad73DTmmXL_lbn31',
      'iZxR7rPdvuQ',
    ],
    [
      'Returns video id for embed url',
      'https://www.youtube.com/embed/vLfAtCbE_Jc',
      'vLfAtCbE_Jc',
    ],
    [
      'Returns undefined for non-youtube url',
      'https://omnivore.app/iZxR7rPdvuQ?si=ad73DTmmXL_lbn31',
      undefined,
    ],
    [
      'Returns undefined for non-youtube short url',
      'https://omnivore.app/?v=iZxR7rPdvuQ',
      undefined,
    ],
    [
      'Returns video id when port is added',
      'https://www.youtube.com:443/watch?v=kfchvCyHmsc',
      'kfchvCyHmsc',
    ],
  ]

  for (const [title, input, expected] of scenarios) {
    it(title, () => {
      expect(videoIdFromYouTubeUrl(input)).to.eq(expected)
    })
  }
})
describe('isYouTubeVideoURL', () => {
  // Each row is [test title, input URL, expected verdict].
  const scenarios: ReadonlyArray<
    readonly [title: string, input: string, expected: boolean]
  > = [
    [
      'Returns false for a shorts URL',
      'https://www.youtube.com/shorts/ZsQKYwXbo4s',
      false,
    ],
    ['Returns false for a non-youtube URL', 'https://omnivore.app/about', false],
    [
      'Returns true for a video URL',
      'https://www.youtube.com/watch?v=p4YOXmm839c',
      true,
    ],
  ]

  for (const [title, input, expected] of scenarios) {
    it(title, () => {
      expect(isYouTubeVideoURL(input)).to.eq(expected)
    })
  }
})