From e6ebac5e13d2fc27ab1744f54b35aff25db4b673 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Sun, 18 Aug 2024 11:52:21 +0800 Subject: [PATCH 1/2] fix: special handler for youtube shorts --- .../src/websites/youtube-handler.ts | 73 ++++++++++++------- 1 file changed, 47 insertions(+), 26 deletions(-) diff --git a/packages/content-handler/src/websites/youtube-handler.ts b/packages/content-handler/src/websites/youtube-handler.ts index 3779241dd..d83262176 100644 --- a/packages/content-handler/src/websites/youtube-handler.ts +++ b/packages/content-handler/src/websites/youtube-handler.ts @@ -1,21 +1,13 @@ -import { ContentHandler, PreHandleResult } from '../content-handler' import axios from 'axios' import _ from 'underscore' +import { ContentHandler, PreHandleResult } from '../content-handler' const YOUTUBE_URL_MATCH = - /^((?:https?:)?\/\/)?((?:www|m)\.)?((?:youtube\.com|youtu.be))(\/(?:[\w-]+\?v=|embed\/|v\/)?)([\w-]+)(\S+)?$/ + /^((?:https?:)?\/\/)?((?:www|m)\.)?((?:youtube\.com|youtu.be))(\/(?:[\w-]+\?v=|embed\/|v\/|shorts\/|playlist\?list=)?)([\w-]+)(\S+)?$/ export const getYoutubeVideoId = (url: string) => { const u = new URL(url) - const videoId = u.searchParams.get('v') - if (!videoId) { - const match = url.toString().match(YOUTUBE_URL_MATCH) - if (match === null || match.length < 6 || !match[5]) { - return undefined - } - return match[5] - } - return videoId + return u.searchParams.get('v') } export const getYoutubePlaylistId = (url: string) => { @@ -23,6 +15,41 @@ export const getYoutubePlaylistId = (url: string) => { return u.searchParams.get('list') } +const getEmbedData = (url: string) => { + const BaseUrl = 'https://www.youtube.com' + const embedBaseUrl = 'https://www.youtube.com/embed' + + const match = url.match(YOUTUBE_URL_MATCH) + if (match === null || match.length < 6) { + console.error('Invalid youtube url', url) + throw new Error(`Invalid youtube url: ${url}`) + } + + const type = match[4] + const id = match[5] + + if (type === '/playlist?list=') { + const playlistId = getYoutubePlaylistId(url) || id + + return { + src: `${embedBaseUrl}/videoseries?list=${playlistId}`, + url: `${BaseUrl}/playlist?list=${playlistId}`, + } + } else if (type === '/shorts/') { + return { + src: `${embedBaseUrl}/${id}`, + url: `${BaseUrl}/shorts/${id}`, + } + } + + const videoId = getYoutubeVideoId(url) || id + + return { + src: `${embedBaseUrl}/${videoId}`, + url: `${BaseUrl}/watch?v=${videoId}`, + } +} + export const escapeTitle = (title: string) => { return _.escape(title) } @@ -38,21 +65,15 @@ export class YoutubeHandler extends ContentHandler { } async preHandle(url: string): Promise { - const BaseUrl = 'https://www.youtube.com' - const embedBaseUrl = 'https://www.youtube.com/embed' - let urlToEncode: string - let src: string - const playlistId = getYoutubePlaylistId(url) - if (playlistId) { - urlToEncode = `${BaseUrl}/playlist?list=${playlistId}` - src = `${embedBaseUrl}/videoseries?list=${playlistId}` - } else { - const videoId = getYoutubeVideoId(url) - if (!videoId) { - return {} - } - urlToEncode = `${BaseUrl}/watch?v=${videoId}` - src = `${embedBaseUrl}/${videoId}` + let src, urlToEncode + + try { + const embedData = getEmbedData(url) + src = embedData.src + urlToEncode = embedData.url + } catch (error) { + console.error('Error getting embed data', error) + return {} } const oembedUrl = From 46743215313bb913c7ece8fbb5b7acf1466aa8f3 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Sun, 18 Aug 2024 12:37:10 +0800 Subject: [PATCH 2/2] reduce blocking domain to 1 hour --- packages/content-fetch/src/request_handler.ts | 4 ++-- .../src/websites/youtube-handler.ts | 17 +++++++------- .../test/youtube-handler.test.ts | 23 +++++++++++++------ 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/packages/content-fetch/src/request_handler.ts b/packages/content-fetch/src/request_handler.ts index f8fea81f3..6a427434d 100644 --- a/packages/content-fetch/src/request_handler.ts +++ b/packages/content-fetch/src/request_handler.ts @@ -165,8 +165,8 @@ const incrementContentFetchFailure = async ( const key = failureRedisKey(domain) try { const result = await redisClient.incr(key) - // expire the key in 1 day - await redisClient.expire(key, 24 * 60 * 60) + // expire the key in 1 hour + await redisClient.expire(key, 60 * 60) return result } catch (error) { diff --git a/packages/content-handler/src/websites/youtube-handler.ts b/packages/content-handler/src/websites/youtube-handler.ts index d83262176..7785a6d87 100644 --- a/packages/content-handler/src/websites/youtube-handler.ts +++ b/packages/content-handler/src/websites/youtube-handler.ts @@ -15,7 +15,7 @@ export const getYoutubePlaylistId = (url: string) => { return u.searchParams.get('list') } -const getEmbedData = (url: string) => { +export const getEmbedData = (url: string) => { const BaseUrl = 'https://www.youtube.com' const embedBaseUrl = 'https://www.youtube.com/embed' @@ -25,17 +25,17 @@ const getEmbedData = (url: string) => { throw new Error(`Invalid youtube url: ${url}`) } - const type = match[4] - const id = match[5] - - if (type === '/playlist?list=') { - const playlistId = getYoutubePlaylistId(url) || id - + const playlistId = getYoutubePlaylistId(url) + if (playlistId) { return { src: `${embedBaseUrl}/videoseries?list=${playlistId}`, url: `${BaseUrl}/playlist?list=${playlistId}`, } - } else if (type === '/shorts/') { + } + + const type = match[4] + const id = match[5] + if (type === '/shorts/') { return { src: `${embedBaseUrl}/${id}`, url: `${BaseUrl}/shorts/${id}`, @@ -43,7 +43,6 @@ const getEmbedData = (url: string) => { } const videoId = getYoutubeVideoId(url) || id - return { src: `${embedBaseUrl}/${videoId}`, url: `${BaseUrl}/watch?v=${videoId}`, diff --git a/packages/content-handler/test/youtube-handler.test.ts b/packages/content-handler/test/youtube-handler.test.ts index f08e114cc..b2e655741 100644 --- a/packages/content-handler/test/youtube-handler.test.ts +++ b/packages/content-handler/test/youtube-handler.test.ts @@ -1,6 +1,21 @@ import { expect } from "chai"; import "mocha"; -import { escapeTitle, getYoutubePlaylistId, getYoutubeVideoId } from "../src/websites/youtube-handler"; +import { + escapeTitle, + getEmbedData, + getYoutubePlaylistId, + getYoutubeVideoId, +} from '../src/websites/youtube-handler' + +describe('getEmbedData', () => { + expect('https://www.youtube.com/embed/vFD2gu007dc').to.eq( + getEmbedData('https://youtu.be/vFD2gu007dc').src + ) + + expect('https://www.youtube.com/embed/cg9b4RC87LI').to.eq( + getEmbedData('https://youtu.be/cg9b4RC87LI?t=116').src + ) +}) describe('getYoutubeVideoId', () => { it('should parse video id out of a URL', async () => { @@ -12,15 +27,9 @@ describe('getYoutubeVideoId', () => { 'https://www.youtube.com/watch?v=vFD2gu007dc&list=RDvFD2gu007dc&start_radio=1' ) ) - expect('vFD2gu007dc').to.eq( - getYoutubeVideoId('https://youtu.be/vFD2gu007dc') - ) expect('BMFVCnbRaV4').to.eq( getYoutubeVideoId('https://youtube.com/watch?v=BMFVCnbRaV4&feature=share') ) - expect('cg9b4RC87LI').to.eq( - getYoutubeVideoId('https://youtu.be/cg9b4RC87LI?t=116') - ) }) })