From 2dbd16a61eaa26523ced24c3d4e496c6e6068cdb Mon Sep 17 00:00:00 2001 From: Jackson Harper Date: Tue, 12 Mar 2024 17:48:16 +0800 Subject: [PATCH] Pull duration and description from YouTube metadata --- packages/api/src/jobs/get-youtube-info.ts | 153 ---------------------- packages/api/src/pubsub.ts | 20 ++- packages/api/src/queue-processor.ts | 6 +- packages/api/src/utils/createTask.ts | 10 +- yarn.lock | 41 ++++++ 5 files changed, 68 insertions(+), 162 deletions(-) delete mode 100644 packages/api/src/jobs/get-youtube-info.ts diff --git a/packages/api/src/jobs/get-youtube-info.ts b/packages/api/src/jobs/get-youtube-info.ts deleted file mode 100644 index 9f7abbdf2..000000000 --- a/packages/api/src/jobs/get-youtube-info.ts +++ /dev/null @@ -1,153 +0,0 @@ -import { logger } from '../utils/logger' -import { loadSummarizationChain } from 'langchain/chains' -import { ChatOpenAI } from '@langchain/openai' -import { - CharacterTextSplitter, - RecursiveCharacterTextSplitter, -} from 'langchain/text_splitter' -import { DocumentInterface } from '@langchain/core/documents' -import { YoutubeLoader } from 'langchain/document_loaders/web/youtube' -import { authTrx } from '../repository' -import { libraryItemRepository } from '../repository/library_item' -import { htmlToMarkdown, parsePreparedContent } from '../utils/parser' -import { AISummary } from '../entity/AISummary' -import { LibraryItem, LibraryItemState } from '../entity/library_item' -import { getAISummary } from '../services/ai-summaries' -import { YoutubeTranscript, TranscriptResponse } from 'youtube-transcript' -import { Converter } from 'showdown' -import { Video, Client as YouTubeClient } from 'youtubei' - -export interface ProcessYouTubeVideoJobData { - userId: string - libraryItemId: string -} - -export const PROCESS_YOU_TUBE_VIDEO_JOB_NAME = 'process-you-tube-video' - -export const processYouTubeVideo = async ( - jobData: ProcessYouTubeVideoJobData -) => { - try { - console.log( - '******************************* processYouTubeVideo *************************' - ) - const libraryItem = await authTrx( - async (tx) => - tx - .withRepository(libraryItemRepository) - .findById(jobData.libraryItemId), - undefined, - jobData.userId - ) - if (!libraryItem || libraryItem.state !== LibraryItemState.Succeeded) { - logger.info( - `Not ready to get YouTube metadata job state: ${ - libraryItem?.state ?? 'null' - }` - ) - return - } - - // const doc = await YoutubeLoader.createFromUrl(libraryItem.originalUrl, { - // language: 'en', - // addVideoInfo: true, - // }).load() - - // console.log('doc from youtube:', doc) - - const youtube = new YouTubeClient() - const video = (await youtube.getVideo( - 'Y0fqyJUrwe0' /* libraryItem.originalUrl */ - )) as Video - console.log('GOT VIDEO: ', video) - const transcript = await video.getTranscript() - - console.log('description: ', video?.description) - console.log('chapters: ', video?.chapters) - - // const transcript = await YoutubeTranscript.fetchTranscript( - // libraryItem.originalUrl - // ) - - if (transcript) { - console.log( - 'original transcript:\n', - transcript.map((item) => item.text).join(' '), - '\n\n' - ) - } else { - console.log('no transcript found') - } - - // const prompt = `Given the following transcript data, supplied as a list of text segments, turn it into readable - // text adding punctuation and paragraphs. Format the output as markdown. - - // ${JSON.stringify(transcript).replace(/"/g, '\\"')} - // ` - - // const llm = new ChatOpenAI({ - // configuration: { - // apiKey: process.env.OPENAI_API_KEY, - // }, - // }) - // const response = await llm.generate([[prompt]]) - // console.log('response: ', response.generations, response.llmOutput) - - // const text = response.generations[0][0].text - // const converter = new Converter() - // const transcriptHTML = converter.makeHtml(text) - - // const html = ` - // 1 Billion Rows Challenge - // - // - // - // - // - // - // - // - // - //
- //

- // - //

- // 1 Billion Rows Challenge

- // - //

- //
- //
- // - // `.replace( - // '
', - // `
${transcriptHTML}
` - // ) - - // console.log('input HTML: ', html) - // if (html) { - // const preparedDocument = { - // document: html, - // pageInfo: {}, - // } - // const updatedContent = await parsePreparedContent( - // libraryItem.originalUrl, - // preparedDocument, - // true - // ) - // console.log('updated content: ', updatedContent.parsedContent?.content) - // libraryItem.readableContent = - // updatedContent.parsedContent?.content ?? libraryItem.readableContent - // const _ = await authTrx( - // async (t) => { - // return t - // .getRepository(LibraryItem) - // .update(jobData.libraryItemId, libraryItem) - // }, - // undefined, - // jobData.userId - // ) - // } - } catch (err) { - console.log('error creating summary: ', err) - } -} diff --git a/packages/api/src/pubsub.ts b/packages/api/src/pubsub.ts index 5d2b21817..2d7f12ba0 100644 --- a/packages/api/src/pubsub.ts +++ b/packages/api/src/pubsub.ts @@ -18,7 +18,7 @@ import { findFeatureByName, getFeatureName, } from './services/features' -import { processYouTubeVideo } from './jobs/get-youtube-info' +import { processYouTubeVideo } from './jobs/process-youtube-video' const logger = buildLogger('pubsub') @@ -26,6 +26,18 @@ const client = new PubSub() type EntityData = Merge +const isYouTubeVideoURL = (url: string | undefined): Boolean => { + if (!url) { + return false + } + const u = new URL(url) + if (!u.host.endsWith('youtube.com') && !u.host.endsWith('youtu.be')) { + return false + } + const videoId = u.searchParams.get('v') + return videoId != null +} + export const createPubSubClient = (): PubsubClient => { const fieldsToDelete = ['user'] as const @@ -95,6 +107,12 @@ export const createPubSubClient = (): PubsubClient => { // userId, // libraryItemId, // }) + } + + if ( + 'originalUrl' in data && + isYouTubeVideoURL(data['originalUrl'] as string | undefined) + ) { await enqueueProcessYouTubeVideo({ userId, libraryItemId, diff --git a/packages/api/src/queue-processor.ts b/packages/api/src/queue-processor.ts index 7fc18f518..ed7940879 100644 --- a/packages/api/src/queue-processor.ts +++ b/packages/api/src/queue-processor.ts @@ -45,9 +45,9 @@ import { CACHED_READING_POSITION_PREFIX } from './services/cached_reading_positi import { getJobPriority } from './utils/createTask' import { logger } from './utils/logger' import { - PROCESS_YOU_TUBE_VIDEO_JOB_NAME, + PROCESS_YOUTUBE_VIDEO_JOB_NAME, processYouTubeVideo, -} from './jobs/get-youtube-info' +} from './jobs/process-youtube-video' export const QUEUE_NAME = 'omnivore-backend-queue' export const JOB_VERSION = 'v001' @@ -120,7 +120,7 @@ export const createWorker = (connection: ConnectionOptions) => return exportItem(job.data) case AI_SUMMARIZE_JOB_NAME: return aiSummarize(job.data) - case PROCESS_YOU_TUBE_VIDEO_JOB_NAME: + case PROCESS_YOUTUBE_VIDEO_JOB_NAME: return processYouTubeVideo(job.data) case EXPORT_ALL_ITEMS_JOB_NAME: return exportAllItems(job.data) diff --git a/packages/api/src/utils/createTask.ts b/packages/api/src/utils/createTask.ts index 6c9345f8f..fd99e1668 100644 --- a/packages/api/src/utils/createTask.ts +++ b/packages/api/src/utils/createTask.ts @@ -46,9 +46,9 @@ import { logger } from './logger' import View = google.cloud.tasks.v2.Task.View import { AISummarizeJobData, AI_SUMMARIZE_JOB_NAME } from '../jobs/ai-summarize' import { - PROCESS_YOU_TUBE_VIDEO_JOB_NAME, + PROCESS_YOUTUBE_VIDEO_JOB_NAME, ProcessYouTubeVideoJobData, -} from '../jobs/get-youtube-info' +} from '../jobs/process-youtube-video' // Instantiates a client. const client = new CloudTasksClient() @@ -82,7 +82,7 @@ export const getJobPriority = (jobName: string): number => { case REFRESH_ALL_FEEDS_JOB_NAME: case THUMBNAIL_JOB: return 100 - case PROCESS_YOU_TUBE_VIDEO_JOB_NAME: + case PROCESS_YOUTUBE_VIDEO_JOB_NAME: return 20 default: logger.error(`unknown job name: ${jobName}`) @@ -722,8 +722,8 @@ export const enqueueProcessYouTubeVideo = async ( return undefined } - return queue.add(PROCESS_YOU_TUBE_VIDEO_JOB_NAME, data, { - priority: getJobPriority(PROCESS_YOU_TUBE_VIDEO_JOB_NAME), + return queue.add(PROCESS_YOUTUBE_VIDEO_JOB_NAME, data, { + priority: getJobPriority(PROCESS_YOUTUBE_VIDEO_JOB_NAME), attempts: 3, }) } diff --git a/yarn.lock b/yarn.lock index cf307822f..192ec8f0f 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2428,6 +2428,11 @@ dependencies: text-decoding "^1.0.0" +"@fastify/busboy@^2.0.0": + version "2.1.1" + resolved "https://registry.yarnpkg.com/@fastify/busboy/-/busboy-2.1.1.tgz#b9da6a878a371829a0502c9b6c1c143ef6663f4d" + integrity sha512-vBZP4NlzfOlerQTnba4aqZoMhE/a9HY7HRqoOPaETQcSQuWEIyZMHGfVu6w9wGtGK5fED5qRs2DteVCjOH60sA== + "@ffmpeg-installer/darwin-arm64@4.1.5": version "4.1.5" resolved "https://registry.yarnpkg.com/@ffmpeg-installer/darwin-arm64/-/darwin-arm64-4.1.5.tgz#b7b5c262dd96d1aea4807514e1cdcf6e11f82743" @@ -8173,6 +8178,11 @@ resolved "https://registry.yarnpkg.com/@types/showdown/-/showdown-2.0.1.tgz#24134738ba3107237d6a783e054a54773e739f81" integrity sha512-xdnAw2nFqomkaL0QdtEk0t7yz26UkaVPl4v1pYJvtE1T0fmfQEH3JaxErEhGByEAl3zUZrkNBlneuJp0WJGqEA== +"@types/showdown@^2.0.6": + version "2.0.6" + resolved "https://registry.yarnpkg.com/@types/showdown/-/showdown-2.0.6.tgz#3d7affd5f971b4a17783ec2b23b4ad3b97477b7e" + integrity sha512-pTvD/0CIeqe4x23+YJWlX2gArHa8G0J0Oh6GKaVXV7TAeickpkkZiNOgFcFcmLQ5lB/K0qBJL1FtRYltBfbGCQ== + "@types/sinon-chai@^3.2.8": version "3.2.8" resolved "https://registry.yarnpkg.com/@types/sinon-chai/-/sinon-chai-3.2.8.tgz#5871d09ab50d671d8e6dd72e9073f8e738ac61dc" @@ -19286,6 +19296,13 @@ jest@^27.4.5: import-local "^3.0.2" jest-cli "^27.5.1" +jintr@^1.1.0: + version "1.1.0" + resolved "https://registry.yarnpkg.com/jintr/-/jintr-1.1.0.tgz#223a3b07f5e03d410cec6e715c537c8ad1e714c3" + integrity sha512-Tu9wk3BpN2v+kb8yT6YBtue+/nbjeLFv4vvVC4PJ7oCidHKbifWhvORrAbQfxVIQZG+67am/mDagpiGSVtvrZg== + dependencies: + acorn "^8.8.0" + jose@^2.0.5: version "2.0.7" resolved "https://registry.yarnpkg.com/jose/-/jose-2.0.7.tgz#3aabbaec70bff313c108b9406498a163737b16ba" @@ -29876,6 +29893,13 @@ undici@^4.9.3: resolved "https://registry.yarnpkg.com/undici/-/undici-4.14.1.tgz#7633b143a8a10d6d63335e00511d071e8d52a1d9" integrity sha512-WJ+g+XqiZcATcBaUeluCajqy4pEDcQfK1vy+Fo+bC4/mqXI9IIQD/XWHLS70fkGUT6P52Drm7IFslO651OdLPQ== +undici@^5.19.1: + version "5.28.3" + resolved "https://registry.yarnpkg.com/undici/-/undici-5.28.3.tgz#a731e0eff2c3fcfd41c1169a869062be222d1e5b" + integrity sha512-3ItfzbrhDlINjaP0duwnNsKpDQk3acHI3gVJ1z4fmwMK31k5G9OVIAMLSIaP6w4FaGkaAkN6zaQO9LUvZ1t7VA== + dependencies: + "@fastify/busboy" "^2.0.0" + unfetch@^4.2.0: version "4.2.0" resolved "https://registry.yarnpkg.com/unfetch/-/unfetch-4.2.0.tgz#7e21b0ef7d363d8d9af0fb929a5555f6ef97a3be" @@ -31536,6 +31560,23 @@ yocto-queue@^1.0.0: resolved "https://registry.yarnpkg.com/yocto-queue/-/yocto-queue-1.0.0.tgz#7f816433fb2cbc511ec8bf7d263c3b58a1a3c251" integrity sha512-9bnSc/HEW2uRy67wc+T8UwauLuPJVn28jb+GtJY16iiKWyvmYJRXVT4UamsAEGQfPohgr2q4Tq0sQbQlxTfi1g== +youtubei.js@^9.1.0: + version "9.1.0" + resolved "https://registry.yarnpkg.com/youtubei.js/-/youtubei.js-9.1.0.tgz#bcf154c9fa21d3c8c1d00a5e10360d0a065c660e" + integrity sha512-C5GBJ4LgnS6vGAUkdIdQNOFFb5EZ1p3xBvUELNXmIG3Idr6vxWrKNBNy8ClZT3SuDVXaAJqDgF9b5jvY8lNKcg== + dependencies: + jintr "^1.1.0" + tslib "^2.5.0" + undici "^5.19.1" + +youtubei@^1.3.4: + version "1.3.4" + resolved "https://registry.yarnpkg.com/youtubei/-/youtubei-1.3.4.tgz#b9761e33dcc6e0a9569e6628ba1fc48c729636f0" + integrity sha512-xN6p2oddcTpreF/ojU2mChwdiUlV+TwwUL6xgP6lXRuxeGS5MokM1tzRdXCgIpxkzYYNNAWpt7xvPuAUQM0PCg== + dependencies: + node-fetch "2.6.7" + protobufjs "7.2.4" + yup@^0.31.0: version "0.31.1" resolved "https://registry.yarnpkg.com/yup/-/yup-0.31.1.tgz#0954cb181161f397b804346037a04f8a4b31599e"