From 2dbd16a61eaa26523ced24c3d4e496c6e6068cdb Mon Sep 17 00:00:00 2001
From: Jackson Harper
Date: Tue, 12 Mar 2024 17:48:16 +0800
Subject: [PATCH] Pull duration and description from YouTube metadata
---
packages/api/src/jobs/get-youtube-info.ts | 153 ----------------------
packages/api/src/pubsub.ts | 20 ++-
packages/api/src/queue-processor.ts | 6 +-
packages/api/src/utils/createTask.ts | 10 +-
yarn.lock | 41 ++++++
5 files changed, 68 insertions(+), 162 deletions(-)
delete mode 100644 packages/api/src/jobs/get-youtube-info.ts
diff --git a/packages/api/src/jobs/get-youtube-info.ts b/packages/api/src/jobs/get-youtube-info.ts
deleted file mode 100644
index 9f7abbdf2..000000000
--- a/packages/api/src/jobs/get-youtube-info.ts
+++ /dev/null
@@ -1,153 +0,0 @@
-import { logger } from '../utils/logger'
-import { loadSummarizationChain } from 'langchain/chains'
-import { ChatOpenAI } from '@langchain/openai'
-import {
- CharacterTextSplitter,
- RecursiveCharacterTextSplitter,
-} from 'langchain/text_splitter'
-import { DocumentInterface } from '@langchain/core/documents'
-import { YoutubeLoader } from 'langchain/document_loaders/web/youtube'
-import { authTrx } from '../repository'
-import { libraryItemRepository } from '../repository/library_item'
-import { htmlToMarkdown, parsePreparedContent } from '../utils/parser'
-import { AISummary } from '../entity/AISummary'
-import { LibraryItem, LibraryItemState } from '../entity/library_item'
-import { getAISummary } from '../services/ai-summaries'
-import { YoutubeTranscript, TranscriptResponse } from 'youtube-transcript'
-import { Converter } from 'showdown'
-import { Video, Client as YouTubeClient } from 'youtubei'
-
-export interface ProcessYouTubeVideoJobData {
- userId: string
- libraryItemId: string
-}
-
-export const PROCESS_YOU_TUBE_VIDEO_JOB_NAME = 'process-you-tube-video'
-
-export const processYouTubeVideo = async (
- jobData: ProcessYouTubeVideoJobData
-) => {
- try {
- console.log(
- '******************************* processYouTubeVideo *************************'
- )
- const libraryItem = await authTrx(
- async (tx) =>
- tx
- .withRepository(libraryItemRepository)
- .findById(jobData.libraryItemId),
- undefined,
- jobData.userId
- )
- if (!libraryItem || libraryItem.state !== LibraryItemState.Succeeded) {
- logger.info(
- `Not ready to get YouTube metadata job state: ${
- libraryItem?.state ?? 'null'
- }`
- )
- return
- }
-
- // const doc = await YoutubeLoader.createFromUrl(libraryItem.originalUrl, {
- // language: 'en',
- // addVideoInfo: true,
- // }).load()
-
- // console.log('doc from youtube:', doc)
-
- const youtube = new YouTubeClient()
- const video = (await youtube.getVideo(
- 'Y0fqyJUrwe0' /* libraryItem.originalUrl */
- )) as Video
- console.log('GOT VIDEO: ', video)
- const transcript = await video.getTranscript()
-
- console.log('description: ', video?.description)
- console.log('chapters: ', video?.chapters)
-
- // const transcript = await YoutubeTranscript.fetchTranscript(
- // libraryItem.originalUrl
- // )
-
- if (transcript) {
- console.log(
- 'original transcript:\n',
- transcript.map((item) => item.text).join(' '),
- '\n\n'
- )
- } else {
- console.log('no transcript found')
- }
-
- // const prompt = `Given the following transcript data, supplied as a list of text segments, turn it into readable
- // text adding punctuation and paragraphs. Format the output as markdown.
-
- // ${JSON.stringify(transcript).replace(/"/g, '\\"')}
- // `
-
- // const llm = new ChatOpenAI({
- // configuration: {
- // apiKey: process.env.OPENAI_API_KEY,
- // },
- // })
- // const response = await llm.generate([[prompt]])
- // console.log('response: ', response.generations, response.llmOutput)
-
- // const text = response.generations[0][0].text
- // const converter = new Converter()
- // const transcriptHTML = converter.makeHtml(text)
-
- // const html = `
- // 1 Billion Rows Challenge
- //
- //
- //
- //
- //
- //
- //
- //
- //
- //
- //
- //
- //
- // 1 Billion Rows Challenge
- // By ThePrimeTime
- //
- //
- //
- //
- // `.replace(
- // '',
- // `${transcriptHTML}`
- // )
-
- // console.log('input HTML: ', html)
- // if (html) {
- // const preparedDocument = {
- // document: html,
- // pageInfo: {},
- // }
- // const updatedContent = await parsePreparedContent(
- // libraryItem.originalUrl,
- // preparedDocument,
- // true
- // )
- // console.log('updated content: ', updatedContent.parsedContent?.content)
- // libraryItem.readableContent =
- // updatedContent.parsedContent?.content ?? libraryItem.readableContent
- // const _ = await authTrx(
- // async (t) => {
- // return t
- // .getRepository(LibraryItem)
- // .update(jobData.libraryItemId, libraryItem)
- // },
- // undefined,
- // jobData.userId
- // )
- // }
- } catch (err) {
- console.log('error creating summary: ', err)
- }
-}
diff --git a/packages/api/src/pubsub.ts b/packages/api/src/pubsub.ts
index 5d2b21817..2d7f12ba0 100644
--- a/packages/api/src/pubsub.ts
+++ b/packages/api/src/pubsub.ts
@@ -18,7 +18,7 @@ import {
findFeatureByName,
getFeatureName,
} from './services/features'
-import { processYouTubeVideo } from './jobs/get-youtube-info'
+import { processYouTubeVideo } from './jobs/process-youtube-video'
const logger = buildLogger('pubsub')
@@ -26,6 +26,18 @@ const client = new PubSub()
type EntityData = Merge
+// True when `url` parses, its host is youtube.com / youtu.be or a subdomain
+// (look-alikes such as notyoutube.com are rejected), and it has a `v` param.
+const isYouTubeVideoURL = (url: string | undefined): boolean => {
+  try {
+    const { hostname, searchParams } = new URL(url ?? '')
+    const onYouTube = /(^|\.)(youtube\.com|youtu\.be)$/.test(hostname)
+    return onYouTube && searchParams.has('v')
+  } catch {
+    return false // `new URL` throws on empty or malformed input
+  }
+}
+
export const createPubSubClient = (): PubsubClient => {
const fieldsToDelete = ['user'] as const
@@ -95,6 +107,12 @@ export const createPubSubClient = (): PubsubClient => {
// userId,
// libraryItemId,
// })
+ }
+
+ if (
+ 'originalUrl' in data &&
+ isYouTubeVideoURL(data['originalUrl'] as string | undefined)
+ ) {
await enqueueProcessYouTubeVideo({
userId,
libraryItemId,
diff --git a/packages/api/src/queue-processor.ts b/packages/api/src/queue-processor.ts
index 7fc18f518..ed7940879 100644
--- a/packages/api/src/queue-processor.ts
+++ b/packages/api/src/queue-processor.ts
@@ -45,9 +45,9 @@ import { CACHED_READING_POSITION_PREFIX } from './services/cached_reading_positi
import { getJobPriority } from './utils/createTask'
import { logger } from './utils/logger'
import {
- PROCESS_YOU_TUBE_VIDEO_JOB_NAME,
+ PROCESS_YOUTUBE_VIDEO_JOB_NAME,
processYouTubeVideo,
-} from './jobs/get-youtube-info'
+} from './jobs/process-youtube-video'
export const QUEUE_NAME = 'omnivore-backend-queue'
export const JOB_VERSION = 'v001'
@@ -120,7 +120,7 @@ export const createWorker = (connection: ConnectionOptions) =>
return exportItem(job.data)
case AI_SUMMARIZE_JOB_NAME:
return aiSummarize(job.data)
- case PROCESS_YOU_TUBE_VIDEO_JOB_NAME:
+ case PROCESS_YOUTUBE_VIDEO_JOB_NAME:
return processYouTubeVideo(job.data)
case EXPORT_ALL_ITEMS_JOB_NAME:
return exportAllItems(job.data)
diff --git a/packages/api/src/utils/createTask.ts b/packages/api/src/utils/createTask.ts
index 6c9345f8f..fd99e1668 100644
--- a/packages/api/src/utils/createTask.ts
+++ b/packages/api/src/utils/createTask.ts
@@ -46,9 +46,9 @@ import { logger } from './logger'
import View = google.cloud.tasks.v2.Task.View
import { AISummarizeJobData, AI_SUMMARIZE_JOB_NAME } from '../jobs/ai-summarize'
import {
- PROCESS_YOU_TUBE_VIDEO_JOB_NAME,
+ PROCESS_YOUTUBE_VIDEO_JOB_NAME,
ProcessYouTubeVideoJobData,
-} from '../jobs/get-youtube-info'
+} from '../jobs/process-youtube-video'
// Instantiates a client.
const client = new CloudTasksClient()
@@ -82,7 +82,7 @@ export const getJobPriority = (jobName: string): number => {
case REFRESH_ALL_FEEDS_JOB_NAME:
case THUMBNAIL_JOB:
return 100
- case PROCESS_YOU_TUBE_VIDEO_JOB_NAME:
+ case PROCESS_YOUTUBE_VIDEO_JOB_NAME:
return 20
default:
logger.error(`unknown job name: ${jobName}`)
@@ -722,8 +722,8 @@ export const enqueueProcessYouTubeVideo = async (
return undefined
}
- return queue.add(PROCESS_YOU_TUBE_VIDEO_JOB_NAME, data, {
- priority: getJobPriority(PROCESS_YOU_TUBE_VIDEO_JOB_NAME),
+ return queue.add(PROCESS_YOUTUBE_VIDEO_JOB_NAME, data, {
+ priority: getJobPriority(PROCESS_YOUTUBE_VIDEO_JOB_NAME),
attempts: 3,
})
}
diff --git a/yarn.lock b/yarn.lock
index cf307822f..192ec8f0f 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -2428,6 +2428,11 @@
dependencies:
text-decoding "^1.0.0"
+"@fastify/busboy@^2.0.0":
+ version "2.1.1"
+ resolved "https://registry.yarnpkg.com/@fastify/busboy/-/busboy-2.1.1.tgz#b9da6a878a371829a0502c9b6c1c143ef6663f4d"
+ integrity sha512-vBZP4NlzfOlerQTnba4aqZoMhE/a9HY7HRqoOPaETQcSQuWEIyZMHGfVu6w9wGtGK5fED5qRs2DteVCjOH60sA==
+
"@ffmpeg-installer/darwin-arm64@4.1.5":
version "4.1.5"
resolved "https://registry.yarnpkg.com/@ffmpeg-installer/darwin-arm64/-/darwin-arm64-4.1.5.tgz#b7b5c262dd96d1aea4807514e1cdcf6e11f82743"
@@ -8173,6 +8178,11 @@
resolved "https://registry.yarnpkg.com/@types/showdown/-/showdown-2.0.1.tgz#24134738ba3107237d6a783e054a54773e739f81"
integrity sha512-xdnAw2nFqomkaL0QdtEk0t7yz26UkaVPl4v1pYJvtE1T0fmfQEH3JaxErEhGByEAl3zUZrkNBlneuJp0WJGqEA==
+"@types/showdown@^2.0.6":
+ version "2.0.6"
+ resolved "https://registry.yarnpkg.com/@types/showdown/-/showdown-2.0.6.tgz#3d7affd5f971b4a17783ec2b23b4ad3b97477b7e"
+ integrity sha512-pTvD/0CIeqe4x23+YJWlX2gArHa8G0J0Oh6GKaVXV7TAeickpkkZiNOgFcFcmLQ5lB/K0qBJL1FtRYltBfbGCQ==
+
"@types/sinon-chai@^3.2.8":
version "3.2.8"
resolved "https://registry.yarnpkg.com/@types/sinon-chai/-/sinon-chai-3.2.8.tgz#5871d09ab50d671d8e6dd72e9073f8e738ac61dc"
@@ -19286,6 +19296,13 @@ jest@^27.4.5:
import-local "^3.0.2"
jest-cli "^27.5.1"
+jintr@^1.1.0:
+ version "1.1.0"
+ resolved "https://registry.yarnpkg.com/jintr/-/jintr-1.1.0.tgz#223a3b07f5e03d410cec6e715c537c8ad1e714c3"
+ integrity sha512-Tu9wk3BpN2v+kb8yT6YBtue+/nbjeLFv4vvVC4PJ7oCidHKbifWhvORrAbQfxVIQZG+67am/mDagpiGSVtvrZg==
+ dependencies:
+ acorn "^8.8.0"
+
jose@^2.0.5:
version "2.0.7"
resolved "https://registry.yarnpkg.com/jose/-/jose-2.0.7.tgz#3aabbaec70bff313c108b9406498a163737b16ba"
@@ -29876,6 +29893,13 @@ undici@^4.9.3:
resolved "https://registry.yarnpkg.com/undici/-/undici-4.14.1.tgz#7633b143a8a10d6d63335e00511d071e8d52a1d9"
integrity sha512-WJ+g+XqiZcATcBaUeluCajqy4pEDcQfK1vy+Fo+bC4/mqXI9IIQD/XWHLS70fkGUT6P52Drm7IFslO651OdLPQ==
+undici@^5.19.1:
+ version "5.28.3"
+ resolved "https://registry.yarnpkg.com/undici/-/undici-5.28.3.tgz#a731e0eff2c3fcfd41c1169a869062be222d1e5b"
+ integrity sha512-3ItfzbrhDlINjaP0duwnNsKpDQk3acHI3gVJ1z4fmwMK31k5G9OVIAMLSIaP6w4FaGkaAkN6zaQO9LUvZ1t7VA==
+ dependencies:
+ "@fastify/busboy" "^2.0.0"
+
unfetch@^4.2.0:
version "4.2.0"
resolved "https://registry.yarnpkg.com/unfetch/-/unfetch-4.2.0.tgz#7e21b0ef7d363d8d9af0fb929a5555f6ef97a3be"
@@ -31536,6 +31560,23 @@ yocto-queue@^1.0.0:
resolved "https://registry.yarnpkg.com/yocto-queue/-/yocto-queue-1.0.0.tgz#7f816433fb2cbc511ec8bf7d263c3b58a1a3c251"
integrity sha512-9bnSc/HEW2uRy67wc+T8UwauLuPJVn28jb+GtJY16iiKWyvmYJRXVT4UamsAEGQfPohgr2q4Tq0sQbQlxTfi1g==
+youtubei.js@^9.1.0:
+ version "9.1.0"
+ resolved "https://registry.yarnpkg.com/youtubei.js/-/youtubei.js-9.1.0.tgz#bcf154c9fa21d3c8c1d00a5e10360d0a065c660e"
+ integrity sha512-C5GBJ4LgnS6vGAUkdIdQNOFFb5EZ1p3xBvUELNXmIG3Idr6vxWrKNBNy8ClZT3SuDVXaAJqDgF9b5jvY8lNKcg==
+ dependencies:
+ jintr "^1.1.0"
+ tslib "^2.5.0"
+ undici "^5.19.1"
+
+youtubei@^1.3.4:
+ version "1.3.4"
+ resolved "https://registry.yarnpkg.com/youtubei/-/youtubei-1.3.4.tgz#b9761e33dcc6e0a9569e6628ba1fc48c729636f0"
+ integrity sha512-xN6p2oddcTpreF/ojU2mChwdiUlV+TwwUL6xgP6lXRuxeGS5MokM1tzRdXCgIpxkzYYNNAWpt7xvPuAUQM0PCg==
+ dependencies:
+ node-fetch "2.6.7"
+ protobufjs "7.2.4"
+
yup@^0.31.0:
version "0.31.1"
resolved "https://registry.yarnpkg.com/yup/-/yup-0.31.1.tgz#0954cb181161f397b804346037a04f8a4b31599e"