Pull duration and description from YouTube metadata

This commit is contained in:
Jackson Harper
2024-03-12 17:48:16 +08:00
parent f7225b298a
commit 2dbd16a61e
5 changed files with 68 additions and 162 deletions

View File

@ -1,153 +0,0 @@
import { logger } from '../utils/logger'
import { loadSummarizationChain } from 'langchain/chains'
import { ChatOpenAI } from '@langchain/openai'
import {
CharacterTextSplitter,
RecursiveCharacterTextSplitter,
} from 'langchain/text_splitter'
import { DocumentInterface } from '@langchain/core/documents'
import { YoutubeLoader } from 'langchain/document_loaders/web/youtube'
import { authTrx } from '../repository'
import { libraryItemRepository } from '../repository/library_item'
import { htmlToMarkdown, parsePreparedContent } from '../utils/parser'
import { AISummary } from '../entity/AISummary'
import { LibraryItem, LibraryItemState } from '../entity/library_item'
import { getAISummary } from '../services/ai-summaries'
import { YoutubeTranscript, TranscriptResponse } from 'youtube-transcript'
import { Converter } from 'showdown'
import { Video, Client as YouTubeClient } from 'youtubei'
/**
 * Payload carried by a YouTube video processing job.
 */
export interface ProcessYouTubeVideoJobData {
// User on whose behalf the library item is loaded (passed to `authTrx`).
userId: string
// Id of the library item to look up via `libraryItemRepository.findById`.
libraryItemId: string
}
/** Job name under which YouTube video processing jobs are registered. */
export const PROCESS_YOU_TUBE_VIDEO_JOB_NAME = 'process-you-tube-video'
/**
 * Job handler that fetches YouTube metadata and the transcript for a saved
 * library item and logs what it found.
 *
 * The library item is loaded on behalf of `jobData.userId`. The job exits
 * early when the item does not exist, is not in the `Succeeded` state, has no
 * `v` query parameter in its original URL, or the video cannot be fetched.
 *
 * Errors are caught and logged rather than rethrown, so a failure never
 * propagates to the caller.
 *
 * @param jobData identifies the user and the library item to process
 */
export const processYouTubeVideo = async (
  jobData: ProcessYouTubeVideoJobData
): Promise<void> => {
  try {
    const libraryItem = await authTrx(
      async (tx) =>
        tx
          .withRepository(libraryItemRepository)
          .findById(jobData.libraryItemId),
      undefined,
      jobData.userId
    )
    if (!libraryItem || libraryItem.state !== LibraryItemState.Succeeded) {
      logger.info(
        `Not ready to get YouTube metadata job state: ${
          libraryItem?.state ?? 'null'
        }`
      )
      return
    }

    // Use the item's own video instead of a hard-coded sample id.
    // NOTE(review): the previous call passed a bare video id, so the client
    // presumably expects an id rather than a full URL -- confirm.
    let videoId: string | null = null
    try {
      videoId = new URL(libraryItem.originalUrl).searchParams.get('v')
    } catch {
      // A malformed URL is handled below as "no video id".
    }
    if (!videoId) {
      logger.info(
        `No YouTube video id found in url: ${libraryItem.originalUrl}`
      )
      return
    }

    const youtube = new YouTubeClient()
    const video = (await youtube.getVideo(videoId)) as Video | undefined
    if (!video) {
      logger.info(`No YouTube video found for id: ${videoId}`)
      return
    }

    logger.info(`YouTube video description: ${video.description ?? ''}`)
    logger.info(`YouTube video chapter count: ${video.chapters?.length ?? 0}`)

    const transcript = await video.getTranscript()
    if (transcript) {
      const transcriptText = transcript.map((item) => item.text).join(' ')
      logger.info(`YouTube video transcript: ${transcriptText}`)
    } else {
      logger.info(`No transcript found for YouTube video: ${videoId}`)
    }

    // TODO: the earlier experiment (previously kept here as commented-out
    // code) went on to:
    //   1. prompt ChatOpenAI to turn the transcript segments into punctuated
    //      markdown paragraphs,
    //   2. convert that markdown to HTML with showdown's Converter,
    //   3. insert it into the `_omnivore_youtube_transcript` div of a page
    //      containing the video embed and og: metadata,
    //   4. run the page through parsePreparedContent and store the result in
    //      libraryItem.readableContent via authTrx.
    // Re-introduce those steps using the real video's title, author and
    // thumbnail rather than hard-coded sample values.
  } catch (err) {
    logger.error(
      `error processing YouTube video: ${
        err instanceof Error ? err.message : String(err)
      }`
    )
  }
}

View File

@ -18,7 +18,7 @@ import {
findFeatureByName,
getFeatureName,
} from './services/features'
import { processYouTubeVideo } from './jobs/get-youtube-info'
import { processYouTubeVideo } from './jobs/process-youtube-video'
const logger = buildLogger('pubsub')
@ -26,6 +26,18 @@ const client = new PubSub()
type EntityData<T> = Merge<T, { libraryItemId: string }>
/**
 * Reports whether `url` points at a YouTube video, i.e. it parses as a URL,
 * its hostname is `youtube.com` / `youtu.be` (or a subdomain of either), and
 * it carries a non-empty `v` query parameter.
 *
 * Missing, empty, or malformed input yields `false` instead of throwing, so
 * the check is safe to call on arbitrary entity data.
 *
 * NOTE(review): short links of the form `https://youtu.be/<id>` keep the id
 * in the path rather than in `v`, so they are not matched here. That mirrors
 * the original check; confirm whether they should be accepted.
 *
 * @param url candidate URL, possibly undefined
 * @returns `true` when the URL looks like a YouTube video link
 */
const isYouTubeVideoURL = (url: string | undefined): boolean => {
  if (!url) {
    return false
  }

  let hostname: string
  let videoId: string | null
  try {
    const parsed = new URL(url)
    // `hostname` excludes any port, unlike `host`.
    hostname = parsed.hostname
    videoId = parsed.searchParams.get('v')
  } catch {
    // Not a parseable absolute URL.
    return false
  }

  // Match the domain itself or a true subdomain; a bare suffix test would
  // also accept look-alikes such as `notyoutube.com`.
  const isYouTubeHost = ['youtube.com', 'youtu.be'].some(
    (domain) => hostname === domain || hostname.endsWith(`.${domain}`)
  )
  if (!isYouTubeHost) {
    return false
  }

  return videoId !== null && videoId !== ''
}
export const createPubSubClient = (): PubsubClient => {
const fieldsToDelete = ['user'] as const
@ -95,6 +107,12 @@ export const createPubSubClient = (): PubsubClient => {
// userId,
// libraryItemId,
// })
}
if (
'originalUrl' in data &&
isYouTubeVideoURL(data['originalUrl'] as string | undefined)
) {
await enqueueProcessYouTubeVideo({
userId,
libraryItemId,

View File

@ -45,9 +45,9 @@ import { CACHED_READING_POSITION_PREFIX } from './services/cached_reading_positi
import { getJobPriority } from './utils/createTask'
import { logger } from './utils/logger'
import {
PROCESS_YOU_TUBE_VIDEO_JOB_NAME,
PROCESS_YOUTUBE_VIDEO_JOB_NAME,
processYouTubeVideo,
} from './jobs/get-youtube-info'
} from './jobs/process-youtube-video'
export const QUEUE_NAME = 'omnivore-backend-queue'
export const JOB_VERSION = 'v001'
@ -120,7 +120,7 @@ export const createWorker = (connection: ConnectionOptions) =>
return exportItem(job.data)
case AI_SUMMARIZE_JOB_NAME:
return aiSummarize(job.data)
case PROCESS_YOU_TUBE_VIDEO_JOB_NAME:
case PROCESS_YOUTUBE_VIDEO_JOB_NAME:
return processYouTubeVideo(job.data)
case EXPORT_ALL_ITEMS_JOB_NAME:
return exportAllItems(job.data)

View File

@ -46,9 +46,9 @@ import { logger } from './logger'
import View = google.cloud.tasks.v2.Task.View
import { AISummarizeJobData, AI_SUMMARIZE_JOB_NAME } from '../jobs/ai-summarize'
import {
PROCESS_YOU_TUBE_VIDEO_JOB_NAME,
PROCESS_YOUTUBE_VIDEO_JOB_NAME,
ProcessYouTubeVideoJobData,
} from '../jobs/get-youtube-info'
} from '../jobs/process-youtube-video'
// Instantiates a client.
const client = new CloudTasksClient()
@ -82,7 +82,7 @@ export const getJobPriority = (jobName: string): number => {
case REFRESH_ALL_FEEDS_JOB_NAME:
case THUMBNAIL_JOB:
return 100
case PROCESS_YOU_TUBE_VIDEO_JOB_NAME:
case PROCESS_YOUTUBE_VIDEO_JOB_NAME:
return 20
default:
logger.error(`unknown job name: ${jobName}`)
@ -722,8 +722,8 @@ export const enqueueProcessYouTubeVideo = async (
return undefined
}
return queue.add(PROCESS_YOU_TUBE_VIDEO_JOB_NAME, data, {
priority: getJobPriority(PROCESS_YOU_TUBE_VIDEO_JOB_NAME),
return queue.add(PROCESS_YOUTUBE_VIDEO_JOB_NAME, data, {
priority: getJobPriority(PROCESS_YOUTUBE_VIDEO_JOB_NAME),
attempts: 3,
})
}

View File

@ -2428,6 +2428,11 @@
dependencies:
text-decoding "^1.0.0"
"@fastify/busboy@^2.0.0":
version "2.1.1"
resolved "https://registry.yarnpkg.com/@fastify/busboy/-/busboy-2.1.1.tgz#b9da6a878a371829a0502c9b6c1c143ef6663f4d"
integrity sha512-vBZP4NlzfOlerQTnba4aqZoMhE/a9HY7HRqoOPaETQcSQuWEIyZMHGfVu6w9wGtGK5fED5qRs2DteVCjOH60sA==
"@ffmpeg-installer/darwin-arm64@4.1.5":
version "4.1.5"
resolved "https://registry.yarnpkg.com/@ffmpeg-installer/darwin-arm64/-/darwin-arm64-4.1.5.tgz#b7b5c262dd96d1aea4807514e1cdcf6e11f82743"
@ -8173,6 +8178,11 @@
resolved "https://registry.yarnpkg.com/@types/showdown/-/showdown-2.0.1.tgz#24134738ba3107237d6a783e054a54773e739f81"
integrity sha512-xdnAw2nFqomkaL0QdtEk0t7yz26UkaVPl4v1pYJvtE1T0fmfQEH3JaxErEhGByEAl3zUZrkNBlneuJp0WJGqEA==
"@types/showdown@^2.0.6":
version "2.0.6"
resolved "https://registry.yarnpkg.com/@types/showdown/-/showdown-2.0.6.tgz#3d7affd5f971b4a17783ec2b23b4ad3b97477b7e"
integrity sha512-pTvD/0CIeqe4x23+YJWlX2gArHa8G0J0Oh6GKaVXV7TAeickpkkZiNOgFcFcmLQ5lB/K0qBJL1FtRYltBfbGCQ==
"@types/sinon-chai@^3.2.8":
version "3.2.8"
resolved "https://registry.yarnpkg.com/@types/sinon-chai/-/sinon-chai-3.2.8.tgz#5871d09ab50d671d8e6dd72e9073f8e738ac61dc"
@ -19286,6 +19296,13 @@ jest@^27.4.5:
import-local "^3.0.2"
jest-cli "^27.5.1"
jintr@^1.1.0:
version "1.1.0"
resolved "https://registry.yarnpkg.com/jintr/-/jintr-1.1.0.tgz#223a3b07f5e03d410cec6e715c537c8ad1e714c3"
integrity sha512-Tu9wk3BpN2v+kb8yT6YBtue+/nbjeLFv4vvVC4PJ7oCidHKbifWhvORrAbQfxVIQZG+67am/mDagpiGSVtvrZg==
dependencies:
acorn "^8.8.0"
jose@^2.0.5:
version "2.0.7"
resolved "https://registry.yarnpkg.com/jose/-/jose-2.0.7.tgz#3aabbaec70bff313c108b9406498a163737b16ba"
@ -29876,6 +29893,13 @@ undici@^4.9.3:
resolved "https://registry.yarnpkg.com/undici/-/undici-4.14.1.tgz#7633b143a8a10d6d63335e00511d071e8d52a1d9"
integrity sha512-WJ+g+XqiZcATcBaUeluCajqy4pEDcQfK1vy+Fo+bC4/mqXI9IIQD/XWHLS70fkGUT6P52Drm7IFslO651OdLPQ==
undici@^5.19.1:
version "5.28.3"
resolved "https://registry.yarnpkg.com/undici/-/undici-5.28.3.tgz#a731e0eff2c3fcfd41c1169a869062be222d1e5b"
integrity sha512-3ItfzbrhDlINjaP0duwnNsKpDQk3acHI3gVJ1z4fmwMK31k5G9OVIAMLSIaP6w4FaGkaAkN6zaQO9LUvZ1t7VA==
dependencies:
"@fastify/busboy" "^2.0.0"
unfetch@^4.2.0:
version "4.2.0"
resolved "https://registry.yarnpkg.com/unfetch/-/unfetch-4.2.0.tgz#7e21b0ef7d363d8d9af0fb929a5555f6ef97a3be"
@ -31536,6 +31560,23 @@ yocto-queue@^1.0.0:
resolved "https://registry.yarnpkg.com/yocto-queue/-/yocto-queue-1.0.0.tgz#7f816433fb2cbc511ec8bf7d263c3b58a1a3c251"
integrity sha512-9bnSc/HEW2uRy67wc+T8UwauLuPJVn28jb+GtJY16iiKWyvmYJRXVT4UamsAEGQfPohgr2q4Tq0sQbQlxTfi1g==
youtubei.js@^9.1.0:
version "9.1.0"
resolved "https://registry.yarnpkg.com/youtubei.js/-/youtubei.js-9.1.0.tgz#bcf154c9fa21d3c8c1d00a5e10360d0a065c660e"
integrity sha512-C5GBJ4LgnS6vGAUkdIdQNOFFb5EZ1p3xBvUELNXmIG3Idr6vxWrKNBNy8ClZT3SuDVXaAJqDgF9b5jvY8lNKcg==
dependencies:
jintr "^1.1.0"
tslib "^2.5.0"
undici "^5.19.1"
youtubei@^1.3.4:
version "1.3.4"
resolved "https://registry.yarnpkg.com/youtubei/-/youtubei-1.3.4.tgz#b9761e33dcc6e0a9569e6628ba1fc48c729636f0"
integrity sha512-xN6p2oddcTpreF/ojU2mChwdiUlV+TwwUL6xgP6lXRuxeGS5MokM1tzRdXCgIpxkzYYNNAWpt7xvPuAUQM0PCg==
dependencies:
node-fetch "2.6.7"
protobufjs "7.2.4"
yup@^0.31.0:
version "0.31.1"
resolved "https://registry.yarnpkg.com/yup/-/yup-0.31.1.tgz#0954cb181161f397b804346037a04f8a4b31599e"