Pull duration and description from YouTube metadata
This commit is contained in:
@ -1,153 +0,0 @@
|
||||
import { logger } from '../utils/logger'
|
||||
import { loadSummarizationChain } from 'langchain/chains'
|
||||
import { ChatOpenAI } from '@langchain/openai'
|
||||
import {
|
||||
CharacterTextSplitter,
|
||||
RecursiveCharacterTextSplitter,
|
||||
} from 'langchain/text_splitter'
|
||||
import { DocumentInterface } from '@langchain/core/documents'
|
||||
import { YoutubeLoader } from 'langchain/document_loaders/web/youtube'
|
||||
import { authTrx } from '../repository'
|
||||
import { libraryItemRepository } from '../repository/library_item'
|
||||
import { htmlToMarkdown, parsePreparedContent } from '../utils/parser'
|
||||
import { AISummary } from '../entity/AISummary'
|
||||
import { LibraryItem, LibraryItemState } from '../entity/library_item'
|
||||
import { getAISummary } from '../services/ai-summaries'
|
||||
import { YoutubeTranscript, TranscriptResponse } from 'youtube-transcript'
|
||||
import { Converter } from 'showdown'
|
||||
import { Video, Client as YouTubeClient } from 'youtubei'
|
||||
|
||||
/**
 * Payload of a "process YouTube video" background job.
 */
export interface ProcessYouTubeVideoJobData {
  /** Owner of the library item; passed to `authTrx` as the acting user. */
  userId: string
  /** Id of the library item whose YouTube metadata should be fetched. */
  libraryItemId: string
}

export const PROCESS_YOU_TUBE_VIDEO_JOB_NAME = 'process-you-tube-video'

/**
 * Extracts the YouTube video id from a watch URL (`?v=<id>`) or a
 * `youtu.be/<id>` short link.
 *
 * @returns the id, or `undefined` when the URL is malformed or carries no id.
 */
const youTubeVideoIdFromUrl = (url: string): string | undefined => {
  try {
    const parsed = new URL(url)
    const fromQuery = parsed.searchParams.get('v')
    if (fromQuery) {
      return fromQuery
    }
    if (parsed.hostname === 'youtu.be' || parsed.hostname.endsWith('.youtu.be')) {
      // Short links keep the id as the first path segment.
      return parsed.pathname.split('/').filter(Boolean)[0]
    }
    return undefined
  } catch {
    // `new URL` throws on malformed input; treat that as "no id".
    return undefined
  }
}

/**
 * Job handler: loads the library item, and if it has been saved successfully,
 * fetches the matching YouTube video's description, chapters and transcript.
 *
 * The fetched data is currently only logged.
 * TODO: turn the transcript into readable markdown (LLM pass), convert it to
 * HTML, run it through `parsePreparedContent` and store the result in
 * `libraryItem.readableContent`.
 *
 * Errors are logged and swallowed, so the returned promise always resolves.
 *
 * @param jobData identifies the user and the library item to process.
 */
export const processYouTubeVideo = async (
  jobData: ProcessYouTubeVideoJobData
) => {
  try {
    const libraryItem = await authTrx(
      async (tx) =>
        tx
          .withRepository(libraryItemRepository)
          .findById(jobData.libraryItemId),
      undefined,
      jobData.userId
    )
    if (!libraryItem || libraryItem.state !== LibraryItemState.Succeeded) {
      logger.info(
        `Not ready to get YouTube metadata job state: ${
          libraryItem?.state ?? 'null'
        }`
      )
      return
    }

    // Use the item's own URL rather than a fixed sample video id.
    const videoId = youTubeVideoIdFromUrl(libraryItem.originalUrl)
    if (!videoId) {
      logger.info(
        `No YouTube video id found in URL: ${libraryItem.originalUrl}`
      )
      return
    }

    const youtube = new YouTubeClient()
    const video = (await youtube.getVideo(videoId)) as Video | undefined
    if (!video) {
      logger.info(`YouTube video not found: ${videoId}`)
      return
    }

    logger.info('YouTube video metadata', {
      videoId,
      description: video.description,
      chapters: video.chapters,
    })

    const transcript = await video.getTranscript()
    if (transcript) {
      logger.info('YouTube video transcript', {
        videoId,
        transcript: transcript.map((item) => item.text).join(' '),
      })
    } else {
      logger.info(`no transcript found for YouTube video: ${videoId}`)
    }
  } catch (err) {
    logger.error('error processing YouTube video: ', err)
  }
}
|
||||
@ -18,7 +18,7 @@ import {
|
||||
findFeatureByName,
|
||||
getFeatureName,
|
||||
} from './services/features'
|
||||
import { processYouTubeVideo } from './jobs/get-youtube-info'
|
||||
import { processYouTubeVideo } from './jobs/process-youtube-video'
|
||||
|
||||
const logger = buildLogger('pubsub')
|
||||
|
||||
@ -26,6 +26,18 @@ const client = new PubSub()
|
||||
|
||||
type EntityData<T> = Merge<T, { libraryItemId: string }>
|
||||
|
||||
/**
 * Tells whether `url` points at a single YouTube video.
 *
 * Accepted forms:
 *  - `https://<sub.>youtube.com/...?v=<id>` (watch URLs)
 *  - `https://youtu.be/<id>` (short links; `?v=<id>` is accepted too)
 *
 * Missing, empty or malformed URLs yield `false` instead of throwing, so the
 * check is safe to call on unvalidated entity data.
 *
 * @param url the candidate URL, possibly undefined.
 * @returns `true` only when the host is a YouTube domain and a non-empty
 *          video id is present.
 */
const isYouTubeVideoURL = (url: string | undefined): boolean => {
  if (!url) {
    return false
  }

  let parsed: URL
  try {
    parsed = new URL(url)
  } catch {
    // `new URL` throws on malformed input; that is simply "not a video".
    return false
  }

  // Compare whole domain labels so look-alikes such as `notyoutube.com`
  // are rejected; `hostname` (unlike `host`) never includes the port.
  const hostname = parsed.hostname
  const onDomain = (domain: string): boolean =>
    hostname === domain || hostname.endsWith(`.${domain}`)

  const queryId = parsed.searchParams.get('v')

  if (onDomain('youtube.com')) {
    return !!queryId
  }

  if (onDomain('youtu.be')) {
    // Short links carry the id as the first path segment.
    const pathId = parsed.pathname.split('/').filter(Boolean)[0]
    return !!pathId || !!queryId
  }

  return false
}
|
||||
|
||||
export const createPubSubClient = (): PubsubClient => {
|
||||
const fieldsToDelete = ['user'] as const
|
||||
|
||||
@ -95,6 +107,12 @@ export const createPubSubClient = (): PubsubClient => {
|
||||
// userId,
|
||||
// libraryItemId,
|
||||
// })
|
||||
}
|
||||
|
||||
if (
|
||||
'originalUrl' in data &&
|
||||
isYouTubeVideoURL(data['originalUrl'] as string | undefined)
|
||||
) {
|
||||
await enqueueProcessYouTubeVideo({
|
||||
userId,
|
||||
libraryItemId,
|
||||
|
||||
@ -45,9 +45,9 @@ import { CACHED_READING_POSITION_PREFIX } from './services/cached_reading_positi
|
||||
import { getJobPriority } from './utils/createTask'
|
||||
import { logger } from './utils/logger'
|
||||
import {
|
||||
PROCESS_YOU_TUBE_VIDEO_JOB_NAME,
|
||||
PROCESS_YOUTUBE_VIDEO_JOB_NAME,
|
||||
processYouTubeVideo,
|
||||
} from './jobs/get-youtube-info'
|
||||
} from './jobs/process-youtube-video'
|
||||
|
||||
export const QUEUE_NAME = 'omnivore-backend-queue'
|
||||
export const JOB_VERSION = 'v001'
|
||||
@ -120,7 +120,7 @@ export const createWorker = (connection: ConnectionOptions) =>
|
||||
return exportItem(job.data)
|
||||
case AI_SUMMARIZE_JOB_NAME:
|
||||
return aiSummarize(job.data)
|
||||
case PROCESS_YOU_TUBE_VIDEO_JOB_NAME:
|
||||
case PROCESS_YOUTUBE_VIDEO_JOB_NAME:
|
||||
return processYouTubeVideo(job.data)
|
||||
case EXPORT_ALL_ITEMS_JOB_NAME:
|
||||
return exportAllItems(job.data)
|
||||
|
||||
@ -46,9 +46,9 @@ import { logger } from './logger'
|
||||
import View = google.cloud.tasks.v2.Task.View
|
||||
import { AISummarizeJobData, AI_SUMMARIZE_JOB_NAME } from '../jobs/ai-summarize'
|
||||
import {
|
||||
PROCESS_YOU_TUBE_VIDEO_JOB_NAME,
|
||||
PROCESS_YOUTUBE_VIDEO_JOB_NAME,
|
||||
ProcessYouTubeVideoJobData,
|
||||
} from '../jobs/get-youtube-info'
|
||||
} from '../jobs/process-youtube-video'
|
||||
|
||||
// Instantiates a client.
|
||||
const client = new CloudTasksClient()
|
||||
@ -82,7 +82,7 @@ export const getJobPriority = (jobName: string): number => {
|
||||
case REFRESH_ALL_FEEDS_JOB_NAME:
|
||||
case THUMBNAIL_JOB:
|
||||
return 100
|
||||
case PROCESS_YOU_TUBE_VIDEO_JOB_NAME:
|
||||
case PROCESS_YOUTUBE_VIDEO_JOB_NAME:
|
||||
return 20
|
||||
default:
|
||||
logger.error(`unknown job name: ${jobName}`)
|
||||
@ -722,8 +722,8 @@ export const enqueueProcessYouTubeVideo = async (
|
||||
return undefined
|
||||
}
|
||||
|
||||
return queue.add(PROCESS_YOU_TUBE_VIDEO_JOB_NAME, data, {
|
||||
priority: getJobPriority(PROCESS_YOU_TUBE_VIDEO_JOB_NAME),
|
||||
return queue.add(PROCESS_YOUTUBE_VIDEO_JOB_NAME, data, {
|
||||
priority: getJobPriority(PROCESS_YOUTUBE_VIDEO_JOB_NAME),
|
||||
attempts: 3,
|
||||
})
|
||||
}
|
||||
|
||||
41
yarn.lock
41
yarn.lock
@ -2428,6 +2428,11 @@
|
||||
dependencies:
|
||||
text-decoding "^1.0.0"
|
||||
|
||||
"@fastify/busboy@^2.0.0":
|
||||
version "2.1.1"
|
||||
resolved "https://registry.yarnpkg.com/@fastify/busboy/-/busboy-2.1.1.tgz#b9da6a878a371829a0502c9b6c1c143ef6663f4d"
|
||||
integrity sha512-vBZP4NlzfOlerQTnba4aqZoMhE/a9HY7HRqoOPaETQcSQuWEIyZMHGfVu6w9wGtGK5fED5qRs2DteVCjOH60sA==
|
||||
|
||||
"@ffmpeg-installer/darwin-arm64@4.1.5":
|
||||
version "4.1.5"
|
||||
resolved "https://registry.yarnpkg.com/@ffmpeg-installer/darwin-arm64/-/darwin-arm64-4.1.5.tgz#b7b5c262dd96d1aea4807514e1cdcf6e11f82743"
|
||||
@ -8173,6 +8178,11 @@
|
||||
resolved "https://registry.yarnpkg.com/@types/showdown/-/showdown-2.0.1.tgz#24134738ba3107237d6a783e054a54773e739f81"
|
||||
integrity sha512-xdnAw2nFqomkaL0QdtEk0t7yz26UkaVPl4v1pYJvtE1T0fmfQEH3JaxErEhGByEAl3zUZrkNBlneuJp0WJGqEA==
|
||||
|
||||
"@types/showdown@^2.0.6":
|
||||
version "2.0.6"
|
||||
resolved "https://registry.yarnpkg.com/@types/showdown/-/showdown-2.0.6.tgz#3d7affd5f971b4a17783ec2b23b4ad3b97477b7e"
|
||||
integrity sha512-pTvD/0CIeqe4x23+YJWlX2gArHa8G0J0Oh6GKaVXV7TAeickpkkZiNOgFcFcmLQ5lB/K0qBJL1FtRYltBfbGCQ==
|
||||
|
||||
"@types/sinon-chai@^3.2.8":
|
||||
version "3.2.8"
|
||||
resolved "https://registry.yarnpkg.com/@types/sinon-chai/-/sinon-chai-3.2.8.tgz#5871d09ab50d671d8e6dd72e9073f8e738ac61dc"
|
||||
@ -19286,6 +19296,13 @@ jest@^27.4.5:
|
||||
import-local "^3.0.2"
|
||||
jest-cli "^27.5.1"
|
||||
|
||||
jintr@^1.1.0:
|
||||
version "1.1.0"
|
||||
resolved "https://registry.yarnpkg.com/jintr/-/jintr-1.1.0.tgz#223a3b07f5e03d410cec6e715c537c8ad1e714c3"
|
||||
integrity sha512-Tu9wk3BpN2v+kb8yT6YBtue+/nbjeLFv4vvVC4PJ7oCidHKbifWhvORrAbQfxVIQZG+67am/mDagpiGSVtvrZg==
|
||||
dependencies:
|
||||
acorn "^8.8.0"
|
||||
|
||||
jose@^2.0.5:
|
||||
version "2.0.7"
|
||||
resolved "https://registry.yarnpkg.com/jose/-/jose-2.0.7.tgz#3aabbaec70bff313c108b9406498a163737b16ba"
|
||||
@ -29876,6 +29893,13 @@ undici@^4.9.3:
|
||||
resolved "https://registry.yarnpkg.com/undici/-/undici-4.14.1.tgz#7633b143a8a10d6d63335e00511d071e8d52a1d9"
|
||||
integrity sha512-WJ+g+XqiZcATcBaUeluCajqy4pEDcQfK1vy+Fo+bC4/mqXI9IIQD/XWHLS70fkGUT6P52Drm7IFslO651OdLPQ==
|
||||
|
||||
undici@^5.19.1:
|
||||
version "5.28.3"
|
||||
resolved "https://registry.yarnpkg.com/undici/-/undici-5.28.3.tgz#a731e0eff2c3fcfd41c1169a869062be222d1e5b"
|
||||
integrity sha512-3ItfzbrhDlINjaP0duwnNsKpDQk3acHI3gVJ1z4fmwMK31k5G9OVIAMLSIaP6w4FaGkaAkN6zaQO9LUvZ1t7VA==
|
||||
dependencies:
|
||||
"@fastify/busboy" "^2.0.0"
|
||||
|
||||
unfetch@^4.2.0:
|
||||
version "4.2.0"
|
||||
resolved "https://registry.yarnpkg.com/unfetch/-/unfetch-4.2.0.tgz#7e21b0ef7d363d8d9af0fb929a5555f6ef97a3be"
|
||||
@ -31536,6 +31560,23 @@ yocto-queue@^1.0.0:
|
||||
resolved "https://registry.yarnpkg.com/yocto-queue/-/yocto-queue-1.0.0.tgz#7f816433fb2cbc511ec8bf7d263c3b58a1a3c251"
|
||||
integrity sha512-9bnSc/HEW2uRy67wc+T8UwauLuPJVn28jb+GtJY16iiKWyvmYJRXVT4UamsAEGQfPohgr2q4Tq0sQbQlxTfi1g==
|
||||
|
||||
youtubei.js@^9.1.0:
|
||||
version "9.1.0"
|
||||
resolved "https://registry.yarnpkg.com/youtubei.js/-/youtubei.js-9.1.0.tgz#bcf154c9fa21d3c8c1d00a5e10360d0a065c660e"
|
||||
integrity sha512-C5GBJ4LgnS6vGAUkdIdQNOFFb5EZ1p3xBvUELNXmIG3Idr6vxWrKNBNy8ClZT3SuDVXaAJqDgF9b5jvY8lNKcg==
|
||||
dependencies:
|
||||
jintr "^1.1.0"
|
||||
tslib "^2.5.0"
|
||||
undici "^5.19.1"
|
||||
|
||||
youtubei@^1.3.4:
|
||||
version "1.3.4"
|
||||
resolved "https://registry.yarnpkg.com/youtubei/-/youtubei-1.3.4.tgz#b9761e33dcc6e0a9569e6628ba1fc48c729636f0"
|
||||
integrity sha512-xN6p2oddcTpreF/ojU2mChwdiUlV+TwwUL6xgP6lXRuxeGS5MokM1tzRdXCgIpxkzYYNNAWpt7xvPuAUQM0PCg==
|
||||
dependencies:
|
||||
node-fetch "2.6.7"
|
||||
protobufjs "7.2.4"
|
||||
|
||||
yup@^0.31.0:
|
||||
version "0.31.1"
|
||||
resolved "https://registry.yarnpkg.com/yup/-/yup-0.31.1.tgz#0954cb181161f397b804346037a04f8a4b31599e"
|
||||
|
||||
Reference in New Issue
Block a user