This commit is contained in:
Jackson Harper
2024-03-12 21:36:50 +08:00
parent 3ab9e41b82
commit f7225b298a
6 changed files with 195 additions and 5 deletions

View File

@ -97,6 +97,7 @@
"sanitize-html": "^2.3.2",
"sax": "^1.3.0",
"search-query-parser": "^1.6.0",
"showdown": "^2.1.0",
"snake-case": "^3.0.3",
"supertest": "^6.2.2",
"ts-loader": "^9.3.0",
@ -107,7 +108,9 @@
"uuid": "^8.3.1",
"voca": "^1.4.0",
"winston": "^3.3.3",
"word-counting": "^1.1.4"
"word-counting": "^1.1.4",
"youtubei": "^1.3.4",
"youtubei.js": "^9.1.0"
},
"devDependencies": {
"@babel/register": "^7.14.5",
@ -136,6 +139,7 @@
"@types/private-ip": "^1.0.0",
"@types/sanitize-html": "^1.27.1",
"@types/sax": "^1.2.7",
"@types/showdown": "^2.0.6",
"@types/sinon": "^10.0.13",
"@types/sinon-chai": "^3.2.8",
"@types/supertest": "^2.0.11",

View File

@ -0,0 +1,153 @@
import { logger } from '../utils/logger'
import { loadSummarizationChain } from 'langchain/chains'
import { ChatOpenAI } from '@langchain/openai'
import {
CharacterTextSplitter,
RecursiveCharacterTextSplitter,
} from 'langchain/text_splitter'
import { DocumentInterface } from '@langchain/core/documents'
import { YoutubeLoader } from 'langchain/document_loaders/web/youtube'
import { authTrx } from '../repository'
import { libraryItemRepository } from '../repository/library_item'
import { htmlToMarkdown, parsePreparedContent } from '../utils/parser'
import { AISummary } from '../entity/AISummary'
import { LibraryItem, LibraryItemState } from '../entity/library_item'
import { getAISummary } from '../services/ai-summaries'
import { YoutubeTranscript, TranscriptResponse } from 'youtube-transcript'
import { Converter } from 'showdown'
import { Video, Client as YouTubeClient } from 'youtubei'
/**
 * Payload carried by a "process YouTube video" queue job.
 */
export interface ProcessYouTubeVideoJobData {
  /** Id of the library item to process; looked up via the library item repository. */
  libraryItemId: string
  /** Id of the user the library item lookup is performed as. */
  userId: string
}
/** Job name used both when adding this job to the backend queue and when the worker dispatches on `job.name`. */
export const PROCESS_YOU_TUBE_VIDEO_JOB_NAME = 'process-you-tube-video'
/** youtube.com path prefixes that are followed directly by the video id. */
const YOUTUBE_ID_PATH_PREFIXES = ['embed', 'shorts', 'live', 'v']

/**
 * Extracts the video id from a YouTube URL.
 *
 * Supports `youtube.com/watch?v=<id>`, `youtu.be/<id>` and
 * `youtube.com/{embed,shorts,live,v}/<id>` (with optional `www.`, `m.` or
 * `music.` host prefixes).
 *
 * @param url - The URL to inspect.
 * @returns The video id, or `undefined` when `url` is not a parseable
 * YouTube video URL.
 */
const extractYouTubeVideoId = (url: string): string | undefined => {
  let parsed: URL
  try {
    parsed = new URL(url)
  } catch {
    // Not an absolute URL at all.
    return undefined
  }

  const host = parsed.hostname.replace(/^(?:www|m|music)\./, '')
  const segments = parsed.pathname.split('/').filter(Boolean)

  if (host === 'youtu.be') {
    return segments[0]
  }

  if (host === 'youtube.com' || host === 'youtube-nocookie.com') {
    const queryId = parsed.searchParams.get('v')
    if (queryId) {
      return queryId
    }
    const [prefix, id] = segments
    if (prefix && id && YOUTUBE_ID_PATH_PREFIXES.includes(prefix)) {
      return id
    }
  }

  return undefined
}

/**
 * Queue job handler that fetches the YouTube transcript for a saved library
 * item.
 *
 * The library item is loaded as the job's user; the job is a no-op unless the
 * item exists and is in the `Succeeded` state. The video id is derived from
 * the item's `originalUrl`, and the video and its transcript are fetched
 * through the `youtubei` client.
 *
 * Errors are logged and swallowed, so the returned promise never rejects.
 * NOTE(review): because nothing is rethrown, the `attempts` option set when
 * this job is enqueued presumably never triggers a retry — confirm whether
 * failures should propagate instead.
 *
 * @param jobData - Ids of the user and library item to process.
 * @returns A promise resolving to `undefined` once processing has finished.
 */
export const processYouTubeVideo = async (
  jobData: ProcessYouTubeVideoJobData
) => {
  try {
    const libraryItem = await authTrx(
      async (tx) =>
        tx
          .withRepository(libraryItemRepository)
          .findById(jobData.libraryItemId),
      undefined,
      jobData.userId
    )
    if (!libraryItem || libraryItem.state !== LibraryItemState.Succeeded) {
      logger.info(
        `Not ready to get YouTube metadata job state: ${
          libraryItem?.state ?? 'null'
        }`
      )
      return
    }

    // Use the item's own URL; a fixed video id would make every job process
    // the same video.
    const videoId = extractYouTubeVideoId(libraryItem.originalUrl)
    if (!videoId) {
      logger.info(
        `Not a YouTube video URL, skipping: ${libraryItem.originalUrl}`
      )
      return
    }

    const youtube = new YouTubeClient()
    const video = (await youtube.getVideo(videoId)) as Video | undefined
    if (!video) {
      logger.info(`YouTube video not found: ${videoId}`)
      return
    }

    const transcript = await video.getTranscript()
    if (!transcript || transcript.length === 0) {
      logger.info(`no transcript found for YouTube video: ${videoId}`)
      return
    }

    const transcriptText = transcript.map((item) => item.text).join(' ')
    logger.info(
      `fetched transcript for YouTube video ${videoId}: ${transcript.length} segments, ${transcriptText.length} characters`
    )

    // TODO: turn the raw transcript into readable content and persist it:
    //   1. ask the LLM to add punctuation/paragraphs and return markdown,
    //   2. convert the markdown to HTML (showdown `Converter.makeHtml`),
    //   3. insert it into the `_omnivore_youtube_transcript` div of the
    //      item's page HTML and run it through `parsePreparedContent`,
    //   4. store the result in `libraryItem.readableContent` via `authTrx`.
  } catch (err) {
    logger.error('error processing YouTube video', err)
  }
}

View File

@ -7,6 +7,7 @@ import { Merge } from './util'
import {
enqueueAISummarizeJob,
enqueueExportItem,
enqueueProcessYouTubeVideo,
enqueueTriggerRuleJob,
enqueueWebhookJob,
} from './utils/createTask'
@ -17,6 +18,7 @@ import {
findFeatureByName,
getFeatureName,
} from './services/features'
import { processYouTubeVideo } from './jobs/get-youtube-info'
const logger = buildLogger('pubsub')
@ -89,7 +91,11 @@ export const createPubSubClient = (): PubsubClient => {
})
if (await findFeatureByName(FeatureName.AISummaries, userId)) {
await enqueueAISummarizeJob({
// await enqueueAISummarizeJob({
// userId,
// libraryItemId,
// })
await enqueueProcessYouTubeVideo({
userId,
libraryItemId,
})

View File

@ -44,6 +44,10 @@ import { redisDataSource } from './redis_data_source'
import { CACHED_READING_POSITION_PREFIX } from './services/cached_reading_position'
import { getJobPriority } from './utils/createTask'
import { logger } from './utils/logger'
import {
PROCESS_YOU_TUBE_VIDEO_JOB_NAME,
processYouTubeVideo,
} from './jobs/get-youtube-info'
export const QUEUE_NAME = 'omnivore-backend-queue'
export const JOB_VERSION = 'v001'
@ -116,6 +120,8 @@ export const createWorker = (connection: ConnectionOptions) =>
return exportItem(job.data)
case AI_SUMMARIZE_JOB_NAME:
return aiSummarize(job.data)
case PROCESS_YOU_TUBE_VIDEO_JOB_NAME:
return processYouTubeVideo(job.data)
case EXPORT_ALL_ITEMS_JOB_NAME:
return exportAllItems(job.data)
}

View File

@ -45,6 +45,10 @@ import { stringToHash } from './helpers'
import { logger } from './logger'
import View = google.cloud.tasks.v2.Task.View
import { AISummarizeJobData, AI_SUMMARIZE_JOB_NAME } from '../jobs/ai-summarize'
import {
PROCESS_YOU_TUBE_VIDEO_JOB_NAME,
ProcessYouTubeVideoJobData,
} from '../jobs/get-youtube-info'
// Instantiates a client.
const client = new CloudTasksClient()
@ -78,6 +82,8 @@ export const getJobPriority = (jobName: string): number => {
case REFRESH_ALL_FEEDS_JOB_NAME:
case THUMBNAIL_JOB:
return 100
case PROCESS_YOU_TUBE_VIDEO_JOB_NAME:
return 20
default:
logger.error(`unknown job name: ${jobName}`)
return 1
@ -708,6 +714,20 @@ export const enqueueAISummarizeJob = async (data: AISummarizeJobData) => {
})
}
/**
 * Adds a "process YouTube video" job to the backend queue.
 *
 * @param data - Ids of the user and library item the job should process.
 * @returns The value returned by the queue's `add` call, or `undefined` when
 * no backend queue is available.
 */
export const enqueueProcessYouTubeVideo = async (
  data: ProcessYouTubeVideoJobData
) => {
  const backendQueue = await getBackendQueue()
  if (backendQueue) {
    const jobOptions = {
      priority: getJobPriority(PROCESS_YOU_TUBE_VIDEO_JOB_NAME),
      attempts: 3,
    }
    return backendQueue.add(PROCESS_YOU_TUBE_VIDEO_JOB_NAME, data, jobOptions)
  }
  return undefined
}
export const bulkEnqueueUpdateLabels = async (data: UpdateLabelsData[]) => {
const queue = await getBackendQueue()
if (!queue) {

View File

@ -86,9 +86,10 @@ export class YoutubeHandler extends ContentHandler {
<meta property="og:type" content="video" />
</head>
<body>
<iframe width="${width}" height="${height}" src="${src}" title="${escapedTitle}" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
<p><a href="${url}" target="_blank">${escapedTitle}</a></p>
<p itemscope="" itemprop="author" itemtype="http://schema.org/Person">By <a href="${oembed.author_url}" target="_blank">${authorName}</a></p>
<iframe id="_omnivore_youtube_video" width="${width}" height="${height}" src="${src}" title="${escapedTitle}" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
<p><a href="${url}" target="_blank">${escapedTitle}</a></p>
<p itemscope="" itemprop="author" itemtype="http://schema.org/Person">By <a href="${oembed.author_url}" target="_blank">${authorName}</a></p>
<div id="_omnivore_youtube_transcript"></div>
</body>
</html>`