Rebase
This commit is contained in:
@ -97,6 +97,7 @@
|
||||
"sanitize-html": "^2.3.2",
|
||||
"sax": "^1.3.0",
|
||||
"search-query-parser": "^1.6.0",
|
||||
"showdown": "^2.1.0",
|
||||
"snake-case": "^3.0.3",
|
||||
"supertest": "^6.2.2",
|
||||
"ts-loader": "^9.3.0",
|
||||
@ -107,7 +108,9 @@
|
||||
"uuid": "^8.3.1",
|
||||
"voca": "^1.4.0",
|
||||
"winston": "^3.3.3",
|
||||
"word-counting": "^1.1.4"
|
||||
"word-counting": "^1.1.4",
|
||||
"youtubei": "^1.3.4",
|
||||
"youtubei.js": "^9.1.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@babel/register": "^7.14.5",
|
||||
@ -136,6 +139,7 @@
|
||||
"@types/private-ip": "^1.0.0",
|
||||
"@types/sanitize-html": "^1.27.1",
|
||||
"@types/sax": "^1.2.7",
|
||||
"@types/showdown": "^2.0.6",
|
||||
"@types/sinon": "^10.0.13",
|
||||
"@types/sinon-chai": "^3.2.8",
|
||||
"@types/supertest": "^2.0.11",
|
||||
|
||||
153
packages/api/src/jobs/get-youtube-info.ts
Normal file
153
packages/api/src/jobs/get-youtube-info.ts
Normal file
@ -0,0 +1,153 @@
|
||||
import { logger } from '../utils/logger'
|
||||
import { loadSummarizationChain } from 'langchain/chains'
|
||||
import { ChatOpenAI } from '@langchain/openai'
|
||||
import {
|
||||
CharacterTextSplitter,
|
||||
RecursiveCharacterTextSplitter,
|
||||
} from 'langchain/text_splitter'
|
||||
import { DocumentInterface } from '@langchain/core/documents'
|
||||
import { YoutubeLoader } from 'langchain/document_loaders/web/youtube'
|
||||
import { authTrx } from '../repository'
|
||||
import { libraryItemRepository } from '../repository/library_item'
|
||||
import { htmlToMarkdown, parsePreparedContent } from '../utils/parser'
|
||||
import { AISummary } from '../entity/AISummary'
|
||||
import { LibraryItem, LibraryItemState } from '../entity/library_item'
|
||||
import { getAISummary } from '../services/ai-summaries'
|
||||
import { YoutubeTranscript, TranscriptResponse } from 'youtube-transcript'
|
||||
import { Converter } from 'showdown'
|
||||
import { Video, Client as YouTubeClient } from 'youtubei'
|
||||
|
||||
/**
 * Payload carried by a YouTube video processing job.
 */
export interface ProcessYouTubeVideoJobData {
  /** Id of the user the library item is loaded for (passed to `authTrx` as the acting user). */
  userId: string
  /** Id of the library item that the job looks up via `libraryItemRepository.findById`. */
  libraryItemId: string
}
|
||||
|
||||
/**
 * Job name used both when enqueueing a YouTube processing job and when the
 * worker dispatches it to `processYouTubeVideo`. The value is a runtime key:
 * changing it would orphan jobs already sitting in the queue.
 */
export const PROCESS_YOU_TUBE_VIDEO_JOB_NAME = 'process-you-tube-video'
|
||||
|
||||
/**
 * Extracts the video id from a YouTube URL.
 *
 * Recognised shapes: `youtu.be/<id>`, `youtube.com/watch?v=<id>` and
 * `youtube.com/{embed,shorts,live,v}/<id>` (with optional `www.`, `m.` or
 * `music.` host prefix, and the `youtube-nocookie.com` embed host).
 *
 * @param rawUrl URL string to inspect.
 * @returns the video id, or `undefined` when the string is not a parseable
 *          URL or does not look like a YouTube video URL.
 */
const extractYouTubeVideoId = (rawUrl: string): string | undefined => {
  let parsed: URL
  try {
    parsed = new URL(rawUrl)
  } catch {
    // Not an absolute URL at all.
    return undefined
  }

  const host = parsed.hostname.replace(/^(www|m|music)\./, '')
  if (host === 'youtu.be') {
    // Short links carry the id as the first path segment.
    return parsed.pathname.split('/').filter(Boolean)[0]
  }
  if (host !== 'youtube.com' && host !== 'youtube-nocookie.com') {
    return undefined
  }

  const fromQuery = parsed.searchParams.get('v')
  if (fromQuery) {
    return fromQuery
  }

  const [kind, id] = parsed.pathname.split('/').filter(Boolean)
  const idBearingPaths = ['embed', 'shorts', 'live', 'v']
  return kind !== undefined && idBearingPaths.includes(kind) ? id : undefined
}

/**
 * Background job that fetches YouTube metadata and the transcript for a
 * saved library item.
 *
 * The job exits early (logging why) when the library item is missing or not
 * in the `Succeeded` state, when its URL is not a YouTube video URL, or when
 * YouTube returns no video for the id.
 *
 * Failures are logged and swallowed, matching the previous behaviour.
 * NOTE(review): because nothing is rethrown, the `attempts: 3` option set
 * when the job is enqueued presumably never triggers a retry — confirm
 * whether best-effort is the intended contract.
 *
 * @param jobData ids of the user and library item to process.
 */
export const processYouTubeVideo = async (
  jobData: ProcessYouTubeVideoJobData
): Promise<void> => {
  try {
    const libraryItem = await authTrx(
      async (tx) =>
        tx
          .withRepository(libraryItemRepository)
          .findById(jobData.libraryItemId),
      undefined,
      jobData.userId
    )
    if (!libraryItem || libraryItem.state !== LibraryItemState.Succeeded) {
      logger.info(
        `Not ready to get YouTube metadata job state: ${
          libraryItem?.state ?? 'null'
        }`
      )
      return
    }

    // Look up the item's own video rather than a fixed sample id.
    const videoId = extractYouTubeVideoId(libraryItem.originalUrl)
    if (!videoId) {
      logger.info(
        `Not a YouTube video URL, skipping: ${libraryItem.originalUrl}`
      )
      return
    }

    const youtube = new YouTubeClient()
    // The lookup can come back empty; keep `undefined` in the type so the
    // guard below is checked instead of failing on `getTranscript()`.
    const video = (await youtube.getVideo(videoId)) as Video | undefined
    if (!video) {
      logger.info(`No YouTube video found for id: ${videoId}`)
      return
    }

    logger.info('Fetched YouTube video', {
      videoId,
      hasDescription: Boolean(video.description),
      chapters: Array.isArray(video.chapters) ? video.chapters.length : 0,
    })

    const transcript = await video.getTranscript()
    if (!transcript || transcript.length === 0) {
      logger.info(`No transcript found for YouTube video: ${videoId}`)
      return
    }

    const transcriptText = transcript.map((item) => item.text).join(' ')
    logger.info('Fetched YouTube transcript', {
      videoId,
      segments: transcript.length,
      characters: transcriptText.length,
    })

    // TODO: turn the raw transcript into readable markdown (e.g. via an LLM
    // prompt that adds punctuation and paragraphs), render it to HTML, inject
    // it into the `_omnivore_youtube_transcript` div of the item's content,
    // re-run `parsePreparedContent`, and persist the new `readableContent`.
  } catch (err) {
    logger.error('error processing YouTube video: ', err)
  }
}
|
||||
@ -7,6 +7,7 @@ import { Merge } from './util'
|
||||
import {
|
||||
enqueueAISummarizeJob,
|
||||
enqueueExportItem,
|
||||
enqueueProcessYouTubeVideo,
|
||||
enqueueTriggerRuleJob,
|
||||
enqueueWebhookJob,
|
||||
} from './utils/createTask'
|
||||
@ -17,6 +18,7 @@ import {
|
||||
findFeatureByName,
|
||||
getFeatureName,
|
||||
} from './services/features'
|
||||
import { processYouTubeVideo } from './jobs/get-youtube-info'
|
||||
|
||||
const logger = buildLogger('pubsub')
|
||||
|
||||
@ -89,7 +91,11 @@ export const createPubSubClient = (): PubsubClient => {
|
||||
})
|
||||
|
||||
if (await findFeatureByName(FeatureName.AISummaries, userId)) {
|
||||
await enqueueAISummarizeJob({
|
||||
// await enqueueAISummarizeJob({
|
||||
// userId,
|
||||
// libraryItemId,
|
||||
// })
|
||||
await enqueueProcessYouTubeVideo({
|
||||
userId,
|
||||
libraryItemId,
|
||||
})
|
||||
|
||||
@ -44,6 +44,10 @@ import { redisDataSource } from './redis_data_source'
|
||||
import { CACHED_READING_POSITION_PREFIX } from './services/cached_reading_position'
|
||||
import { getJobPriority } from './utils/createTask'
|
||||
import { logger } from './utils/logger'
|
||||
import {
|
||||
PROCESS_YOU_TUBE_VIDEO_JOB_NAME,
|
||||
processYouTubeVideo,
|
||||
} from './jobs/get-youtube-info'
|
||||
|
||||
export const QUEUE_NAME = 'omnivore-backend-queue'
|
||||
export const JOB_VERSION = 'v001'
|
||||
@ -116,6 +120,8 @@ export const createWorker = (connection: ConnectionOptions) =>
|
||||
return exportItem(job.data)
|
||||
case AI_SUMMARIZE_JOB_NAME:
|
||||
return aiSummarize(job.data)
|
||||
case PROCESS_YOU_TUBE_VIDEO_JOB_NAME:
|
||||
return processYouTubeVideo(job.data)
|
||||
case EXPORT_ALL_ITEMS_JOB_NAME:
|
||||
return exportAllItems(job.data)
|
||||
}
|
||||
|
||||
@ -45,6 +45,10 @@ import { stringToHash } from './helpers'
|
||||
import { logger } from './logger'
|
||||
import View = google.cloud.tasks.v2.Task.View
|
||||
import { AISummarizeJobData, AI_SUMMARIZE_JOB_NAME } from '../jobs/ai-summarize'
|
||||
import {
|
||||
PROCESS_YOU_TUBE_VIDEO_JOB_NAME,
|
||||
ProcessYouTubeVideoJobData,
|
||||
} from '../jobs/get-youtube-info'
|
||||
|
||||
// Instantiates a client.
|
||||
const client = new CloudTasksClient()
|
||||
@ -78,6 +82,8 @@ export const getJobPriority = (jobName: string): number => {
|
||||
case REFRESH_ALL_FEEDS_JOB_NAME:
|
||||
case THUMBNAIL_JOB:
|
||||
return 100
|
||||
case PROCESS_YOU_TUBE_VIDEO_JOB_NAME:
|
||||
return 20
|
||||
default:
|
||||
logger.error(`unknown job name: ${jobName}`)
|
||||
return 1
|
||||
@ -708,6 +714,20 @@ export const enqueueAISummarizeJob = async (data: AISummarizeJobData) => {
|
||||
})
|
||||
}
|
||||
|
||||
/**
 * Schedules a YouTube video processing job on the backend queue.
 *
 * @param data ids of the user and library item the job should process.
 * @returns whatever `queue.add` resolves to, or `undefined` when no backend
 *          queue is available.
 */
export const enqueueProcessYouTubeVideo = async (
  data: ProcessYouTubeVideoJobData
) => {
  const backendQueue = await getBackendQueue()
  if (backendQueue) {
    const jobOptions = {
      priority: getJobPriority(PROCESS_YOU_TUBE_VIDEO_JOB_NAME),
      attempts: 3,
    }
    return backendQueue.add(PROCESS_YOU_TUBE_VIDEO_JOB_NAME, data, jobOptions)
  }

  // No queue configured/available: nothing is scheduled.
  return undefined
}
|
||||
|
||||
export const bulkEnqueueUpdateLabels = async (data: UpdateLabelsData[]) => {
|
||||
const queue = await getBackendQueue()
|
||||
if (!queue) {
|
||||
|
||||
@ -86,9 +86,10 @@ export class YoutubeHandler extends ContentHandler {
|
||||
<meta property="og:type" content="video" />
|
||||
</head>
|
||||
<body>
|
||||
<iframe width="${width}" height="${height}" src="${src}" title="${escapedTitle}" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
|
||||
<p><a href="${url}" target="_blank">${escapedTitle}</a></p>
|
||||
<p itemscope="" itemprop="author" itemtype="http://schema.org/Person">By <a href="${oembed.author_url}" target="_blank">${authorName}</a></p>
|
||||
<iframe id="_omnivore_youtube_video" width="${width}" height="${height}" src="${src}" title="${escapedTitle}" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
|
||||
<p><a href="${url}" target="_blank">${escapedTitle}</a></p>
|
||||
<p itemscope="" itemprop="author" itemtype="http://schema.org/Person">By <a href="${oembed.author_url}" target="_blank">${authorName}</a></p>
|
||||
<div id="_omnivore_youtube_transcript"></div>
|
||||
</body>
|
||||
</html>`
|
||||
|
||||
|
||||
Reference in New Issue
Block a user