diff --git a/packages/api/package.json b/packages/api/package.json
index b387616f4..45ad5c0c9 100644
--- a/packages/api/package.json
+++ b/packages/api/package.json
@@ -97,6 +97,7 @@
"sanitize-html": "^2.3.2",
"sax": "^1.3.0",
"search-query-parser": "^1.6.0",
+ "showdown": "^2.1.0",
"snake-case": "^3.0.3",
"supertest": "^6.2.2",
"ts-loader": "^9.3.0",
@@ -107,7 +108,9 @@
"uuid": "^8.3.1",
"voca": "^1.4.0",
"winston": "^3.3.3",
- "word-counting": "^1.1.4"
+ "word-counting": "^1.1.4",
+ "youtubei": "^1.3.4",
+ "youtubei.js": "^9.1.0"
},
"devDependencies": {
"@babel/register": "^7.14.5",
@@ -136,6 +139,7 @@
"@types/private-ip": "^1.0.0",
"@types/sanitize-html": "^1.27.1",
"@types/sax": "^1.2.7",
+ "@types/showdown": "^2.0.6",
"@types/sinon": "^10.0.13",
"@types/sinon-chai": "^3.2.8",
"@types/supertest": "^2.0.11",
diff --git a/packages/api/src/jobs/get-youtube-info.ts b/packages/api/src/jobs/get-youtube-info.ts
new file mode 100644
index 000000000..9f7abbdf2
--- /dev/null
+++ b/packages/api/src/jobs/get-youtube-info.ts
@@ -0,0 +1,153 @@
+import { logger } from '../utils/logger'
+import { loadSummarizationChain } from 'langchain/chains'
+import { ChatOpenAI } from '@langchain/openai'
+import {
+ CharacterTextSplitter,
+ RecursiveCharacterTextSplitter,
+} from 'langchain/text_splitter'
+import { DocumentInterface } from '@langchain/core/documents'
+import { YoutubeLoader } from 'langchain/document_loaders/web/youtube'
+import { authTrx } from '../repository'
+import { libraryItemRepository } from '../repository/library_item'
+import { htmlToMarkdown, parsePreparedContent } from '../utils/parser'
+import { AISummary } from '../entity/AISummary'
+import { LibraryItem, LibraryItemState } from '../entity/library_item'
+import { getAISummary } from '../services/ai-summaries'
+import { YoutubeTranscript, TranscriptResponse } from 'youtube-transcript'
+import { Converter } from 'showdown'
+import { Video, Client as YouTubeClient } from 'youtubei'
+
+export interface ProcessYouTubeVideoJobData {
+ userId: string // passed to authTrx as the user id for the item lookup
+ libraryItemId: string // id of the library item loaded via findById
+}
+
+export const PROCESS_YOU_TUBE_VIDEO_JOB_NAME = 'process-you-tube-video'
+
+/** Characters allowed in a YouTube video id. */
+const YOUTUBE_VIDEO_ID_PATTERN = /^[\w-]+$/
+
+/** URL path prefixes that are directly followed by a video id. */
+const YOUTUBE_ID_PATH_PREFIXES = ['embed', 'shorts', 'live', 'v']
+
+/**
+ * Extracts the video id from a YouTube URL.
+ *
+ * Handles youtu.be/<id>, the v query parameter (as in /watch?v=<id>) and the
+ * /embed/<id>, /shorts/<id>, /live/<id> and /v/<id> paths on youtube.com, its
+ * subdomains and youtube-nocookie.com.
+ *
+ * @param url the URL to inspect
+ * @returns the video id, or undefined when url is not a recognisable YouTube
+ *   video URL
+ */
+const extractYouTubeVideoId = (url: string): string | undefined => {
+  let parsed: URL
+  try {
+    parsed = new URL(url)
+  } catch {
+    // not an absolute URL, so it cannot be a YouTube link
+    return undefined
+  }
+
+  const host = parsed.hostname.toLowerCase().replace(/^www\./, '')
+  const [firstSegment, secondSegment] = parsed.pathname
+    .split('/')
+    .filter(Boolean)
+
+  let candidate: string | null | undefined
+  if (host === 'youtu.be') {
+    candidate = firstSegment
+  } else if (
+    host === 'youtube.com' ||
+    host.endsWith('.youtube.com') ||
+    host === 'youtube-nocookie.com'
+  ) {
+    // prefer the v query parameter, then fall back to /<prefix>/<id> paths
+    candidate =
+      parsed.searchParams.get('v') ??
+      (YOUTUBE_ID_PATH_PREFIXES.includes(firstSegment)
+        ? secondSegment
+        : undefined)
+  }
+
+  return candidate && YOUTUBE_VIDEO_ID_PATTERN.test(candidate)
+    ? candidate
+    : undefined
+}
+
+/**
+ * Job handler that fetches the YouTube video behind a saved library item and
+ * logs its description, chapters and transcript.
+ *
+ * Does nothing unless the item exists, is in the Succeeded state and its
+ * originalUrl is a YouTube video URL. The transcript is only logged for now
+ * (see the TODO at the end of the body). Errors are logged and swallowed, so
+ * the returned promise never rejects.
+ *
+ * @param jobData ids of the library item to process and of the user the item
+ *   lookup runs as
+ */
+export const processYouTubeVideo = async (
+  jobData: ProcessYouTubeVideoJobData
+) => {
+  try {
+    const libraryItem = await authTrx(
+      async (tx) =>
+        tx
+          .withRepository(libraryItemRepository)
+          .findById(jobData.libraryItemId),
+      undefined,
+      jobData.userId
+    )
+    if (!libraryItem || libraryItem.state !== LibraryItemState.Succeeded) {
+      logger.info(
+        `Not ready to get YouTube metadata job state: ${
+          libraryItem?.state ?? 'null'
+        }`
+      )
+      return
+    }
+
+    // Look up the video this item points at rather than a fixed sample id.
+    const videoId = extractYouTubeVideoId(libraryItem.originalUrl)
+    if (!videoId) {
+      logger.info(`Not a YouTube video URL: ${libraryItem.originalUrl}`)
+      return
+    }
+
+    const youtube = new YouTubeClient()
+    // NOTE(review): getVideo appears to resolve to undefined when the video
+    // cannot be loaded; confirm against the youtubei docs.
+    const video = (await youtube.getVideo(videoId)) as Video | undefined
+    if (!video) {
+      logger.info(`YouTube video not found: ${videoId}`)
+      return
+    }
+
+    logger.info(`Fetched YouTube video ${videoId}`, {
+      description: video.description,
+      chapters: video.chapters,
+    })
+
+    const transcript = await video.getTranscript()
+    if (transcript) {
+      const transcriptText = transcript.map((item) => item.text).join(' ')
+      logger.info(`YouTube transcript for ${videoId}: ${transcriptText}`)
+    } else {
+      logger.info(`No YouTube transcript found for ${videoId}`)
+    }
+
+    // TODO: store the transcript on the library item. Planned steps:
+    //   1. ask the LLM (ChatOpenAI) to turn the transcript segments into
+    //      readable markdown with punctuation and paragraphs
+    //   2. convert the markdown to HTML with showdown's Converter
+    //   3. embed the transcript HTML in the item's page HTML and pass it
+    //      through parsePreparedContent
+    //   4. save the parsed content as libraryItem.readableContent, keeping
+    //      the existing content when parsing yields nothing
+  } catch (err) {
+    // logged and swallowed; NOTE(review): this also skips queue retries
+    logger.error('error processing YouTube video: ', err)
+  }
+}
diff --git a/packages/api/src/pubsub.ts b/packages/api/src/pubsub.ts
index 7bf25d921..5d2b21817 100644
--- a/packages/api/src/pubsub.ts
+++ b/packages/api/src/pubsub.ts
@@ -7,6 +7,7 @@ import { Merge } from './util'
import {
enqueueAISummarizeJob,
enqueueExportItem,
+ enqueueProcessYouTubeVideo,
enqueueTriggerRuleJob,
enqueueWebhookJob,
} from './utils/createTask'
@@ -17,6 +18,7 @@ import {
findFeatureByName,
getFeatureName,
} from './services/features'
+import { processYouTubeVideo } from './jobs/get-youtube-info'
const logger = buildLogger('pubsub')
@@ -89,7 +91,11 @@ export const createPubSubClient = (): PubsubClient => {
})
if (await findFeatureByName(FeatureName.AISummaries, userId)) {
- await enqueueAISummarizeJob({
+ // await enqueueAISummarizeJob({
+ // userId,
+ // libraryItemId,
+ // })
+ await enqueueProcessYouTubeVideo({
userId,
libraryItemId,
})
diff --git a/packages/api/src/queue-processor.ts b/packages/api/src/queue-processor.ts
index 959c866fc..7fc18f518 100644
--- a/packages/api/src/queue-processor.ts
+++ b/packages/api/src/queue-processor.ts
@@ -44,6 +44,10 @@ import { redisDataSource } from './redis_data_source'
import { CACHED_READING_POSITION_PREFIX } from './services/cached_reading_position'
import { getJobPriority } from './utils/createTask'
import { logger } from './utils/logger'
+import {
+ PROCESS_YOU_TUBE_VIDEO_JOB_NAME,
+ processYouTubeVideo,
+} from './jobs/get-youtube-info'
export const QUEUE_NAME = 'omnivore-backend-queue'
export const JOB_VERSION = 'v001'
@@ -116,6 +120,8 @@ export const createWorker = (connection: ConnectionOptions) =>
return exportItem(job.data)
case AI_SUMMARIZE_JOB_NAME:
return aiSummarize(job.data)
+ case PROCESS_YOU_TUBE_VIDEO_JOB_NAME:
+ return processYouTubeVideo(job.data)
case EXPORT_ALL_ITEMS_JOB_NAME:
return exportAllItems(job.data)
}
diff --git a/packages/api/src/utils/createTask.ts b/packages/api/src/utils/createTask.ts
index 56e209091..6c9345f8f 100644
--- a/packages/api/src/utils/createTask.ts
+++ b/packages/api/src/utils/createTask.ts
@@ -45,6 +45,10 @@ import { stringToHash } from './helpers'
import { logger } from './logger'
import View = google.cloud.tasks.v2.Task.View
import { AISummarizeJobData, AI_SUMMARIZE_JOB_NAME } from '../jobs/ai-summarize'
+import {
+ PROCESS_YOU_TUBE_VIDEO_JOB_NAME,
+ ProcessYouTubeVideoJobData,
+} from '../jobs/get-youtube-info'
// Instantiates a client.
const client = new CloudTasksClient()
@@ -78,6 +82,8 @@ export const getJobPriority = (jobName: string): number => {
case REFRESH_ALL_FEEDS_JOB_NAME:
case THUMBNAIL_JOB:
return 100
+ case PROCESS_YOU_TUBE_VIDEO_JOB_NAME:
+ return 20
default:
logger.error(`unknown job name: ${jobName}`)
return 1
@@ -708,6 +714,20 @@ export const enqueueAISummarizeJob = async (data: AISummarizeJobData) => {
})
}
+// Adds a process-you-tube-video job to the backend queue (attempts: 3).
+export const enqueueProcessYouTubeVideo = async (
+  data: ProcessYouTubeVideoJobData
+) => {
+  const backendQueue = await getBackendQueue()
+  if (!backendQueue) return undefined // no queue to add the job to
+
+  const jobOptions = {
+    priority: getJobPriority(PROCESS_YOU_TUBE_VIDEO_JOB_NAME),
+    attempts: 3,
+  }
+  return backendQueue.add(PROCESS_YOU_TUBE_VIDEO_JOB_NAME, data, jobOptions)
+}
+
export const bulkEnqueueUpdateLabels = async (data: UpdateLabelsData[]) => {
const queue = await getBackendQueue()
if (!queue) {
diff --git a/packages/content-handler/src/websites/youtube-handler.ts b/packages/content-handler/src/websites/youtube-handler.ts
index e86eda113..33d24cf05 100644
--- a/packages/content-handler/src/websites/youtube-handler.ts
+++ b/packages/content-handler/src/websites/youtube-handler.ts
@@ -86,9 +86,10 @@ export class YoutubeHandler extends ContentHandler {
-
- ${escapedTitle}
- By ${authorName}
+
+ ${escapedTitle}
+ By ${authorName}
+