diff --git a/packages/api/package.json b/packages/api/package.json
index b387616f4..45ad5c0c9 100644
--- a/packages/api/package.json
+++ b/packages/api/package.json
@@ -97,6 +97,7 @@
     "sanitize-html": "^2.3.2",
     "sax": "^1.3.0",
     "search-query-parser": "^1.6.0",
+    "showdown": "^2.1.0",
     "snake-case": "^3.0.3",
     "supertest": "^6.2.2",
     "ts-loader": "^9.3.0",
@@ -107,7 +108,9 @@
     "uuid": "^8.3.1",
     "voca": "^1.4.0",
     "winston": "^3.3.3",
-    "word-counting": "^1.1.4"
+    "word-counting": "^1.1.4",
+    "youtubei": "^1.3.4",
+    "youtubei.js": "^9.1.0"
   },
   "devDependencies": {
     "@babel/register": "^7.14.5",
@@ -136,6 +139,7 @@
     "@types/private-ip": "^1.0.0",
     "@types/sanitize-html": "^1.27.1",
     "@types/sax": "^1.2.7",
+    "@types/showdown": "^2.0.6",
     "@types/sinon": "^10.0.13",
     "@types/sinon-chai": "^3.2.8",
     "@types/supertest": "^2.0.11",
diff --git a/packages/api/src/jobs/get-youtube-info.ts b/packages/api/src/jobs/get-youtube-info.ts
new file mode 100644
index 000000000..9f7abbdf2
--- /dev/null
+++ b/packages/api/src/jobs/get-youtube-info.ts
@@ -0,0 +1,123 @@
+import { logger } from '../utils/logger'
+import { loadSummarizationChain } from 'langchain/chains'
+import { ChatOpenAI } from '@langchain/openai'
+import {
+  CharacterTextSplitter,
+  RecursiveCharacterTextSplitter,
+} from 'langchain/text_splitter'
+import { DocumentInterface } from '@langchain/core/documents'
+import { YoutubeLoader } from 'langchain/document_loaders/web/youtube'
+import { authTrx } from '../repository'
+import { libraryItemRepository } from '../repository/library_item'
+import { htmlToMarkdown, parsePreparedContent } from '../utils/parser'
+import { AISummary } from '../entity/AISummary'
+import { LibraryItem, LibraryItemState } from '../entity/library_item'
+import { getAISummary } from '../services/ai-summaries'
+import { YoutubeTranscript, TranscriptResponse } from 'youtube-transcript'
+import { Converter } from 'showdown'
+import { Video, Client as YouTubeClient } from 'youtubei'
+
+/** Payload of the queue job that processes a saved YouTube video. */
+export interface ProcessYouTubeVideoJobData {
+  userId: string
+  libraryItemId: string
+}
+
+export const PROCESS_YOU_TUBE_VIDEO_JOB_NAME = 'process-you-tube-video'
+
+/**
+ * Extracts the video id from a YouTube URL.
+ *
+ * Handles `youtu.be/<id>`, `/watch?v=<id>` and the `/embed/<id>`,
+ * `/shorts/<id>`, `/live/<id>` and `/v/<id>` path forms.
+ *
+ * @param url - URL the library item was saved from
+ * @returns the video id, or `undefined` when `url` is not a recognizable
+ * YouTube video URL
+ */
+const getYouTubeVideoId = (url: string): string | undefined => {
+  let parsed: URL
+  try {
+    parsed = new URL(url)
+  } catch {
+    return undefined
+  }
+
+  const host = parsed.hostname.replace(/^(www|m|music)\./, '')
+  if (host === 'youtu.be') {
+    return parsed.pathname.split('/')[1] || undefined
+  }
+  if (host !== 'youtube.com' && host !== 'youtube-nocookie.com') {
+    return undefined
+  }
+  if (parsed.pathname === '/watch') {
+    return parsed.searchParams.get('v') || undefined
+  }
+
+  const match = /^\/(?:embed|shorts|live|v)\/([^/]+)/.exec(parsed.pathname)
+  return match?.[1]
+}
+
+/**
+ * Queue job handler: fetches the transcript of the YouTube video behind a
+ * library item.
+ *
+ * The job does nothing unless the item exists, is in the `Succeeded` state
+ * and its `originalUrl` is a YouTube video URL. Errors are logged, not
+ * rethrown.
+ *
+ * @param jobData - ids of the user and of the library item to process
+ */
+export const processYouTubeVideo = async (
+  jobData: ProcessYouTubeVideoJobData
+) => {
+  try {
+    const libraryItem = await authTrx(
+      async (tx) =>
+        tx
+          .withRepository(libraryItemRepository)
+          .findById(jobData.libraryItemId),
+      undefined,
+      jobData.userId
+    )
+    if (!libraryItem || libraryItem.state !== LibraryItemState.Succeeded) {
+      logger.info(
+        `Not ready to get YouTube metadata job state: ${
+          libraryItem?.state ?? 'null'
+        }`
+      )
+      return
+    }
+
+    // `getVideo` is called with a video id rather than a URL (see the
+    // youtubei docs), so derive the id from the saved item's URL.
+    const videoId = getYouTubeVideoId(libraryItem.originalUrl)
+    if (!videoId) {
+      logger.info(`Not a YouTube video URL: ${libraryItem.originalUrl}`)
+      return
+    }
+
+    const youtube = new YouTubeClient()
+    const video = (await youtube.getVideo(videoId)) as Video | undefined
+    if (!video) {
+      logger.info(`YouTube video not found: ${videoId}`)
+      return
+    }
+
+    const transcript = await video.getTranscript()
+    if (!transcript) {
+      logger.info(`No transcript found for YouTube video: ${videoId}`)
+      return
+    }
+
+    logger.info(
+      `Fetched ${transcript.length} transcript segments for YouTube video: ${videoId}`
+    )
+
+    // TODO: turn the transcript segments into readable, punctuated markdown
+    // with an LLM, convert the markdown to HTML (showdown `Converter`), run the
+    // HTML through `parsePreparedContent` and save the result as the library
+    // item's `readableContent`.
+  } catch (err) {
+    logger.error('error processing YouTube video: ', err)
+  }
+}
diff --git a/packages/api/src/pubsub.ts b/packages/api/src/pubsub.ts
index 7bf25d921..5d2b21817 100644
--- a/packages/api/src/pubsub.ts
+++ b/packages/api/src/pubsub.ts
@@ -7,6 +7,7 @@ import { Merge } from './util'
 import {
   enqueueAISummarizeJob,
   enqueueExportItem,
+  enqueueProcessYouTubeVideo,
   enqueueTriggerRuleJob,
   enqueueWebhookJob,
 } from './utils/createTask'
@@ -17,6 +18,7 @@ import {
   findFeatureByName,
   getFeatureName,
 } from './services/features'
+import { processYouTubeVideo } from './jobs/get-youtube-info'
 
 const logger = buildLogger('pubsub')
 
@@ -89,7 +91,11 @@ export const createPubSubClient = (): PubsubClient => {
       })
 
       if (await findFeatureByName(FeatureName.AISummaries, userId)) {
         await enqueueAISummarizeJob({
           userId,
           libraryItemId,
         })
+        await enqueueProcessYouTubeVideo({
+          userId,
+          libraryItemId,
+        })
diff --git a/packages/api/src/queue-processor.ts b/packages/api/src/queue-processor.ts
index 959c866fc..7fc18f518 100644
--- a/packages/api/src/queue-processor.ts
+++ b/packages/api/src/queue-processor.ts
@@ -44,6 +44,10 @@ import { redisDataSource } from './redis_data_source'
 import { CACHED_READING_POSITION_PREFIX } from './services/cached_reading_position'
 import { getJobPriority } from
'./utils/createTask' import { logger } from './utils/logger' +import { + PROCESS_YOU_TUBE_VIDEO_JOB_NAME, + processYouTubeVideo, +} from './jobs/get-youtube-info' export const QUEUE_NAME = 'omnivore-backend-queue' export const JOB_VERSION = 'v001' @@ -116,6 +120,8 @@ export const createWorker = (connection: ConnectionOptions) => return exportItem(job.data) case AI_SUMMARIZE_JOB_NAME: return aiSummarize(job.data) + case PROCESS_YOU_TUBE_VIDEO_JOB_NAME: + return processYouTubeVideo(job.data) case EXPORT_ALL_ITEMS_JOB_NAME: return exportAllItems(job.data) } diff --git a/packages/api/src/utils/createTask.ts b/packages/api/src/utils/createTask.ts index 56e209091..6c9345f8f 100644 --- a/packages/api/src/utils/createTask.ts +++ b/packages/api/src/utils/createTask.ts @@ -45,6 +45,10 @@ import { stringToHash } from './helpers' import { logger } from './logger' import View = google.cloud.tasks.v2.Task.View import { AISummarizeJobData, AI_SUMMARIZE_JOB_NAME } from '../jobs/ai-summarize' +import { + PROCESS_YOU_TUBE_VIDEO_JOB_NAME, + ProcessYouTubeVideoJobData, +} from '../jobs/get-youtube-info' // Instantiates a client. 
const client = new CloudTasksClient()
@@ -78,6 +82,8 @@ export const getJobPriority = (jobName: string): number => {
     case REFRESH_ALL_FEEDS_JOB_NAME:
     case THUMBNAIL_JOB:
       return 100
+    case PROCESS_YOU_TUBE_VIDEO_JOB_NAME:
+      return 20
     default:
       logger.error(`unknown job name: ${jobName}`)
       return 1
@@ -708,6 +714,28 @@ export const enqueueAISummarizeJob = async (data: AISummarizeJobData) => {
   })
 }
 
+/**
+ * Queues a job that processes the YouTube video behind a library item.
+ *
+ * @param data - ids of the user and of the library item to process
+ * @returns the job returned by the queue's `add`, or `undefined` when
+ * `getBackendQueue()` provides no queue
+ */
+export const enqueueProcessYouTubeVideo = async (
+  data: ProcessYouTubeVideoJobData
+) => {
+  const backendQueue = await getBackendQueue()
+  if (backendQueue) {
+    const jobOptions = {
+      priority: getJobPriority(PROCESS_YOU_TUBE_VIDEO_JOB_NAME),
+      attempts: 3,
+    }
+    return backendQueue.add(PROCESS_YOU_TUBE_VIDEO_JOB_NAME, data, jobOptions)
+  }
+
+  return undefined
+}
+
 export const bulkEnqueueUpdateLabels = async (data: UpdateLabelsData[]) => {
   const queue = await getBackendQueue()
   if (!queue) {
diff --git a/packages/content-handler/src/websites/youtube-handler.ts b/packages/content-handler/src/websites/youtube-handler.ts
index e86eda113..33d24cf05 100644
--- a/packages/content-handler/src/websites/youtube-handler.ts
+++ b/packages/content-handler/src/websites/youtube-handler.ts
@@ -86,9 +86,10 @@ export class YoutubeHandler extends ContentHandler { - -

${escapedTitle}

- + +

${escapedTitle}

+ +
`