Improve transcript generation

This commit is contained in:
Jackson Harper
2024-03-13 17:50:07 +08:00
parent a7ad67b3bb
commit 3ee6787e39
4 changed files with 118 additions and 8 deletions

View File

@ -46,6 +46,7 @@
"@sentry/integrations": "^7.10.0",
"@sentry/node": "^5.26.0",
"@sentry/tracing": "^7.9.0",
"@types/showdown": "^2.0.6",
"addressparser": "^1.0.1",
"apollo-datasource": "^3.3.1",
"apollo-server-express": "^3.6.3",

View File

@ -4,6 +4,11 @@ import { libraryItemRepository } from '../repository/library_item'
import { LibraryItem, LibraryItemState } from '../entity/library_item'
import { Chapter, Client as YouTubeClient } from 'youtubei'
import showdown from 'showdown'
import { parseHTML } from 'linkedom'
import { parsePreparedContent } from '../utils/parser'
import { OpenAI } from '@langchain/openai'
import { PromptTemplate } from '@langchain/core/prompts'
export interface ProcessYouTubeVideoJobData {
userId: string
@ -39,7 +44,7 @@ export const addTranscriptChapters = (
for (const chapter of chapters) {
const startOffset = chapter.start
const title = '## ' + chapter.title + '\n\n'
const title = '\n\n## ' + chapter.title + '\n\n'
const index = transcript.findIndex(
(textItem) => textItem.start > startOffset
@ -58,6 +63,92 @@ export const addTranscriptChapters = (
return transcript
}
/**
 * Convert a YouTube transcript into HTML.
 *
 * When both YOUTUBE_TRANSCRIPT_PROMPT and OPENAI_API_KEY are configured,
 * the transcript is summarized/reformatted by the LLM in chunks; otherwise
 * (or if the LLM returns nothing) the raw transcript text is used. The
 * resulting markdown is converted to HTML with showdown.
 *
 * @param transcript - ordered transcript segments (text + timing)
 * @returns HTML string rendered from the generated markdown
 */
export const createTranscriptHTML = async (
  transcript: TranscriptProperties[]
): Promise<string> => {
  let transcriptMarkdown = ''

  if (process.env.YOUTUBE_TRANSCRIPT_PROMPT && process.env.OPENAI_API_KEY) {
    const llm = new OpenAI({
      modelName: 'gpt-4',
      configuration: {
        apiKey: process.env.OPENAI_API_KEY,
      },
    })
    const promptTemplate = PromptTemplate.fromTemplate(
      `${process.env.YOUTUBE_TRANSCRIPT_PROMPT}
Data:
{transcriptData}`
    )
    const chain = promptTemplate.pipe(llm)

    // Feed the transcript to the LLM in ~8000-character chunks so each
    // request stays within the model's context window.
    const chunkOutputs: string[] = []
    let chunk: TranscriptProperties[] = []
    let chunkLength = 0

    // Send the pending chunk (if any) through the chain and reset it.
    const flushChunk = async (): Promise<void> => {
      if (chunk.length < 1) {
        return
      }
      const result = await chain.invoke({
        transcriptData: chunk.map((t) => t.text).join(' '),
      })
      chunkOutputs.push(result)
      chunk = []
      chunkLength = 0
    }

    for (const item of transcript) {
      if (chunkLength + item.text.length > 8000) {
        await flushChunk()
      }
      chunk.push(item)
      chunkLength += item.text.length
    }
    await flushChunk()

    // Join with a blank line so markdown blocks from adjacent chunks
    // (e.g. a trailing paragraph and a following heading) don't merge.
    transcriptMarkdown = chunkOutputs.join('\n\n')
  }

  // If the LLM is disabled or didn't give us any data, fall back to the
  // raw transcript text.
  if (transcriptMarkdown.length < 1) {
    transcriptMarkdown = transcript.map((item) => item.text).join(' ')
  }

  const converter = new showdown.Converter()
  return converter.makeHtml(transcriptMarkdown)
}
/**
 * Inject the transcript HTML into an item's stored original HTML and
 * re-run readability parsing so the transcript becomes part of the
 * readable content.
 *
 * @param originalUrl - the item's original URL (passed through to the parser)
 * @param originalHTML - the stored original page HTML
 * @param transcriptHTML - transcript markup to insert
 * @returns the re-parsed readable content, or undefined if parsing yields none
 */
export const addTranscriptToReadableContent = async (
  originalUrl: string,
  originalHTML: string,
  transcriptHTML: string
): Promise<string | undefined> => {
  const { document } = parseHTML(originalHTML)

  const placeholder = document.querySelector('#_omnivore_youtube_transcript')
  if (placeholder) {
    // The content handler left a transcript placeholder — fill it in.
    placeholder.innerHTML = transcriptHTML
  } else {
    // No placeholder (presumably an older item) — append the transcript
    // to the end of the body instead.
    const wrapper = document.createElement('div')
    wrapper.innerHTML = transcriptHTML
    document.body.appendChild(wrapper)
  }

  const prepared = {
    document: document.toString(),
    pageInfo: {},
  }
  const parsed = await parsePreparedContent(originalUrl, prepared, true)
  return parsed.parsedContent?.content
}
export const processYouTubeVideo = async (
jobData: ProcessYouTubeVideoJobData
) => {
@ -70,7 +161,11 @@ export const processYouTubeVideo = async (
undefined,
jobData.userId
)
if (!libraryItem || libraryItem.state !== LibraryItemState.Succeeded) {
if (
!libraryItem ||
libraryItem.state !== LibraryItemState.Succeeded ||
!libraryItem.originalContent
) {
logger.info(
`Not ready to get YouTube metadata job state: ${
libraryItem?.state ?? 'null'
@ -112,19 +207,28 @@ export const processYouTubeVideo = async (
let chapters: Chapter[] = []
if ('chapters' in video) {
chapters = video.chapters
console.log('video.chapters: ', video.chapters)
}
let transcript: TranscriptProperties[] | undefined = undefined
if ('getTranscript' in video) {
transcript = await video.getTranscript()
console.log('transcript: ', transcript)
}
if (transcript) {
if (chapters) {
transcript = addTranscriptChapters(chapters, transcript)
}
const transcriptHTML = await createTranscriptHTML(transcript)
const updatedContent = await addTranscriptToReadableContent(
libraryItem.originalUrl,
libraryItem.originalContent,
transcriptHTML
)
if (updatedContent) {
needsUpdate = true
libraryItem.readableContent = updatedContent
}
}
if (needsUpdate) {

View File

@ -725,6 +725,7 @@ export const enqueueProcessYouTubeVideo = async (
return queue.add(PROCESS_YOUTUBE_VIDEO_JOB_NAME, data, {
priority: getJobPriority(PROCESS_YOUTUBE_VIDEO_JOB_NAME),
attempts: 3,
delay: 2000,
})
}

View File

@ -86,10 +86,14 @@ export class YoutubeHandler extends ContentHandler {
<meta property="og:type" content="video" />
</head>
<body>
<iframe id="_omnivore_youtube_video" width="${width}" height="${height}" src="${src}" title="${escapedTitle}" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
<p><a href="${url}" target="_blank">${escapedTitle}</a></p>
<p itemscope="" itemprop="author" itemtype="http://schema.org/Person">By <a href="${oembed.author_url}" target="_blank">${authorName}</a></p>
<div id="_omnivore_youtube_transcript"></div>
<div>
<article id="_omnivore_youtube">
<iframe id="_omnivore_youtube_video" width="${width}" height="${height}" src="${src}" title="${escapedTitle}" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
<p><a href="${url}" target="_blank">${escapedTitle}</a></p>
<p itemscope="" itemprop="author" itemtype="http://schema.org/Person">By <a href="${oembed.author_url}" target="_blank">${authorName}</a></p>
<div id="_omnivore_youtube_transcript"></div>
</article>
</div>
</body>
</html>`