diff --git a/packages/api/src/utils/createTask.ts b/packages/api/src/utils/createTask.ts
index 41174d891..c69d5a986 100644
--- a/packages/api/src/utils/createTask.ts
+++ b/packages/api/src/utils/createTask.ts
@@ -327,14 +327,12 @@ export const enqueueSyncWithIntegration = async (
 
 export const enqueueTextToSpeech = async (
   userId: string,
-  pageId: string,
-  text: string
+  pageId: string
 ): Promise => {
   const { GOOGLE_CLOUD_PROJECT } = process.env
   const payload = {
     userId,
     pageId,
-    text,
   }
 
   // If there is no Google Cloud Project Id exposed, it means that we are in local environment
diff --git a/packages/api/src/utils/parser.ts b/packages/api/src/utils/parser.ts
index 8900fc5d3..2e9997e58 100644
--- a/packages/api/src/utils/parser.ts
+++ b/packages/api/src/utils/parser.ts
@@ -276,6 +276,48 @@ export const parsePreparedContent = async (
     })
     article.content = article.dom.outerHTML
   }
+
+  const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [
+    'omnivore-highlight-id',
+    'data-twitter-tweet-id',
+    'data-instagram-id',
+  ]
+
+  // Get the top level element?
+  const pageNode = article.dom.firstElementChild as HTMLElement
+  console.log('pageNode: ', pageNode)
+  const nodesToVisitStack: [HTMLElement] = [pageNode]
+  const visitedNodeList = []
+
+  while (nodesToVisitStack.length > 0) {
+    const currentNode = nodesToVisitStack.pop()
+    console.log('currentNode: ', currentNode?.nodeType)
+    if (
+      currentNode?.nodeType !== 1 ||
+      // Avoiding dynamic elements from being counted as anchor-allowed elements
+      ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES.some((attrib) =>
+        currentNode.hasAttribute(attrib)
+      )
+    ) {
+      continue
+    }
+    visitedNodeList.push(currentNode)
+    ;[].slice
+      .call(currentNode.childNodes)
+      .reverse()
+      .forEach(function (node) {
+        nodesToVisitStack.push(node)
+      })
+  }
+
+  visitedNodeList.shift()
+  visitedNodeList.forEach((node, index) => {
+    // start from index 1, index 0 reserved for anchor unknown.
+    node.setAttribute('data-omnivore-anchor-idx', (index + 1).toString())
+  })
+
+  console.log('article content:', article.dom.outerHTML)
+  article.content = article.dom.outerHTML
 }
 
 const newWindow = parseHTML('')
diff --git a/packages/api/src/utils/textToSpeech.ts b/packages/api/src/utils/textToSpeech.ts
index a4daa9743..53aefe881 100644
--- a/packages/api/src/utils/textToSpeech.ts
+++ b/packages/api/src/utils/textToSpeech.ts
@@ -10,13 +10,12 @@ import {
   SpeechSynthesizer,
 } from 'microsoft-cognitiveservices-speech-sdk'
 import { env } from '../env'
+import { parseHTML } from 'linkedom'
 
 export interface TextToSpeechInput {
   id: string
   text: string
   voice?: string
-  textType?: 'text' | 'ssml'
-  engine?: 'standard' | 'neural'
   languageCode?: string
 }
 
@@ -27,16 +26,14 @@ export interface TextToSpeechOutput {
 
 export interface SpeechMark {
   time: number
-  start: number
-  length: number
+  start?: number
+  length?: number
   word: string
+  type: 'word' | 'bookmark'
 }
 
 const logger = buildLogger('app.dispatch')
 
-// // create a new AWS Polly client
-// const client = new AWS.Polly()
-
 export const synthesizeTextToSpeech = async (
   input: TextToSpeechInput
 ): Promise => {
@@ -69,10 +66,9 @@ export const synthesizeTextToSpeech = async (
   // The event synthesis completed signals that the synthesis is completed.
   synthesizer.synthesisCompleted = (s, e) => {
     logger.info(
-      '(synthesized) Reason: ' +
-        ResultReason[e.result.reason] +
-        ' Audio length: ' +
+      `(synthesized) Reason: ${ResultReason[e.result.reason]} Audio length: ${
         e.result.audioData.byteLength
+      }`
     )
   }
 
@@ -100,6 +96,20 @@ export const synthesizeTextToSpeech = async (
       time: (timeOffset + e.audioOffset) / 10000,
       start: characterOffset + e.textOffset,
       length: e.wordLength,
+      type: 'word',
+    })
+  }
+
+  synthesizer.bookmarkReached = (s, e) => {
+    logger.info(
+      `(Bookmark reached), Audio offset: ${
+        e.audioOffset / 10000
+      }ms, bookmark text: ${e.text}`
+    )
+    speechMarks.push({
+      word: e.text,
+      time: (timeOffset + e.audioOffset) / 10000,
+      type: 'bookmark',
     })
   }
 
@@ -138,84 +148,39 @@ export const synthesizeTextToSpeech = async (
   }
 }
 
-// export const createAudio = async (
-//   input: TextToSpeechInput
-// ): Promise => {
-//   const { text, voice, textType, engine, languageCode } = input
-//   const params: SynthesizeSpeechInput = {
-//     OutputFormat: 'ogg_vorbis',
-//     Text: text,
-//     TextType: textType || 'text',
-//     VoiceId: voice || 'Joanna',
-//     Engine: engine || 'neural',
-//     LanguageCode: languageCode || 'en-US',
-//   }
-//   try {
-//     const data = await client.synthesizeSpeech(params).promise()
-//     return data.AudioStream as Buffer
-//   } catch (error) {
-//     logger.error('Unable to create audio file', { error })
-//     throw error
-//   }
-// }
+export const htmlToSsml = (
+  html: string,
+  language = 'en-US',
+  voice = 'en-US-JennyNeural',
+  rate = 100,
+  volume = 100
+): string => {
+  const document = parseHTML(html).document
+  const paragraphs = document.querySelectorAll('p')
+  // create new ssml document
+  const ssml = parseHTML('').document
+  const speakElement = ssml.createElement('speak')
+  speakElement.setAttribute('version', '1.0')
+  speakElement.setAttribute('xmlns', 'http://www.w3.org/2001/10/synthesis')
+  speakElement.setAttribute('xml:lang', language)
+  const voiceElement = ssml.createElement('voice')
+  voiceElement.setAttribute('name', voice)
+  speakElement.appendChild(voiceElement)
+  const prosodyElement = ssml.createElement('prosody')
+  prosodyElement.setAttribute('rate', `${rate}%`)
+  prosodyElement.setAttribute('volume', volume.toString())
+  voiceElement.appendChild(prosodyElement)
+  // add each paragraph to the ssml document
+  paragraphs.forEach((p) => {
+    const id = p.getAttribute('data-omnivore-anchor-idx')
+    if (id) {
+      const text = p.innerText
+      const bookMark = ssml.createElement('bookmark')
+      bookMark.setAttribute('mark', `data-omnivore-anchor-idx-${id}`)
+      bookMark.innerText = text
+      prosodyElement.appendChild(bookMark)
+    }
+  })
-
-// export const createSpeechMarks = async (
-//   input: TextToSpeechInput
-// ): Promise => {
-//   const { text, voice, textType, engine, languageCode } = input
-//   const params: SynthesizeSpeechInput = {
-//     OutputFormat: 'json',
-//     Text: text,
-//     TextType: textType || 'text',
-//     VoiceId: voice || 'Joanna',
-//     Engine: engine || 'neural',
-//     SpeechMarkTypes: ['word'],
-//     LanguageCode: languageCode || 'en-US',
-//   }
-//   try {
-//     const data = await client.synthesizeSpeech(params).promise()
-//     return (data.AudioStream as Buffer).toString()
-//   } catch (error) {
-//     logger.error('Unable to create speech marks', { error })
-//     throw error
-//   }
-// }
-//
-// export const createAudioWithSpeechMarks = async (
-//   input: TextToSpeechInput
-// ): Promise => {
-//   try {
-//     const audio = await createAudio(input)
-//     // upload audio to google cloud storage
-//     const filePath = `speech/${input.id}.ogg`
-//
-//     logger.info('start uploading...', { filePath })
-//     await uploadToBucket(filePath, audio, {
-//       contentType: 'audio/ogg',
-//       public: true,
-//     })
-//
-//     // get public url for audio file
-//     const publicUrl = getFilePublicUrl(filePath)
-//     logger.info('upload complete', { publicUrl })
-//
-//     const speechMarks = await createSpeechMarks(input)
-//     return {
-//       audioUrl: publicUrl,
-//       speechMarks,
-//     }
-//   } catch (error) {
-//     logger.error('Unable to create audio with speech marks', error)
-//     throw error
-//   }
-// }
-
-// export const htmlToSsml = (
-//   html: string,
-//   language = 'en-US',
-//   voice = 'en-US-JennyNeural',
-//   rate = 100,
-//   volume = 100
-// ): string => {
-//   return `${html}`
-// }
+
+  return speakElement.outerHTML
+}
diff --git a/packages/api/test/utils/textToSpeech.test.ts b/packages/api/test/utils/textToSpeech.test.ts
index 48a65abe0..89d8f5edf 100644
--- a/packages/api/test/utils/textToSpeech.test.ts
+++ b/packages/api/test/utils/textToSpeech.test.ts
@@ -11,16 +11,9 @@ describe('textToSpeech', () => {
   it('should create an audio file with speech marks', async () => {
     const input: TextToSpeechInput = {
       id: generateFakeUuid(),
-      text:
-        '《太阁立志传5 DX》清洲会议触发教程\n' +
-        '玩家要亲历清洲会议事件,需要位于织田家。\n' +
-        '清洲会议需要完成以下条件才能触发:\n' +
-        '本能寺发生之后,织田信长和织田信忠死亡。\n' +
-        '羽柴秀吉、柴田胜家、织田信雄、织田信孝为大名。\n' +
-        '清洲城必须为信雄的直辖城,或者清洲城主为信雄一方。\n' +
-        '前两个条件都很容易达成,主要是要保证清洲城主为信雄这一条件比较难办,需要玩家控制城主封地。',
-      languageCode: 'zh-CN',
-      voice: 'zh-CN-XiaochenNeural',
+      text: 'Marry had a little lamb',
+      languageCode: 'en-US',
+      voice: 'en-US-JennyNeural',
     }
     const output = await synthesizeTextToSpeech(input)
     expect(output.audioUrl).to.be.a('string')