From ae4c01f2d3abd7c08e36f72dddd6e0d2b4876ba0 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 3 Oct 2022 17:23:13 +0800 Subject: [PATCH] Split utterance into chunks of 256 chars --- packages/text-to-speech/src/htmlToSsml.ts | 78 +++++++++++++-------- packages/text-to-speech/src/textToSpeech.ts | 27 +++---- 2 files changed, 57 insertions(+), 48 deletions(-) diff --git a/packages/text-to-speech/src/htmlToSsml.ts b/packages/text-to-speech/src/htmlToSsml.ts index 5e0741c8c..a64a370bf 100644 --- a/packages/text-to-speech/src/htmlToSsml.ts +++ b/packages/text-to-speech/src/htmlToSsml.ts @@ -255,7 +255,7 @@ export const stripEmojis = (text: string): string => { return text.replace(emojiRegex, '').replace(/\s+/g, ' ') } -const textToUtterance = ({ +const textToUtterances = ({ tokenizer, idx, textItems, @@ -269,32 +269,51 @@ const textToUtterance = ({ wordOffset: number voice?: string isHtml?: boolean -}): Utterance => { - const text = textItems.join('') - let textWithWordOffset = text - if (isHtml) { - try { - textWithWordOffset = htmlToText(text, { wordwrap: false }) - } catch (err) { - console.error( - 'Unable to convert HTML to text, html:', +}): Utterance[] => { + let text = textItems.join('') + if (!isHtml) { + // for title + const wordCount = tokenizer.tokenize(text).length + return [ + { + idx, text, - ', error:', - err - ) - textWithWordOffset = - parseHTML(text).document.documentElement.textContent ?? text - console.info('Converted HTML to text:', textWithWordOffset) + wordOffset, + wordCount, + voice, + }, + ] + } + + const utterances: Utterance[] = [] + try { + text = htmlToText(text, { wordwrap: false }) + } catch (err) { + console.error( + 'Unable to convert HTML to text, html:', + text, + ', error:', + err + ) + text = parseHTML(text).document.documentElement.textContent ?? text + console.info('Converted HTML to text:', text) + } + // split text into chunks of 256 characters to stream faster without breaking on words + const textChunks = text.match(/.{1,256}(?= |$)/g) + if (textChunks) { + for (const chunk of textChunks) { + const wordCount = tokenizer.tokenize(chunk).length + utterances.push({ + idx, + text: chunk, + wordOffset, + wordCount, + voice, + }) + wordOffset += wordCount } } - const wordCount = tokenizer.tokenize(textWithWordOffset).length - return { - idx, - text, - wordOffset, - wordCount, - voice, - } + return utterances } export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => { @@ -331,13 +350,13 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => { let wordOffset = 0 if (title) { // first utterances is the title - const titleUtterance = textToUtterance({ + const titleUtterance = textToUtterances({ tokenizer, idx: '', textItems: [cleanText(title)], // title could have HTML entity names like & or emoji wordOffset, isHtml: false, - }) + })[0] utterances.push(titleUtterance) wordOffset += titleUtterance.wordCount } @@ -351,7 +370,7 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => { // use paragraph as anchor const idx = i.toString() i = emitElement(textItems, node, true) - const utterance = textToUtterance({ + const newUtterances = textToUtterances({ tokenizer, idx, textItems, @@ -359,8 +378,9 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => { voice: node.nodeName === 'BLOCKQUOTE' ? options.secondaryVoice : undefined, }) - utterance.wordCount > 0 && utterances.push(utterance) - wordOffset += utterance.wordCount + const wordCount = newUtterances.reduce((acc, u) => acc + u.wordCount, 0) + wordCount > 0 && utterances.push(...newUtterances) + wordOffset += wordCount } } diff --git a/packages/text-to-speech/src/textToSpeech.ts b/packages/text-to-speech/src/textToSpeech.ts index aa5f39713..c7c35fed3 100644 --- a/packages/text-to-speech/src/textToSpeech.ts +++ b/packages/text-to-speech/src/textToSpeech.ts @@ -138,28 +138,17 @@ export const synthesizeTextToSpeech = async ( } } // for ssml - let audioData: Buffer = Buffer.from([]) - // split ssml into chunks of 2000 characters to stream faster - // both within limit & without breaking on words and bookmarks - const ssmlChunks = input.text.match(/.{1,2000}(?= |$)(?! mark=)/g) - if (ssmlChunks) { - for (const ssmlChunk of ssmlChunks) { - const startSsmlChunk = startSsml(ssmlOptions) - const ssml = `${startSsmlChunk}${ssmlChunk}${endSsml()}` - // set the text offset to be the end of SSML start tag - wordOffset -= startSsmlChunk.length - const result = await speakSsmlAsyncPromise(ssml) - if (result.reason === ResultReason.Canceled) { - throw new Error(result.errorDetails) - } - timeOffset = timeOffset + result.audioDuration - wordOffset = wordOffset + ssmlChunk.length - audioData = Buffer.concat([audioData, Buffer.from(result.audioData)]) - } + const startSsmlTag = startSsml(ssmlOptions) + const ssml = `${startSsmlTag}${input.text}${endSsml()}` + // set the text offset to be the end of SSML start tag + wordOffset -= startSsmlTag.length + const result = await speakSsmlAsyncPromise(ssml) + if (result.reason === ResultReason.Canceled) { + throw new Error(result.errorDetails) } return { - audioData, + audioData: Buffer.from(result.audioData), speechMarks, } } catch (error) {