From ae4c01f2d3abd7c08e36f72dddd6e0d2b4876ba0 Mon Sep 17 00:00:00 2001
From: Hongbo Wu <hongbo@omnivore.app>
Date: Mon, 3 Oct 2022 17:23:13 +0800
Subject: [PATCH] Split utterance into chunks of 256 chars

---
 packages/text-to-speech/src/htmlToSsml.ts   | 78 +++++++++++++--------
 packages/text-to-speech/src/textToSpeech.ts | 27 +++----
 2 files changed, 57 insertions(+), 48 deletions(-)

diff --git a/packages/text-to-speech/src/htmlToSsml.ts b/packages/text-to-speech/src/htmlToSsml.ts
index 5e0741c8c..a64a370bf 100644
--- a/packages/text-to-speech/src/htmlToSsml.ts
+++ b/packages/text-to-speech/src/htmlToSsml.ts
@@ -255,7 +255,7 @@ export const stripEmojis = (text: string): string => {
   return text.replace(emojiRegex, '').replace(/\s+/g, ' ')
 }
 
-const textToUtterance = ({
+const textToUtterances = ({
   tokenizer,
   idx,
   textItems,
@@ -269,32 +269,51 @@ const textToUtterance = ({
   wordOffset: number
   voice?: string
   isHtml?: boolean
-}): Utterance => {
-  const text = textItems.join('')
-  let textWithWordOffset = text
-  if (isHtml) {
-    try {
-      textWithWordOffset = htmlToText(text, { wordwrap: false })
-    } catch (err) {
-      console.error(
-        'Unable to convert HTML to text, html:',
+}): Utterance[] => {
+  let text = textItems.join('')
+  if (!isHtml) {
+    // for title
+    const wordCount = tokenizer.tokenize(text).length
+    return [
+      {
+        idx,
         text,
-        ', error:',
-        err
-      )
-      textWithWordOffset =
-        parseHTML(text).document.documentElement.textContent ?? text
-      console.info('Converted HTML to text:', textWithWordOffset)
+        wordOffset,
+        wordCount,
+        voice,
+      },
+    ]
+  }
+
+  const utterances: Utterance[] = []
+  try {
+    text = htmlToText(text, { wordwrap: false })
+  } catch (err) {
+    console.error(
+      'Unable to convert HTML to text, html:',
+      text,
+      ', error:',
+      err
+    )
+    text = parseHTML(text).document.documentElement.textContent ?? text
+    console.info('Converted HTML to text:', text)
+  }
+  // split text into chunks of at most 256 characters, breaking only at spaces (never mid-word), to stream faster
+  const textChunks = text.match(/.{1,256}(?= |$)/g)
+  if (textChunks) {
+    for (const chunk of textChunks) {
+      const wordCount = tokenizer.tokenize(chunk).length
+      utterances.push({
+        idx,
+        text: chunk,
+        wordOffset,
+        wordCount,
+        voice,
+      })
+      wordOffset += wordCount
     }
   }
-  const wordCount = tokenizer.tokenize(textWithWordOffset).length
-  return {
-    idx,
-    text,
-    wordOffset,
-    wordCount,
-    voice,
-  }
+  return utterances
 }
 
 export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
@@ -331,13 +350,13 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
   let wordOffset = 0
   if (title) {
     // first utterances is the title
-    const titleUtterance = textToUtterance({
+    const titleUtterance = textToUtterances({
       tokenizer,
       idx: '',
       textItems: [cleanText(title)], // title could have HTML entity names like & or emoji
       wordOffset,
       isHtml: false,
-    })
+    })[0]
     utterances.push(titleUtterance)
     wordOffset += titleUtterance.wordCount
   }
@@ -351,7 +370,7 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
       // use paragraph as anchor
       const idx = i.toString()
       i = emitElement(textItems, node, true)
-      const utterance = textToUtterance({
+      const newUtterances = textToUtterances({
         tokenizer,
         idx,
         textItems,
@@ -359,8 +378,9 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
         voice:
           node.nodeName === 'BLOCKQUOTE' ? options.secondaryVoice : undefined,
       })
-      utterance.wordCount > 0 && utterances.push(utterance)
-      wordOffset += utterance.wordCount
+      const wordCount = newUtterances.reduce((acc, u) => acc + u.wordCount, 0)
+      wordCount > 0 && utterances.push(...newUtterances)
+      wordOffset += wordCount
     }
   }
 
diff --git a/packages/text-to-speech/src/textToSpeech.ts b/packages/text-to-speech/src/textToSpeech.ts
index aa5f39713..c7c35fed3 100644
--- a/packages/text-to-speech/src/textToSpeech.ts
+++ b/packages/text-to-speech/src/textToSpeech.ts
@@ -138,28 +138,17 @@ export const synthesizeTextToSpeech = async (
       }
     }
     // for ssml
-    let audioData: Buffer = Buffer.from([])
-    // split ssml into chunks of 2000 characters to stream faster
-    // both within limit & without breaking on words and bookmarks <bookmark mark="1"/>
-    const ssmlChunks = input.text.match(/.{1,2000}(?= |$)(?! mark=)/g)
-    if (ssmlChunks) {
-      for (const ssmlChunk of ssmlChunks) {
-        const startSsmlChunk = startSsml(ssmlOptions)
-        const ssml = `${startSsmlChunk}${ssmlChunk}${endSsml()}`
-        // set the text offset to be the end of SSML start tag
-        wordOffset -= startSsmlChunk.length
-        const result = await speakSsmlAsyncPromise(ssml)
-        if (result.reason === ResultReason.Canceled) {
-          throw new Error(result.errorDetails)
-        }
-        timeOffset = timeOffset + result.audioDuration
-        wordOffset = wordOffset + ssmlChunk.length
-        audioData = Buffer.concat([audioData, Buffer.from(result.audioData)])
-      }
+    const startSsmlTag = startSsml(ssmlOptions)
+    const ssml = `${startSsmlTag}${input.text}${endSsml()}`
+    // set the text offset to be the end of SSML start tag
+    wordOffset -= startSsmlTag.length
+    const result = await speakSsmlAsyncPromise(ssml)
+    if (result.reason === ResultReason.Canceled) {
+      throw new Error(result.errorDetails)
     }
 
     return {
-      audioData,
+      audioData: Buffer.from(result.audioData),
       speechMarks,
     }
   } catch (error) {