Split utterance into chunks of 256 chars

2022-10-03 17:23:13 +08:00
parent 0aa17eb6dc
commit ae4c01f2d3
2 changed files with 57 additions and 48 deletions
--- a/packages/text-to-speech/src/htmlToSsml.ts
+++ b/packages/text-to-speech/src/htmlToSsml.ts
@@ -255,7 +255,7 @@ export const stripEmojis = (text: string): string => {
  return text.replace(emojiRegex, '').replace(/\s+/g, ' ')
 }

-const textToUtterance = ({
+const textToUtterances = ({
  tokenizer,
  idx,
  textItems,
@@ -269,32 +269,51 @@ const textToUtterance = ({
  wordOffset: number
  voice?: string
  isHtml?: boolean
-}): Utterance => {
-  const text = textItems.join('')
-  let textWithWordOffset = text
-  if (isHtml) {
-    try {
-      textWithWordOffset = htmlToText(text, { wordwrap: false })
-    } catch (err) {
-      console.error(
-        'Unable to convert HTML to text, html:',
+}): Utterance[] => {
+  let text = textItems.join('')
+  if (!isHtml) {
+    // for title
+    const wordCount = tokenizer.tokenize(text).length
+    return [
+      {
+        idx,
        text,
-        ', error:',
-        err
-      )
-      textWithWordOffset =
-        parseHTML(text).document.documentElement.textContent ?? text
-      console.info('Converted HTML to text:', textWithWordOffset)
+        wordOffset,
+        wordCount,
+        voice,
+      },
+    ]
+  }
+
+  const utterances: Utterance[] = []
+  try {
+    text = htmlToText(text, { wordwrap: false })
+  } catch (err) {
+    console.error(
+      'Unable to convert HTML to text, html:',
+      text,
+      ', error:',
+      err
+    )
+    text = parseHTML(text).document.documentElement.textContent ?? text
+    console.info('Converted HTML to text:', text)
+  }
+  // split text into chunks of at most 256 characters, ending each chunk only at a space or the end of the text (never mid-word), so audio can be streamed faster
+  const textChunks = text.match(/.{1,256}(?= |$)/g)
+  if (textChunks) {
+    for (const chunk of textChunks) {
+      const wordCount = tokenizer.tokenize(chunk).length
+      utterances.push({
+        idx,
+        text: chunk,
+        wordOffset,
+        wordCount,
+        voice,
+      })
+      wordOffset += wordCount
    }
  }
-  const wordCount = tokenizer.tokenize(textWithWordOffset).length
-  return {
-    idx,
-    text,
-    wordOffset,
-    wordCount,
-    voice,
-  }
+  return utterances
 }

 export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
@@ -331,13 +350,13 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
  let wordOffset = 0
  if (title) {
    // first utterance is the title
-    const titleUtterance = textToUtterance({
+    const titleUtterance = textToUtterances({
      tokenizer,
      idx: '',
      textItems: [cleanText(title)], // title could have HTML entity names like &amp; or emoji
      wordOffset,
      isHtml: false,
-    })
+    })[0]
    utterances.push(titleUtterance)
    wordOffset += titleUtterance.wordCount
  }
@@ -351,7 +370,7 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
      // use paragraph as anchor
      const idx = i.toString()
      i = emitElement(textItems, node, true)
-      const utterance = textToUtterance({
+      const newUtterances = textToUtterances({
        tokenizer,
        idx,
        textItems,
@@ -359,8 +378,9 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
        voice:
          node.nodeName === 'BLOCKQUOTE' ? options.secondaryVoice : undefined,
      })
-      utterance.wordCount > 0 && utterances.push(utterance)
-      wordOffset += utterance.wordCount
+      const wordCount = newUtterances.reduce((acc, u) => acc + u.wordCount, 0)
+      wordCount > 0 && utterances.push(...newUtterances)
+      wordOffset += wordCount
    }
  }

--- a/packages/text-to-speech/src/textToSpeech.ts
+++ b/packages/text-to-speech/src/textToSpeech.ts
@@ -138,28 +138,17 @@ export const synthesizeTextToSpeech = async (
      }
    }
    // for ssml
-    let audioData: Buffer = Buffer.from([])
-    // split ssml into chunks of 2000 characters to stream faster
-    // both within limit & without breaking on words and bookmarks <bookmark mark="1"/>
-    const ssmlChunks = input.text.match(/.{1,2000}(?= |$)(?! mark=)/g)
-    if (ssmlChunks) {
-      for (const ssmlChunk of ssmlChunks) {
-        const startSsmlChunk = startSsml(ssmlOptions)
-        const ssml = `${startSsmlChunk}${ssmlChunk}${endSsml()}`
-        // set the text offset to be the end of SSML start tag
-        wordOffset -= startSsmlChunk.length
-        const result = await speakSsmlAsyncPromise(ssml)
-        if (result.reason === ResultReason.Canceled) {
-          throw new Error(result.errorDetails)
-        }
-        timeOffset = timeOffset + result.audioDuration
-        wordOffset = wordOffset + ssmlChunk.length
-        audioData = Buffer.concat([audioData, Buffer.from(result.audioData)])
-      }
+    const startSsmlTag = startSsml(ssmlOptions)
+    const ssml = `${startSsmlTag}${input.text}${endSsml()}`
+    // set the text offset to be the end of SSML start tag
+    wordOffset -= startSsmlTag.length
+    const result = await speakSsmlAsyncPromise(ssml)
+    if (result.reason === ResultReason.Canceled) {
+      throw new Error(result.errorDetails)
    }

    return {
-      audioData,
+      audioData: Buffer.from(result.audioData),
      speechMarks,
    }
  } catch (error) {