When text exceeds 256 chars, split at the first sentence ending ('.', '!' or '?') found within the last 80 chars of the 256-char window

2022-10-04 17:16:26 +08:00
parent 39dcab5076
commit 690ce05b0e
1 changed files with 32 additions and 13 deletions
--- a/packages/text-to-speech/src/htmlToSsml.ts
+++ b/packages/text-to-speech/src/htmlToSsml.ts
@ -298,21 +298,40 @@ const textToUtterances = ({
    text = parseHTML(text).document.documentElement.textContent ?? text
    console.info('Converted HTML to text:', text)
  }
-  // split text into chunks of 256 characters to stream faster without breaking on words
-  const textChunks = text.match(/.{1,256}(?= |$)/g)
-  if (textChunks) {
-    for (const chunk of textChunks) {
-      const wordCount = tokenizer.tokenize(chunk).length
-      utterances.push({
-        idx,
-        text: chunk,
-        wordOffset,
-        wordCount,
-        voice,
-      })
-      wordOffset += wordCount
+  // if we hit 256, look back for first ending sentence within 80 chars
+  const MAX_CHARS = 256
+  const MAX_LOOKBACK = 80
+  while (text.length > MAX_CHARS) {
+    let lookback = MAX_LOOKBACK
+    let end = MAX_CHARS - lookback
+    while (lookback > 0) {
+      if (text[end] === '.' || text[end] === '!' || text[end] === '?') {
+        break
+      }
+      end++
+      lookback--
    }
+    const utterance = text.substring(0, end + 1)
+    const wordCount = tokenizer.tokenize(utterance).length
+    utterances.push({
+      idx,
+      text: utterance,
+      wordOffset,
+      wordCount,
+      voice,
+    })
+    text = text.substring(end + 1)
+    wordOffset += wordCount
  }
+
+  const wordCount = tokenizer.tokenize(text).length
+  utterances.push({
+    idx,
+    text,
+    wordOffset,
+    wordCount,
+    voice,
+  })
  return utterances
 }