diff --git a/packages/text-to-speech/src/htmlToSsml.ts b/packages/text-to-speech/src/htmlToSsml.ts
index 5e0741c8c..a64a370bf 100644
--- a/packages/text-to-speech/src/htmlToSsml.ts
+++ b/packages/text-to-speech/src/htmlToSsml.ts
@@ -255,7 +255,7 @@ export const stripEmojis = (text: string): string => {
return text.replace(emojiRegex, '').replace(/\s+/g, ' ')
}
-const textToUtterance = ({
+const textToUtterances = ({
tokenizer,
idx,
textItems,
@@ -269,32 +269,51 @@ const textToUtterance = ({
wordOffset: number
voice?: string
isHtml?: boolean
-}): Utterance => {
- const text = textItems.join('')
- let textWithWordOffset = text
- if (isHtml) {
- try {
- textWithWordOffset = htmlToText(text, { wordwrap: false })
- } catch (err) {
- console.error(
- 'Unable to convert HTML to text, html:',
+}): Utterance[] => {
+ let text = textItems.join('')
+ if (!isHtml) {
+ // plain text (e.g. the title) needs no HTML conversion: emit it as a single utterance
+ const wordCount = tokenizer.tokenize(text).length
+ return [
+ {
+ idx,
text,
- ', error:',
- err
- )
- textWithWordOffset =
- parseHTML(text).document.documentElement.textContent ?? text
- console.info('Converted HTML to text:', textWithWordOffset)
+ wordOffset,
+ wordCount,
+ voice,
+ },
+ ]
+ }
+
+ const utterances: Utterance[] = []
+ try {
+ text = htmlToText(text, { wordwrap: false })
+ } catch (err) {
+ console.error(
+ 'Unable to convert HTML to text, html:',
+ text,
+ ', error:',
+ err
+ )
+ text = parseHTML(text).document.documentElement.textContent ?? text
+ console.info('Converted HTML to text:', text)
+ }
+ // split text into chunks of at most 256 characters, breaking only at a space or the end of the text, to stream faster
+ const textChunks = text.match(/.{1,256}(?= |$)/g)
+ if (textChunks) {
+ for (const chunk of textChunks) {
+ const wordCount = tokenizer.tokenize(chunk).length
+ utterances.push({
+ idx,
+ text: chunk,
+ wordOffset,
+ wordCount,
+ voice,
+ })
+ wordOffset += wordCount
}
}
- const wordCount = tokenizer.tokenize(textWithWordOffset).length
- return {
- idx,
- text,
- wordOffset,
- wordCount,
- voice,
- }
+ return utterances
}
export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
@@ -331,13 +350,13 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
let wordOffset = 0
if (title) {
// first utterances is the title
- const titleUtterance = textToUtterance({
+ const titleUtterance = textToUtterances({
tokenizer,
idx: '',
textItems: [cleanText(title)], // title could have HTML entity names like & or emoji
wordOffset,
isHtml: false,
- })
+ })[0]
utterances.push(titleUtterance)
wordOffset += titleUtterance.wordCount
}
@@ -351,7 +370,7 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
// use paragraph as anchor
const idx = i.toString()
i = emitElement(textItems, node, true)
- const utterance = textToUtterance({
+ const newUtterances = textToUtterances({
tokenizer,
idx,
textItems,
@@ -359,8 +378,9 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
voice:
node.nodeName === 'BLOCKQUOTE' ? options.secondaryVoice : undefined,
})
- utterance.wordCount > 0 && utterances.push(utterance)
- wordOffset += utterance.wordCount
+ const wordCount = newUtterances.reduce((acc, u) => acc + u.wordCount, 0)
+ wordCount > 0 && utterances.push(...newUtterances)
+ wordOffset += wordCount
}
}
diff --git a/packages/text-to-speech/src/textToSpeech.ts b/packages/text-to-speech/src/textToSpeech.ts
index aa5f39713..c7c35fed3 100644
--- a/packages/text-to-speech/src/textToSpeech.ts
+++ b/packages/text-to-speech/src/textToSpeech.ts
@@ -138,28 +138,17 @@ export const synthesizeTextToSpeech = async (
}
}
// for ssml
- let audioData: Buffer = Buffer.from([])
- // split ssml into chunks of 2000 characters to stream faster
- // both within limit & without breaking on words and bookmarks
- const ssmlChunks = input.text.match(/.{1,2000}(?= |$)(?! mark=)/g)
- if (ssmlChunks) {
- for (const ssmlChunk of ssmlChunks) {
- const startSsmlChunk = startSsml(ssmlOptions)
- const ssml = `${startSsmlChunk}${ssmlChunk}${endSsml()}`
- // set the text offset to be the end of SSML start tag
- wordOffset -= startSsmlChunk.length
- const result = await speakSsmlAsyncPromise(ssml)
- if (result.reason === ResultReason.Canceled) {
- throw new Error(result.errorDetails)
- }
- timeOffset = timeOffset + result.audioDuration
- wordOffset = wordOffset + ssmlChunk.length
- audioData = Buffer.concat([audioData, Buffer.from(result.audioData)])
- }
+ const startSsmlTag = startSsml(ssmlOptions)
+ const ssml = `${startSsmlTag}${input.text}${endSsml()}`
+ // set the text offset to be the end of SSML start tag
+ wordOffset -= startSsmlTag.length
+ const result = await speakSsmlAsyncPromise(ssml)
+ if (result.reason === ResultReason.Canceled) {
+ throw new Error(result.errorDetails)
}
return {
- audioData,
+ audioData: Buffer.from(result.audioData),
speechMarks,
}
} catch (error) {