diff --git a/packages/text-to-speech/src/htmlToSsml.ts b/packages/text-to-speech/src/htmlToSsml.ts index 9d2894821..2aa9231a6 100644 --- a/packages/text-to-speech/src/htmlToSsml.ts +++ b/packages/text-to-speech/src/htmlToSsml.ts @@ -253,8 +253,9 @@ const htmlToUtterance = ( wordOffset: number, voice?: string ): Utterance => { - const text = htmlToText(htmlItems.join(''), { wordwrap: false }) - const wordCount = tokenizer.tokenize(text).length + const text = htmlItems.join('') + const plainText = htmlToText(text, { wordwrap: false }) + const wordCount = tokenizer.tokenize(plainText).length return { idx, text, @@ -289,11 +290,10 @@ export const htmlToSpeechFile = ( const node = parsedNodes[i - 2] if (TOP_LEVEL_TAGS.includes(node.nodeName) || hasSignificantText(node)) { - const idx = i.toString() i = emitElement(textItems, node, true) const utterance = htmlToUtterance( tokenizer, - idx, + i.toString(), textItems, wordOffset, node.nodeName === 'BLOCKQUOTE' ? options.secondaryVoice : undefined diff --git a/packages/text-to-speech/src/textToSpeech.ts b/packages/text-to-speech/src/textToSpeech.ts index 449668548..4169e5401 100644 --- a/packages/text-to-speech/src/textToSpeech.ts +++ b/packages/text-to-speech/src/textToSpeech.ts @@ -8,7 +8,6 @@ import { SpeechSynthesizer, } from 'microsoft-cognitiveservices-speech-sdk' import { endSsml, htmlToSsmlItems, ssmlItemText, startSsml } from './htmlToSsml' -import * as _ from 'underscore' export interface TextToSpeechInput { text: string @@ -151,7 +150,7 @@ export const synthesizeTextToSpeech = async ( // for utterance const start = startSsml(ssmlOptions) wordOffset = -start.length - const ssml = `${start}${_.escape(input.text)}${endSsml()}` + const ssml = `${start}${input.text}${endSsml()}` const result = await speakSsmlAsyncPromise(ssml) return { audioData: Buffer.from(result.audioData),