diff --git a/packages/text-to-speech/src/htmlToSsml.ts b/packages/text-to-speech/src/htmlToSsml.ts
index 9d2894821..2aa9231a6 100644
--- a/packages/text-to-speech/src/htmlToSsml.ts
+++ b/packages/text-to-speech/src/htmlToSsml.ts
@@ -253,8 +253,9 @@ const htmlToUtterance = (
wordOffset: number,
voice?: string
): Utterance => {
- const text = htmlToText(htmlItems.join(''), { wordwrap: false })
- const wordCount = tokenizer.tokenize(text).length
+ const text = htmlItems.join('')
+ const plainText = htmlToText(text, { wordwrap: false })
+ const wordCount = tokenizer.tokenize(plainText).length
return {
idx,
text,
@@ -289,11 +290,10 @@ export const htmlToSpeechFile = (
const node = parsedNodes[i - 2]
if (TOP_LEVEL_TAGS.includes(node.nodeName) || hasSignificantText(node)) {
- const idx = i.toString()
i = emitElement(textItems, node, true)
const utterance = htmlToUtterance(
tokenizer,
- idx,
+ i.toString(),
textItems,
wordOffset,
node.nodeName === 'BLOCKQUOTE' ? options.secondaryVoice : undefined
diff --git a/packages/text-to-speech/src/textToSpeech.ts b/packages/text-to-speech/src/textToSpeech.ts
index 449668548..4169e5401 100644
--- a/packages/text-to-speech/src/textToSpeech.ts
+++ b/packages/text-to-speech/src/textToSpeech.ts
@@ -8,7 +8,6 @@ import {
SpeechSynthesizer,
} from 'microsoft-cognitiveservices-speech-sdk'
import { endSsml, htmlToSsmlItems, ssmlItemText, startSsml } from './htmlToSsml'
-import * as _ from 'underscore'
export interface TextToSpeechInput {
text: string
@@ -151,7 +150,7 @@ export const synthesizeTextToSpeech = async (
// for utterance
const start = startSsml(ssmlOptions)
wordOffset = -start.length
- const ssml = `${start}${_.escape(input.text)}${endSsml()}`
+ const ssml = `${start}${input.text}${endSsml()}`
const result = await speakSsmlAsyncPromise(ssml)
return {
audioData: Buffer.from(result.audioData),