diff --git a/packages/text-to-speech/src/htmlToSsml.ts b/packages/text-to-speech/src/htmlToSsml.ts index 125bd333d..beebd4a71 100644 --- a/packages/text-to-speech/src/htmlToSsml.ts +++ b/packages/text-to-speech/src/htmlToSsml.ts @@ -364,13 +364,19 @@ const textToUtterances = ({ return utterances } +const replaceSmartQuotes = (text: string): string => { + // replace curly single quotes (U+2018, U+2019) with an ASCII apostrophe and curly double quotes (U+201C, U+201D) with an ASCII double quote; other quote characters such as the low-9 quote U+201E are left untouched + return text.replace(/[\u2018\u2019]/g, "'").replace(/[\u201C\u201D]/g, '"') +} + export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => { const { title, content, options } = htmlInput console.log('creating speech file with options:', options) const language = options.language || DEFAULT_LANGUAGE const defaultVoice = options.primaryVoice || DEFAULT_VOICE - const dom = parseHTML(content) + // replace smart quotes with regular quotes to avoid issues with tokenization; NOTE(review): the replacement runs on the raw HTML string before parsing, so curly quotes inside attribute values are rewritten as well - confirm this is intended + const dom = parseHTML(replaceSmartQuotes(content)) const body = dom.document.querySelector('#readability-page-1') if (!body) { console.log('No HTML body found') diff --git a/packages/text-to-speech/test/htmlToSsml.test.ts b/packages/text-to-speech/test/htmlToSsml.test.ts index 61f1da66d..88c23d386 100644 --- a/packages/text-to-speech/test/htmlToSsml.test.ts +++ b/packages/text-to-speech/test/htmlToSsml.test.ts @@ -237,7 +237,7 @@ describe('convert HTML to Speech file', () => { title: 'Wang Yi at the UN; Fu Zhenghua sentenced; Nvidia China sales', options: TEST_OPTIONS, }) - expect(speechFile.utterances).to.have.lengthOf(20) + expect(speechFile.utterances).to.have.lengthOf(21) }) it('converts long utterances to multiple utterances', () => { @@ -289,7 +289,7 @@ describe('convert HTML to Speech file', () => { options: TEST_OPTIONS, }) expect(speechFile.utterances[1].text).to.eql( - 'If terms of the original $12.5 billion financing package remain the same, bankers may struggle to sell the risky Twitter buyout debt just as credit markets begin to crack, with yields at multiyear highs, they’re 
potentially on the hook for hundreds of millions of dollars of losses on the unsecured portion alone should they try to unload it to investors.' + "If terms of the original $12.5 billion financing package remain the same, bankers may struggle to sell the risky Twitter buyout debt just as credit markets begin to crack, with yields at multiyear highs, they're potentially on the hook for hundreds of millions of dollars of losses on the unsecured portion alone should they try to unload it to investors." ) }) @@ -306,7 +306,7 @@ describe('convert HTML to Speech file', () => { 'Just for curiosity, how do you pick the articles for Slow Chinese? Any advice on finding opportunities to communicate in Chinese? What are your tips to improve comprehension? ' ) expect(speechFile.utterances[1].text).to.eql( - 'I feel like I’m working on reading, listening, and speaking all at once, sometimes I feel like I’m just getting surface understanding. ' + `I feel like I'm working on reading, listening, and speaking all at once, sometimes I feel like I'm just getting surface understanding. ` ) }) @@ -320,7 +320,7 @@ describe('convert HTML to Speech file', () => { }) expect(speechFile.utterances).to.have.lengthOf(1) expect(speechFile.utterances[0].text).to.eql( - 'Q: „Die kürzliche Razzia in den BBC-Büros in Delhi sind ein weiterer Versuch der Regierung, kritische Medien-Kommentare zu unterdrücken. Man hat des Gefühl, Herr Modi hat Angst, in den Spiegel zu schauen!? “' + 'Q: „Die kürzliche Razzia in den BBC-Büros in Delhi sind ein weiterer Versuch der Regierung, kritische Medien-Kommentare zu unterdrücken. Man hat des Gefühl, Herr Modi hat Angst, in den Spiegel zu schauen!?"' ) }) @@ -337,4 +337,19 @@ describe('convert HTML to Speech file', () => { '这是一段中文,我想看看它是怎么分句的。如果买二手房有中介参与,要找相对大的、知名的中介。中介的收费、服务情况要先问清。还要和中介谈好,中介费的付款时间,一般来说是签完合同付一部分,过户后付一部分,省的太早付完钱,中介就不管事了。付完记得要发票。中介如果提供贷款服务,让他玩去。贷款之类的问题,别怕麻烦,自己去找银行。' ) }) + + it('parses the smart quotes correctly', () => { + const html = ` +
Nor was Stalin any kind of naïve, unsuspecting victim of Hitler’s Barbarossa onslaught, as some historical clichés would have it. McMeekin makes an extended case that Stalin was preparing to attack Nazi Germany when Hitler attacked him, that the two dictators were basically in a race to see who could mobilize to betray the other first — and that the initial Soviet debacle in 1941 happened in part because Stalin was also pushing his military toward an offensive alignment, and they were caught in a “mid-mobilization limbo.”
+