From 842b2304c34f8d64b7628e6399d774af64b2c2ba Mon Sep 17 00:00:00 2001
From: Hongbo Wu
Date: Tue, 16 Jan 2024 12:31:15 +0800
Subject: [PATCH] fix smart quotes not tokenized correctly

---
 packages/text-to-speech/src/htmlToSsml.ts       |  8 +++++++-
 packages/text-to-speech/test/htmlToSsml.test.ts | 15 +++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/packages/text-to-speech/src/htmlToSsml.ts b/packages/text-to-speech/src/htmlToSsml.ts
index 125bd333d..beebd4a71 100644
--- a/packages/text-to-speech/src/htmlToSsml.ts
+++ b/packages/text-to-speech/src/htmlToSsml.ts
@@ -364,13 +364,19 @@ const textToUtterances = ({
   return utterances
 }
 
+const replaceSmartQuotes = (text: string): string => {
+  // replace smart quotes with regular quotes
+  return text.replace(/[\u2018\u2019]/g, "'").replace(/[\u201C\u201D]/g, '"')
+}
+
 export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
   const { title, content, options } = htmlInput
   console.log('creating speech file with options:', options)
   const language = options.language || DEFAULT_LANGUAGE
   const defaultVoice = options.primaryVoice || DEFAULT_VOICE
 
-  const dom = parseHTML(content)
+  // replace smart quotes with regular quotes to avoid issues with tokenization
+  const dom = parseHTML(replaceSmartQuotes(content))
   const body = dom.document.querySelector('#readability-page-1')
   if (!body) {
     console.log('No HTML body found')
diff --git a/packages/text-to-speech/test/htmlToSsml.test.ts b/packages/text-to-speech/test/htmlToSsml.test.ts
index 61f1da66d..5fbd216e7 100644
--- a/packages/text-to-speech/test/htmlToSsml.test.ts
+++ b/packages/text-to-speech/test/htmlToSsml.test.ts
@@ -337,4 +337,19 @@ describe('convert HTML to Speech file', () => {
       '这是一段中文,我想看看它是怎么分句的。如果买二手房有中介参与,要找相对大的、知名的中介。中介的收费、服务情况要先问清。还要和中介谈好,中介费的付款时间,一般来说是签完合同付一部分,过户后付一部分,省的太早付完钱,中介就不管事了。付完记得要发票。中介如果提供贷款服务,让他玩去。贷款之类的问题,别怕麻烦,自己去找银行。'
     )
   })
+
+  it('parses the smart quotes correctly', () => {
+    const html = `
+      <div id="readability-page-1">
+        <p>Nor was Stalin any kind of naïve, unsuspecting victim of Hitler’s Barbarossa onslaught, as some historical clichés would have it. McMeekin makes an extended case that Stalin was preparing to attack Nazi Germany when Hitler attacked him, that the two dictators were basically in a race to see who could mobilize to betray the other first — and that the initial Soviet debacle in 1941 happened in part because Stalin was also pushing his military toward an offensive alignment, and they were caught in a “mid-mobilization limbo.”</p>
+      </div>
+    `
+
+    const speechFile = htmlToSpeechFile({
+      content: html,
+      options: TEST_OPTIONS,
+    })
+    expect(speechFile.utterances).to.have.lengthOf(2)
+    expect(speechFile.utterances[1].text).to.eql('McMeekin makes an extended case that Stalin was preparing to attack Nazi Germany when Hitler attacked him, that the two dictators were basically in a race to see who could mobilize to betray the other first — and that the initial Soviet debacle in 1941 happened in part because Stalin was also pushing his military toward an offensive alignment, and they were caught in a "mid-mobilization limbo."')
+  })
 })