Fix smart quotes not being tokenized correctly

This commit is contained in:
Hongbo Wu
2024-01-16 12:31:15 +08:00
parent 5bd670ded4
commit 842b2304c3
2 changed files with 22 additions and 1 deletions

View File

@ -364,13 +364,19 @@ const textToUtterances = ({
return utterances
}
/**
 * Converts typographic ("smart") quotation marks to their plain ASCII forms.
 *
 * U+2018 and U+2019 become an apostrophe (`'`); U+201C and U+201D become a
 * straight double quote (`"`). Every other character is left untouched.
 *
 * NOTE(review): when the input is raw HTML, a curly double quote inside an
 * attribute value would also be turned into `"` — confirm this cannot break
 * the markup that is parsed afterwards.
 *
 * @param text - the string to normalize
 * @returns a copy of `text` containing only straight quotes in place of the
 *   four smart-quote characters listed above
 */
const replaceSmartQuotes = (text: string): string => {
  // First pass: left/right single quotation marks.
  const singlesNormalized = text.replace(/[\u2018\u2019]/g, "'")
  // Second pass: left/right double quotation marks.
  const fullyNormalized = singlesNormalized.replace(/[\u201C\u201D]/g, '"')
  return fullyNormalized
}
export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
const { title, content, options } = htmlInput
console.log('creating speech file with options:', options)
const language = options.language || DEFAULT_LANGUAGE
const defaultVoice = options.primaryVoice || DEFAULT_VOICE
const dom = parseHTML(content)
// replace smart quotes with regular quotes to avoid issues with tokenization
const dom = parseHTML(replaceSmartQuotes(content))
const body = dom.document.querySelector('#readability-page-1')
if (!body) {
console.log('No HTML body found')

View File

@ -337,4 +337,19 @@ describe('convert HTML to Speech file', () => {
'这是一段中文,我想看看它是怎么分句的。如果买二手房有中介参与,要找相对大的、知名的中介。中介的收费、服务情况要先问清。还要和中介谈好,中介费的付款时间,一般来说是签完合同付一部分,过户后付一部分,省的太早付完钱,中介就不管事了。付完记得要发票。中介如果提供贷款服务,让他玩去。贷款之类的问题,别怕麻烦,自己去找银行。'
)
})
// Checks that curly double quotes in the article HTML come out as straight
// double quotes in the generated utterance text, and that the paragraph is
// split into exactly two utterances.
it('parses the smart quotes correctly', () => {
  // Single paragraph whose last sentence ends with a curly-quoted phrase.
  const html = `
<div class="page" id="readability-page-1" data-omnivore-anchor-idx="1">
<p data-omnivore-anchor-idx="23">Nor was Stalin any kind of naïve, unsuspecting victim of Hitlers <a data-omnivore-anchor-idx="24" href="https://archive.ph/o/AGEPn/https://www.britannica.com/event/Operation-Barbarossa" rel="noopener noreferrer" target="_blank" title="">Barbarossa onslaught</a>, as some historical clichés would have it. McMeekin makes an extended case that Stalin was preparing to attack Nazi Germany when Hitler attacked him, that the two dictators were basically in a race to see who could mobilize to betray the other first — and that the initial Soviet debacle in 1941 happened in part because Stalin was also pushing his military toward an offensive alignment, and they were caught in a “mid-mobilization limbo.”</p>
</div>
`
  // The second utterance must carry straight quotes around the final phrase.
  const expectedSecondUtterance =
    'McMeekin makes an extended case that Stalin was preparing to attack Nazi Germany when Hitler attacked him, that the two dictators were basically in a race to see who could mobilize to betray the other first — and that the initial Soviet debacle in 1941 happened in part because Stalin was also pushing his military toward an offensive alignment, and they were caught in a "mid-mobilization limbo."'

  const { utterances } = htmlToSpeechFile({
    content: html,
    options: TEST_OPTIONS,
  })

  expect(utterances).to.have.lengthOf(2)
  expect(utterances[1].text).to.eql(expectedSecondUtterance)
})
})