Merge pull request #1414 from omnivore-app/fix/tts-list-item

Fix sentences in li elements not being tokenized correctly by adding a newline at the end of each li element's text
This commit is contained in:
Hongbo Wu
2022-11-15 15:08:19 +08:00
committed by GitHub
3 changed files with 28 additions and 4 deletions

View File

@ -178,6 +178,10 @@ function emitElement(
}
if (child.nodeType == 1 /* Node.ELEMENT_NODE */) {
maxVisitedIdx = emitElement(textItems, child as HTMLElement, false)
if (child.nodeName === 'LI') {
// add a new line after each list item
emit(textItems, '\n')
}
}
}

View File

@ -0,0 +1 @@
<DIV id="readability-content"><DIV class="page" id="readability-page-1"><blockquote data-omnivore-anchor-idx="81"><ul data-omnivore-anchor-idx="82"><li data-omnivore-anchor-idx="83"><p data-omnivore-anchor-idx="84"><em data-omnivore-anchor-idx="85">Just for curiosity, how do you pick the articles for Slow Chinese?</em></p></li><li data-omnivore-anchor-idx="86"><p data-omnivore-anchor-idx="87"><em data-omnivore-anchor-idx="88">Any advice on finding opportunities to communicate in Chinese?</em></p></li><li data-omnivore-anchor-idx="89"><p data-omnivore-anchor-idx="90"><em data-omnivore-anchor-idx="91">What are your tips to improve comprehension?&nbsp; I feel like Im working on reading, listening, and speaking all at once, sometimes I feel like Im just getting surface understanding.</em></p></li><li data-omnivore-anchor-idx="92"><p data-omnivore-anchor-idx="93"><em data-omnivore-anchor-idx="94">I often feel I use the same words/phrases over and over and my short-term memory is weak, I live in a non-Chinese environment although I have many opportunities to practice Chinese and have no plans to travel to China.</em></p></li><li data-omnivore-anchor-idx="95"><p data-omnivore-anchor-idx="96"><em data-omnivore-anchor-idx="97">I'm American-born Chinese, so I grew up with Chinese speaking parents, but I used English and home and in daily life. My listening is strong, everything else is weak. I'm in China currently&nbsp;studying in a Master's program. I'm probably about HSK5-6&nbsp;in my vocabulary and reading comprehension. If you have any tips for picking up reading/speaking, I'd love to know.</em></p></li></ul></blockquote></DIV></DIV>

View File

@ -15,6 +15,10 @@ const TEST_OPTIONS = {
rate: '1.0',
}
/**
 * Reads a file and returns its contents decoded as UTF-8 text.
 *
 * @param filename - Path of the file; it is joined onto this module's
 *   directory (`__dirname`), so relative paths such as `./fixtures/x.html` work.
 * @returns The full text of the file.
 */
const load = (filename: string): string => {
  const absolutePath = path.join(__dirname, filename)
  return fs.readFileSync(absolutePath, { encoding: 'utf8' })
}
describe('stripEmojis', () => {
it('strips emojis from text and removes the extra space', () => {
const text = '🥛The Big Short guy is back with a new prediction'
@ -226,10 +230,8 @@ describe('htmlToSpeechFile', () => {
describe('convert HTML to Speech file', () => {
it('converts each <li> to an utterance', () => {
const html = fs.readFileSync(
path.resolve(__dirname, './fixtures/li.html'),
{ encoding: 'utf-8' }
)
const html = load('./fixtures/li.html')
const speechFile = htmlToSpeechFile({
content: html,
title: 'Wang Yi at the UN; Fu Zhenghua sentenced; Nvidia China sales',
@ -290,4 +292,21 @@ describe('convert HTML to Speech file', () => {
'If terms of the original $12.5 billion financing package remain the same, bankers may struggle to sell the risky Twitter buyout debt just as credit markets begin to crack, with yields at multiyear highs, theyre potentially on the hook for hundreds of millions of dollars of losses on the unsecured portion alone should they try to unload it to investors.'
)
})
it('splits sentences correctly in a blockquote element', () => {
  // The fixture is a blockquote wrapping a list of five <li> items.
  const { utterances } = htmlToSpeechFile({
    content: load('./fixtures/blockquote.html'),
    options: TEST_OPTIONS,
  })

  // Expected text of the leading utterances, in order; only the first two
  // of the five utterances are pinned by this test.
  const expectedLeadingTexts = [
    'Just for curiosity, how do you pick the articles for Slow Chinese? Any advice on finding opportunities to communicate in Chinese? What are your tips to improve comprehension? ',
    'I feel like Im working on reading, listening, and speaking all at once, sometimes I feel like Im just getting surface understanding. ',
  ]

  expect(utterances).to.have.lengthOf(5)
  expectedLeadingTexts.forEach((expectedText, position) => {
    expect(utterances[position].text).to.eql(expectedText)
  })
})
})