Files
omnivore/packages/text-to-speech/test/htmlToSsml.test.ts
2022-09-27 09:52:51 +08:00

241 lines
8.0 KiB
TypeScript

import 'mocha'
import { expect } from 'chai'
import {
htmlToSpeechFile,
htmlToSsmlItems,
stripEmojis,
} from '../src/htmlToSsml'
import * as fs from 'fs'
import path from 'path'
const TEST_OPTIONS = {
primaryVoice: 'test-primary',
secondaryVoice: 'test-secondary',
language: 'en-US',
rate: '1.0',
}
describe('stripEmojis', () => {
it('strips emojis from text and removes the extra space', () => {
const text = '🥛The Big Short guy is back with a new prediction'
expect(stripEmojis(text)).to.equal(
'The Big Short guy is back with a new prediction'
)
})
it('strips emojis from html and removes the extra space', () => {
const text = `<h2 data-omnivore-anchor-idx="37">🧠Brain food</h2>`
expect(stripEmojis(text)).to.equal(
`<h2 data-omnivore-anchor-idx="37">Brain food</h2>`
)
})
})
describe('htmlToSpeechFile', () => {
describe('a simple html file', () => {
xit('should convert Html to SSML', () => {
const ssml = htmlToSsmlItems(
`
<div id="readability-content">
<div id="readability-page-1">
<p>this is some text</p>
</div>
</div>
`,
TEST_OPTIONS
)
const text = ssml[0].textItems.join('').trim()
expect(text).to.equal(`<p><bookmark mark="3" />this is some text</p>`)
})
})
describe('a file with nested elements', () => {
xit('should collapse spans into the parent paragraph', () => {
const ssml = htmlToSsmlItems(
`
<div id="readability-content">
<div class="page" id="readability-page-1">
<p>
this is in the first paragraph
<span>this is in the second span</span>
this is also in the first paragraph
</p>
<p>
this is in the first paragraph
<span>this is in the second span</span>
this is also in the first paragraph
</p>
</div>
</div>
`,
TEST_OPTIONS
)
const text = ssml[0].textItems.join('').trim()
expect(text).to.equal(
`<p><bookmark mark="3" />this is in the first paragraph<bookmark mark="4" />this is in the second span<bookmark mark="3" />this is also in the first paragraph</p>`.trim()
)
const text1 = ssml[1].textItems.join('').trim()
expect(text1).to.equal(
`<p><bookmark mark="5" />this is in the first paragraph<bookmark mark="6" />this is in the second span<bookmark mark="5" />this is also in the first paragraph</p>`.trim()
)
})
xit('should extract child paragraphs to the top level', () => {
const ssml = htmlToSsmlItems(
`
<div id="readability-content">
<div>
<span>
this is in the first paragraph
<p>this is in the second paragraph</p>
this is also in the first paragraph
</span>
</div>
</div>
`,
TEST_OPTIONS
)
const text = ssml[0].textItems.join('').trim()
expect(text).to.equal(
`<p><bookmark mark="3" />this is in the first paragraph<bookmark mark="4" />this is in the second paragraph<bookmark mark="3" />this is also in the first paragraph</p>`.trim()
)
})
xit('should hoist paragraphs in spans to the top level', () => {
const ssml = htmlToSsmlItems(
`
<div id="readability-content">
<div>
<p>
this is in the first paragraph
<span>this is in the second paragraph</span>
this is also in the first paragraph
</p>
</div>
</div>
`,
TEST_OPTIONS
)
const text = ssml[0].textItems.join('').trim()
expect(text).to.equal(`TBD`.trim())
})
xit('should hoist lists to the top level', () => {
const ssml = htmlToSsmlItems(
`
<div id="readability-content">
<div>
<p>
this is in the first paragraph
<ul><li>this is the first item in a list</li></ul>
this is also in the first paragraph
</p>
</div>
</div>
`,
TEST_OPTIONS
)
const text = ssml[0].textItems.join('').trim()
expect(text).to.equal(`TBD`.trim())
})
xit('should hoist headers to the top level', () => {
const ssml = htmlToSsmlItems(
`
<div id="readability-content">
<div>
<p>
this is in the first paragraph
<h1>this is a header</h1>
this is also in the first paragraph
</p>
</div>
</div>
`,
TEST_OPTIONS
)
const text = ssml[0].textItems.join('').trim()
expect(text).to.equal(`TBD`.trim())
})
xit('should hoist blockquotes to the top level', () => {
const ssml = htmlToSsmlItems(
`
<div id="readability-content">
<div>
<p>
this is in the first paragraph
<blockquote>this is a blockquote</blockquote>
this is also in the first paragraph
</p>
</div>
</div>
`,
TEST_OPTIONS
)
const text = ssml[0].textItems.join('').trim()
expect(text).to.equal(`TBD`.trim())
})
})
describe('a file with blockquotes', () => {
xit('should convert Html to SSML with complimentary voices', () => {
const ssml = htmlToSsmlItems(
`
<div id="readability-content">
<div class="page" id="readability-page-1">
<p>first</p>
<blockquote>second</blockquote>
<p>third</p>
</div>
</div>
`,
TEST_OPTIONS
)
const first = ssml[0].textItems.join('').trim()
const second = ssml[1].textItems.join('').trim()
const third = ssml[2].textItems.join('').trim()
expect(first).to.equal(`<p><bookmark mark="1" />first</p>`)
expect(second).to.equal(`<p><bookmark mark="2" />second</p>`)
expect(third).to.equal(`<p><bookmark mark="3" />third</p>`)
expect(ssml[0].open.trim()).to.equal(
`<speak xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="http://www.w3.org/2001/mstts" xmlns:emo="http://www.w3.org/2009/10/emotionml" version="1.0" xml:lang="en-US"><voice name="test-primary"><prosody rate="1" pitch="default">`
)
expect(ssml[1].open.trim()).to.equal(
`<speak xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="http://www.w3.org/2001/mstts" xmlns:emo="http://www.w3.org/2009/10/emotionml" version="1.0" xml:lang="en-US"><voice name="test-secondary"><prosody rate="1" pitch="default">`
)
expect(ssml[2].open.trim()).to.equal(
`<speak xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="http://www.w3.org/2001/mstts" xmlns:emo="http://www.w3.org/2009/10/emotionml" version="1.0" xml:lang="en-US"><voice name="test-primary"><prosody rate="1" pitch="default">`
)
})
})
// For local testing:
// describe('readability test files', () => {
// it('should convert Html to SSML without throwing', async () => {
// const g = new glob.GlobSync('../readabilityjs/test/test-pages/*')
// console.log('glob: ', glob)
// for (const f of g.found) {
// const readablePath = `${f}/expected.html`
// if (!fs.existsSync(readablePath)) {
// continue
// }
// const html = fs.readFileSync(readablePath, { encoding: 'utf-8' })
// const ssmlItems = htmlToSsmlItems(html, TEST_OPTIONS)
// console.log('SSML ITEMS', ssmlItems)
// }
// })
// })
})
describe('convert HTML to Speech file', () => {
it('converts each <li> to an utterance', () => {
const html = fs.readFileSync(
path.resolve(__dirname, './fixtures/large.html'),
{ encoding: 'utf-8' }
)
const speechFile = htmlToSpeechFile({
content: html,
title: 'Wang Yi at the UN; Fu Zhenghua sentenced; Nvidia China sales',
options: TEST_OPTIONS,
})
expect(speechFile.utterances).to.have.lengthOf(12)
})
})