Add a new HTML to SSML function

This commit is contained in:
Jackson Harper
2022-08-29 16:37:30 +08:00
parent 4447fa0ad0
commit e6706a6efb
2 changed files with 253 additions and 0 deletions

View File

@ -0,0 +1,178 @@
import { parseHTML } from 'linkedom'
// Attribute names that exclude an element from anchor indexing. parseDomTree
// skips any element carrying one of these attributes (and therefore never
// descends into its children), so dynamically injected content is not counted
// as an anchor-allowed element.
//
// this code needs to be kept in sync with the
// frontend code in: useReadingProgressAnchor
const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [
'omnivore-highlight-id',
'data-twitter-tweet-id',
'data-instagram-id',
]
/**
 * Returns the SSML tags used to wrap the content of a top-level element.
 *
 * Every element is currently wrapped in a plain paragraph. A variant that
 * wrapped BLOCKQUOTE elements as
 * `<voice name="en-US-GuyNeural"><p>` ... `</p></voice>` is disabled.
 *
 * @param element - the top-level element being emitted (currently not inspected)
 * @returns the opening and closing tag strings
 */
function ssmlTagsForTopLevelElement(element: Element) {
  const opening = `<p>`
  const closing = `</p>`
  return { opening, closing }
}
/**
 * Walks the subtree under `pageNode` in document order (depth first) and tags
 * every visited element with a `data-omnivore-anchor-idx` attribute.
 *
 * Elements carrying one of ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES are skipped
 * together with their descendants. `pageNode` itself is visited but is neither
 * indexed nor returned.
 *
 * @param pageNode - root element of the article content
 * @returns the indexed elements in document order; the element at array
 *          position `n` carries anchor index `n + 1`. Empty when `pageNode`
 *          is missing or has no child nodes.
 */
function parseDomTree(pageNode: Element) {
  if (!pageNode || pageNode.childNodes.length === 0) {
    console.log(' no child nodes found')
    return []
  }

  const pending: Node[] = [pageNode]
  const visited: Element[] = []

  for (let next = pending.pop(); next !== undefined; next = pending.pop()) {
    if (next.nodeType !== 1 /* Node.ELEMENT_NODE */) {
      continue
    }
    const candidate = next as Element

    // Avoiding dynamic elements from being counted as anchor-allowed elements
    const isBlocked = ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES.some((attrib) =>
      candidate.hasAttribute(attrib)
    )
    if (isBlocked) {
      continue
    }

    visited.push(candidate)

    // Push children last-to-first so the first child is popped next,
    // which keeps the traversal in document order.
    const children = candidate.childNodes
    for (let k = children.length - 1; k >= 0; k--) {
      pending.push(children[k])
    }
  }

  // Drop the root element: only its descendants receive an index.
  const indexed = visited.slice(1)
  for (let position = 0; position < indexed.length; position++) {
    // start from index 1, index 0 reserved for anchor unknown.
    indexed[position].setAttribute(
      'data-omnivore-anchor-idx',
      String(position + 1)
    )
  }
  return indexed
}
/**
 * Appends one SSML fragment to the output buffer.
 *
 * @param textItems - buffer of SSML fragments, mutated in place
 * @param text - fragment to append
 */
function emit(textItems: string[], text: string): void {
  textItems.push(text)
}
/**
 * Produces the speakable text for a DOM text node.
 *
 * Runs of whitespace are collapsed into a single space, and the XML-reserved
 * characters `&`, `<` and `>` are replaced by their entities. `textContent`
 * holds decoded text, so without escaping a source such as `AT&amp;T` would
 * be written into the SSML as a bare `&` and make the document malformed.
 *
 * @param textNode - the node whose text should be cleaned
 * @returns the cleaned, XML-safe text; an empty string when the node has no text
 */
function cleanTextNode(textNode: ChildNode): string {
  const collapsed = (textNode.textContent ?? '').replace(/\s+/g, ' ')
  // `&` must be escaped first so the entities produced below stay intact.
  return collapsed
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
}
/**
 * Emits the cleaned text of a text node, wrapping it in `<emphasis>` when the
 * node's direct parent is a `<b>` element. Nothing is emitted for empty text.
 *
 * @param textItems - buffer of SSML fragments, mutated in place
 * @param cleanedText - the already-cleaned text of `textNode`
 * @param textNode - the originating text node; only its parent is inspected
 */
function emitTextNode(textItems: string[], cleanedText: String, textNode: ChildNode) {
  if (!cleanedText) {
    return
  }

  const spoken = `${cleanedText}`
  const insideBold = textNode.parentNode?.nodeName === 'B'

  if (insideBold) {
    emit(textItems, `<emphasis>`)
    emit(textItems, spoken)
    emit(textItems, `</emphasis>`)
  } else {
    emit(textItems, spoken)
  }
}
/**
 * Recursively emits the SSML for `element` and its descendants.
 *
 * Each non-blank text node is preceded by a `<bookmark>` carrying the anchor
 * index of the element that directly contains it. Children whose tag is in
 * SKIP_TAGS are not emitted.
 *
 * @param textItems - buffer of SSML fragments, mutated in place
 * @param element - the element to emit
 * @param isTopLevel - when true the output is wrapped in the tags returned by
 *        ssmlTagsForTopLevelElement
 * @returns the highest `data-omnivore-anchor-idx` found on `element` or on any
 *          descendant that was emitted; 0 when none of them carries an index.
 *          Callers use it to continue after the subtree that was just emitted.
 */
function emitElement(textItems: string[], element: Element, isTopLevel: boolean): number {
  const SKIP_TAGS = ['SCRIPT', 'STYLE', 'IMG', 'FIGURE', 'FIGCAPTION', 'IFRAME']
  const topLevelTags = ssmlTagsForTopLevelElement(element)
  const idx = element.getAttribute('data-omnivore-anchor-idx')

  // Elements without an anchor index yield Number(null) === 0; a malformed
  // attribute yields NaN, which is treated the same way.
  const ownIdx = Number(idx)
  let maxVisitedIdx = Number.isFinite(ownIdx) ? ownIdx : 0

  if (isTopLevel) {
    emit(textItems, topLevelTags.opening)
  }

  for (const child of Array.from(element.childNodes)) {
    // NOTE(review): skipped children keep any anchor index they were given
    // and do not advance maxVisitedIdx — confirm this is what callers expect.
    if (SKIP_TAGS.indexOf(child.nodeName) >= 0) {
      continue
    }

    if (child.nodeType === 3 /* Node.TEXT_NODE */ && (child.textContent?.length ?? 0) > 0) {
      const cleanedText = cleanTextNode(child)
      // Make sure its more than just a space, and never emit mark="null"
      // for an element that has no anchor index.
      if (idx && cleanedText.length > 1) {
        emit(textItems, `<bookmark mark="${idx}" />`)
      }
      emitTextNode(textItems, cleanedText, child)
    }

    if (child.nodeType === 1 /* Node.ELEMENT_NODE */) {
      // Keep the running maximum: a trailing child without an index returns 0
      // and must not reset the progress reported to the caller.
      maxVisitedIdx = Math.max(
        maxVisitedIdx,
        emitElement(textItems, child as Element, false)
      )
    }
  }

  if (isTopLevel) {
    emit(textItems, topLevelTags.closing)
  }

  return maxVisitedIdx
}
/**
 * One SSML document in pieces. htmlToSsml produces one item per top-level
 * element it emits; ssmlItemText joins the pieces into a single string.
 */
export type SSMLItem = {
// opening wrapper markup, placed before the text items
open: string
// closing wrapper markup, placed after the text items
close: string
// SSML fragments (tags, bookmarks and text) in output order
textItems: string[]
}
/**
 * Voice names interpolated into the `<voice name="...">` attribute of the
 * generated SSML.
 */
export type VoiceOptions = {
// used for every element except BLOCKQUOTE
primary: string
// used for BLOCKQUOTE elements
secondary: string
}
/**
 * Builds the opening SSML markup (`<speak>`, `<voice>`, `<prosody>`) for an
 * element.
 *
 * @param element - the element being emitted; BLOCKQUOTE selects the secondary voice
 * @param voices - the voice names to choose from
 * @returns the opening markup, surrounded by a leading and trailing newline
 */
const startSsml = (element: Element, voices: VoiceOptions): string => {
  let voice = voices.primary
  if (element.nodeName === 'BLOCKQUOTE') {
    voice = voices.secondary
  }
  return `
<speak xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="http://www.w3.org/2001/mstts" xmlns:emo="http://www.w3.org/2009/10/emotionml" version="1.0" xml:lang="en-US"><voice name="${voice}"><prosody rate="0%" pitch="0%">
`
}
/**
 * Returns the closing SSML markup that matches the tags opened by startSsml.
 */
const endSsml = (): string => `</prosody></voice></speak>`
/**
 * Serializes an SSML item into one string: the opening markup, every text
 * item in order, then the closing markup, with no separators.
 *
 * @param item - the item to serialize
 * @returns the concatenated SSML text
 */
export const ssmlItemText = (item: SSMLItem): string => {
  const pieces = [item.open].concat(item.textItems, item.close)
  return pieces.join('')
}
/**
 * Converts readability-processed HTML into a list of SSML items.
 *
 * The content root is the element with id `readability-page-1`. Its
 * descendants are indexed by parseDomTree, then one SSML item is produced per
 * emitted top-level element; elements already emitted as part of an earlier
 * element's subtree are not emitted again.
 *
 * @param html - HTML containing a `#readability-page-1` element
 * @param voices - voice names used for the opening markup of each item
 * @returns the SSML items in document order
 * @throws Error when the content root is missing or contains no elements
 */
export const htmlToSsml = (html: string, voices: VoiceOptions): SSMLItem[] => {
  const dom = parseHTML(html)
  const body = dom.document.querySelector('#readability-page-1')
  if (!body) {
    throw new Error('Unable to parse HTML document')
  }

  const parsedNodes = parseDomTree(body)
  if (parsedNodes.length < 1) {
    throw new Error('No HTML nodes found')
  }

  const items: SSMLItem[] = []
  // Anchor indices are 1-based: the node with index i lives at parsedNodes[i - 1].
  let i = 1
  while (i <= parsedNodes.length) {
    const textItems: string[] = []
    const node = parsedNodes[i - 1]
    const lastEmittedIdx = emitElement(textItems, node, true)
    items.push({
      open: startSsml(node, voices),
      close: endSsml(),
      textItems: textItems,
    })
    // Always move forward. emitElement can report an index lower than the
    // current one (e.g. when a trailing descendant carries no anchor index);
    // assigning that value directly would restart the loop and never finish.
    const reached = Number.isFinite(lastEmittedIdx) ? Math.max(i, lastEmittedIdx) : i
    i = reached + 1
  }
  return items
}

View File

@ -0,0 +1,75 @@
import 'mocha'
import { expect } from 'chai'
import { htmlToSsml } from '../src/htmlToSsml'
// Specs for htmlToSsml: bookmark placement for flat and nested markup, and
// voice selection for blockquotes.
describe('htmlToSsml', () => {
  const TEST_VOICES = { primary: 'test-primary', secondary: 'test-secondary' }

  describe('a simple html file', () => {
    it('should convert Html to SSML', () => {
      const ssml = htmlToSsml(`
        <div class="page" id="readability-page-1">
          <p data-omnivore-anchor-idx="1">this is some text</p>
        </div>
      `, TEST_VOICES
      )
      const text = ssml[0].textItems.join('').trim()
      expect(text).to.equal(
        `<p><bookmark mark="1" />this is some text</p>`
      )
    })
  })

  describe('a file with nested elements', () => {
    it('should convert Html to SSML', () => {
      const ssml = htmlToSsml(`
        <div class="page" id="readability-page-1">
          <p>
            this is in the first paragraph
            <span>this is in the second span</span>
            this is also in the first paragraph
          </p>
        </div>
      `, TEST_VOICES
      )
      // The nested span is emitted inside its parent's item, so only one
      // item is produced and the bookmarks switch between index 1 and 2.
      const text = ssml[0].textItems.join('').trim()
      expect(text).to.equal(
        `<p><bookmark mark="1" /> this is in the first paragraph <bookmark mark="2" />this is in the second span<bookmark mark="1" /> this is also in the first paragraph </p>`.trim()
      )
    })
  })

  describe('a file with blockquotes', () => {
    it('should convert Html to SSML with complementary voices', () => {
      const ssml = htmlToSsml(`
        <div class="page" id="readability-page-1">
          <p>first</p>
          <blockquote>second</blockquote>
          <p>third</p>
        </div>
      `, TEST_VOICES
      )
      const first = ssml[0].textItems.join('').trim()
      const second = ssml[1].textItems.join('').trim()
      const third = ssml[2].textItems.join('').trim()

      expect(first).to.equal(
        `<p><bookmark mark="1" />first</p>`
      )
      expect(second).to.equal(
        `<p><bookmark mark="2" />second</p>`
      )
      expect(third).to.equal(
        `<p><bookmark mark="3" />third</p>`
      )

      // Only the blockquote item is opened with the secondary voice.
      expect(ssml[0].open.trim()).to.equal(
        `<speak xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="http://www.w3.org/2001/mstts" xmlns:emo="http://www.w3.org/2009/10/emotionml" version="1.0" xml:lang="en-US"><voice name="test-primary"><prosody rate="0%" pitch="0%">`
      )
      expect(ssml[1].open.trim()).to.equal(
        `<speak xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="http://www.w3.org/2001/mstts" xmlns:emo="http://www.w3.org/2009/10/emotionml" version="1.0" xml:lang="en-US"><voice name="test-secondary"><prosody rate="0%" pitch="0%">`
      )
      expect(ssml[2].open.trim()).to.equal(
        `<speak xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="http://www.w3.org/2001/mstts" xmlns:emo="http://www.w3.org/2009/10/emotionml" version="1.0" xml:lang="en-US"><voice name="test-primary"><prosody rate="0%" pitch="0%">`
      )
    })
  })
})