diff --git a/packages/text-to-speech/src/htmlToSsml.ts b/packages/text-to-speech/src/htmlToSsml.ts new file mode 100644 index 000000000..581959543 --- /dev/null +++ b/packages/text-to-speech/src/htmlToSsml.ts @@ -0,0 +1,178 @@ + + + +import { parseHTML } from 'linkedom' + +// this code needs to be kept in sync with the +// frontend code in: useReadingProgressAnchor + +/** Attribute names that mark dynamic elements; parseDomTree skips any element carrying one of them so it is not indexed as an anchor. */ const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [ + 'omnivore-highlight-id', + 'data-twitter-tweet-id', + 'data-instagram-id', +] + +/** Returns the opening and closing strings emitted around a top-level element. The element argument is not read at present (the BLOCKQUOTE branch is commented out), so every element gets the same pair. */ function ssmlTagsForTopLevelElement(element: Element) { + // if (element.nodeName == 'BLOCKQUOTE') { + // return { + // opening: `

`, + // closing: `

` + // } + // } + return { + opening: `

`, + closing: `

` + } +} + +/** Collects the elements below pageNode with an explicit stack; children are pushed in reverse so they are popped in document order. Non-element nodes and elements carrying a blocked attribute are skipped, and because the skip happens before their children are pushed, their descendants are skipped too. The first visited entry (pageNode itself) is dropped and each remaining element gets a 1-based data-omnivore-anchor-idx attribute. Returns the indexed elements in visit order, or an empty array (after logging) when pageNode is falsy or has no child nodes. */ function parseDomTree(pageNode: Element) { + if (!pageNode || pageNode.childNodes.length == 0) { + console.log(' no child nodes found') + return [] + } + + const nodesToVisitStack = [pageNode] + const visitedNodeList = [] + + while (nodesToVisitStack.length > 0) { + const currentNode = nodesToVisitStack.pop() + if ( + currentNode?.nodeType !== 1 /* Node.ELEMENT_NODE */ || + // Avoiding dynamic elements from being counted as anchor-allowed elements + ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES.some((attrib) => + currentNode.hasAttribute(attrib) + ) + ) { + continue + } + + visitedNodeList.push(currentNode) + ;[].slice + .call(currentNode.childNodes) + .reverse() + .forEach(function (node) { + nodesToVisitStack.push(node) + }) + } + + visitedNodeList.shift() + visitedNodeList.forEach((node, index) => { + // start from index 1, index 0 reserved for anchor unknown. + node.setAttribute('data-omnivore-anchor-idx', (index + 1).toString()) + }) + return visitedNodeList +} + +/** Appends text to the textItems accumulator. */ function emit(textItems: string[], text: string) { + textItems.push(text) +} + +/** Returns the node text (empty string when textContent is null) with each whitespace run collapsed to one space. */ function cleanTextNode(textNode: ChildNode): String { + return (textNode.textContent ?? '').replace(/\s+/g, ' ') +} + +/** Emits cleanedText, preceded by an opening emphasis tag when the direct parent of textNode is a B element; does nothing when cleanedText is empty. NOTE(review): the emit after the text is an empty template literal in this view, so no closing tag is produced - confirm whether markup was lost. */ function emitTextNode(textItems: string[], cleanedText: String, textNode: ChildNode) { + const ssmlElement = textNode.parentNode?.nodeName === 'B' ? 
"emphasis" : undefined + if (!cleanedText) { return } + + if (ssmlElement) { + emit(textItems, `<${ssmlElement}>`) + } + emit(textItems, `${cleanedText}`) + if (ssmlElement) { + emit(textItems, ``) + } +} + +/** Recursively emits the content of element into textItems. Direct children named in SKIP_TAGS are ignored, non-empty text nodes are cleaned with cleanTextNode and emitted, and element children are recursed into with isTopLevel false. When isTopLevel is true the output is bracketed by the strings from ssmlTagsForTopLevelElement. Returns the numeric data-omnivore-anchor-idx of element, replaced by the return value of the most recently recursed element child when there is one. NOTE(review): an element without the index attribute presumably yields 0 here (Number of a null attribute) - confirm callers handle that. NOTE(review): the emit guarded by cleanedText.length > 1 pushes an empty template literal in this view and idx is otherwise unused - a per-node marker may have been lost; confirm. */ function emitElement(textItems: string[], element: Element, isTopLevel: Boolean) { + const SKIP_TAGS = ['SCRIPT', 'STYLE', 'IMG', 'FIGURE', 'FIGCAPTION', 'IFRAME'] + + const topLevelTags = ssmlTagsForTopLevelElement(element) + const idx = element.getAttribute('data-omnivore-anchor-idx') + var maxVisitedIdx = Number(idx) + + if (isTopLevel) { + emit(textItems, topLevelTags.opening) + } + + for (const child of Array.from(element.childNodes)) { + if (SKIP_TAGS.indexOf(child.nodeName) >= 0) { + continue + } + + if (child.nodeType == 3 /* Node.TEXT_NODE */ && (child.textContent?.length ?? 0) > 0 ) { + const cleanedText = cleanTextNode(child) + if (cleanedText.length > 1) { // Make sure it is more than just a space + emit(textItems, ``) + } + emitTextNode(textItems, cleanedText, child) + } + if (child.nodeType == 1 /* Node.ELEMENT_NODE */) { + maxVisitedIdx = emitElement(textItems, child as HTMLElement, false) + } + } + + if (isTopLevel) { + emit(textItems, topLevelTags.closing) + } + + return Number(maxVisitedIdx) +} + +/** One output chunk: the open and close strings plus the text fragments emitted between them (joined by ssmlItemText). */ export type SSMLItem = { + open: string + close: string + textItems: string[] +} + +/** Voice names; startSsml reads secondary for BLOCKQUOTE elements and primary otherwise. */ export type VoiceOptions = { + primary: string + secondary: string +} + +/** Builds the open string for an item, choosing voices.secondary for a BLOCKQUOTE element and voices.primary otherwise. NOTE(review): in this view the returned template holds only whitespace and the chosen voice is never interpolated - confirm whether markup was lost. */ const startSsml = (element: Element, voices: VoiceOptions): string => { + const voice = element.nodeName === 'BLOCKQUOTE' ? 
voices.secondary : voices.primary + return ` + + ` +} + +/** Returns the close string for an item (an empty template literal in this view). */ const endSsml = (): string => { + return `` +} + +/** Joins item.open, every entry of item.textItems and item.close into a single string with no separator. */ export const ssmlItemText = (item: SSMLItem): string => { + return [ + item.open, + ...item.textItems, + item.close + ].join('') +} + +/** Parses html with linkedom, indexes the elements under #readability-page-1 with parseDomTree and builds one SSMLItem per emitElement pass. The loop counter is overwritten with the emitElement return value so that elements already emitted as descendants are not visited again. Throws Error when #readability-page-1 is missing or when no elements were indexed. NOTE(review): if emitElement returns less than the current counter the loop steps backwards and could repeat items or never finish - confirm. */ export const htmlToSsml = (html: string, voices: { primary: string, secondary: string}): SSMLItem[] => { + const dom = parseHTML(html) + var body = dom.document.querySelector('#readability-page-1') + if (!body) { + throw new Error('Unable to parse HTML document') + } + + var parsedNodes = parseDomTree(body) + if (parsedNodes.length < 1) { + throw new Error('No HTML nodes found') + } + + const items: SSMLItem[] = [] + for (var i = 1; i < parsedNodes.length + 1; i++) { + var textItems: string[] = [] + const node = parsedNodes[i - 1] + + i = emitElement(textItems, node, true) + items.push({ + open: startSsml(node, voices), + close: endSsml(), + textItems: textItems, + }) + } + + return items +} diff --git a/packages/text-to-speech/test/htmlToSsml.test.ts b/packages/text-to-speech/test/htmlToSsml.test.ts new file mode 100644 index 000000000..424de5d9d --- /dev/null +++ b/packages/text-to-speech/test/htmlToSsml.test.ts @@ -0,0 +1,75 @@ +import 'mocha' +import { expect } from 'chai' +import { htmlToSsml } from '../src/htmlToSsml' + +describe('htmlToSsml', () => { + const TEST_VOCIES = { primary: 'test-primary', secondary: 'test-secondary' } + + describe('a simple html file', () => { + it('should convert Html to SSML', async () => { + const ssml = htmlToSsml(` +
+

this is some text

+
+ `, TEST_VOCIES + ) + const text = ssml[0].textItems.join('').trim() + expect(text).to.equal( + `

this is some text

` + ) + }) + }) + describe('a file with nested elements', () => { + it('should convert Html to SSML', async () => { + const ssml = htmlToSsml(` +
+

+this is in the first paragraph +this is in the second span +this is also in the first paragraph +

+
+ `, TEST_VOCIES + ) + const text = ssml[0].textItems.join('').trim() + expect(text).to.equal( + `

this is in the first paragraph this is in the second span this is also in the first paragraph

`.trim() + ) + }) + }) + describe('a file with blockquotes', () => { + it('should convert Html to SSML with complimentary voices', async () => { + const ssml = htmlToSsml(` +
+

first

+
+ second
+

third

+
+ `, TEST_VOCIES + ) + const first = ssml[0].textItems.join('').trim() + const second = ssml[1].textItems.join('').trim() + const third = ssml[2].textItems.join('').trim() + + expect(first).to.equal( + `

first

` + ) + expect(second).to.equal( + `

second

` + ) + expect(third).to.equal( + `

third

` + ) + + expect(ssml[0].open.trim()).to.equal( + `` + ) + expect(ssml[1].open.trim()).to.equal( + `` + ) + expect(ssml[2].open.trim()).to.equal( + `` + ) + }) + }) +})