Add function to parse HTML to SSML

This commit is contained in:
Hongbo Wu
2022-08-16 22:15:44 +08:00
parent def8f28138
commit 447e413605
4 changed files with 101 additions and 103 deletions

View File

@ -327,14 +327,12 @@ export const enqueueSyncWithIntegration = async (
export const enqueueTextToSpeech = async (
userId: string,
pageId: string,
text: string
pageId: string
): Promise<string> => {
const { GOOGLE_CLOUD_PROJECT } = process.env
const payload = {
userId,
pageId,
text,
}
// If there is no Google Cloud Project Id exposed, it means that we are in local environment

View File

@ -276,6 +276,48 @@ export const parsePreparedContent = async (
})
article.content = article.dom.outerHTML
}
const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [
'omnivore-highlight-id',
'data-twitter-tweet-id',
'data-instagram-id',
]
// Get the top level element?
const pageNode = article.dom.firstElementChild as HTMLElement
console.log('pageNode: ', pageNode)
const nodesToVisitStack: [HTMLElement] = [pageNode]
const visitedNodeList = []
while (nodesToVisitStack.length > 0) {
const currentNode = nodesToVisitStack.pop()
console.log('currentNode: ', currentNode?.nodeType)
if (
currentNode?.nodeType !== 1 ||
// Avoiding dynamic elements from being counted as anchor-allowed elements
ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES.some((attrib) =>
currentNode.hasAttribute(attrib)
)
) {
continue
}
visitedNodeList.push(currentNode)
;[].slice
.call(currentNode.childNodes)
.reverse()
.forEach(function (node) {
nodesToVisitStack.push(node)
})
}
visitedNodeList.shift()
visitedNodeList.forEach((node, index) => {
// start from index 1, index 0 reserved for anchor unknown.
node.setAttribute('data-omnivore-anchor-idx', (index + 1).toString())
})
console.log('article content:', article.dom.outerHTML)
article.content = article.dom.outerHTML
}
const newWindow = parseHTML('')

View File

@ -10,13 +10,12 @@ import {
SpeechSynthesizer,
} from 'microsoft-cognitiveservices-speech-sdk'
import { env } from '../env'
import { parseHTML } from 'linkedom'
export interface TextToSpeechInput {
id: string
text: string
voice?: string
textType?: 'text' | 'ssml'
engine?: 'standard' | 'neural'
languageCode?: string
}
@ -27,16 +26,14 @@ export interface TextToSpeechOutput {
export interface SpeechMark {
time: number
start: number
length: number
start?: number
length?: number
word: string
type: 'word' | 'bookmark'
}
const logger = buildLogger('app.dispatch')
// // create a new AWS Polly client
// const client = new AWS.Polly()
export const synthesizeTextToSpeech = async (
input: TextToSpeechInput
): Promise<TextToSpeechOutput> => {
@ -69,10 +66,9 @@ export const synthesizeTextToSpeech = async (
// The event synthesis completed signals that the synthesis is completed.
synthesizer.synthesisCompleted = (s, e) => {
logger.info(
'(synthesized) Reason: ' +
ResultReason[e.result.reason] +
' Audio length: ' +
`(synthesized) Reason: ${ResultReason[e.result.reason]} Audio length: ${
e.result.audioData.byteLength
}`
)
}
@ -100,6 +96,20 @@ export const synthesizeTextToSpeech = async (
time: (timeOffset + e.audioOffset) / 10000,
start: characterOffset + e.textOffset,
length: e.wordLength,
type: 'word',
})
}
synthesizer.bookmarkReached = (s, e) => {
logger.info(
`(Bookmark reached), Audio offset: ${
e.audioOffset / 10000
}ms, bookmark text: ${e.text}`
)
speechMarks.push({
word: e.text,
time: (timeOffset + e.audioOffset) / 10000,
type: 'bookmark',
})
}
@ -138,84 +148,39 @@ export const synthesizeTextToSpeech = async (
}
}
// export const createAudio = async (
// input: TextToSpeechInput
// ): Promise<Buffer> => {
// const { text, voice, textType, engine, languageCode } = input
// const params: SynthesizeSpeechInput = {
// OutputFormat: 'ogg_vorbis',
// Text: text,
// TextType: textType || 'text',
// VoiceId: voice || 'Joanna',
// Engine: engine || 'neural',
// LanguageCode: languageCode || 'en-US',
// }
// try {
// const data = await client.synthesizeSpeech(params).promise()
// return data.AudioStream as Buffer
// } catch (error) {
// logger.error('Unable to create audio file', { error })
// throw error
// }
// }
/**
 * Converts parsed article HTML into an SSML document for speech synthesis.
 *
 * Each <p> element carrying a `data-omnivore-anchor-idx` attribute is emitted
 * as a <bookmark mark="data-omnivore-anchor-idx-{id}"/> marker immediately
 * followed by the paragraph's text, so the synthesizer's bookmarkReached
 * event fires right before the paragraph is spoken.
 *
 * @param html - article HTML (already annotated with anchor indexes)
 * @param language - BCP-47 language tag for the <speak> element
 * @param voice - synthesis voice name
 * @param rate - prosody rate, as a percentage
 * @param volume - prosody volume
 * @returns the serialized SSML string
 */
export const htmlToSsml = (
  html: string,
  language = 'en-US',
  voice = 'en-US-JennyNeural',
  rate = 100,
  volume = 100
): string => {
  const document = parseHTML(html).document
  const paragraphs = document.querySelectorAll('p')
  // Build the SSML skeleton: <speak><voice><prosody>…</prosody></voice></speak>
  const ssml = parseHTML('').document
  const speakElement = ssml.createElement('speak')
  speakElement.setAttribute('version', '1.0')
  speakElement.setAttribute('xmlns', 'http://www.w3.org/2001/10/synthesis')
  speakElement.setAttribute('xml:lang', language)
  const voiceElement = ssml.createElement('voice')
  voiceElement.setAttribute('name', voice)
  speakElement.appendChild(voiceElement)
  const prosodyElement = ssml.createElement('prosody')
  prosodyElement.setAttribute('rate', `${rate}%`)
  prosodyElement.setAttribute('volume', volume.toString())
  voiceElement.appendChild(prosodyElement)
  // Add each anchored paragraph to the SSML document.
  paragraphs.forEach((p) => {
    const id = p.getAttribute('data-omnivore-anchor-idx')
    if (id) {
      // SSML <bookmark> is an empty marker element: text nested INSIDE it is
      // not synthesized. Emit the bookmark first, then the paragraph text as
      // a following sibling so the text is actually spoken and the bookmark
      // marks the position just before it.
      const bookMark = ssml.createElement('bookmark')
      bookMark.setAttribute('mark', `data-omnivore-anchor-idx-${id}`)
      prosodyElement.appendChild(bookMark)
      // A text node is escaped on serialization (&, <, >), keeping the SSML
      // well-formed when the paragraph contains markup characters.
      prosodyElement.appendChild(ssml.createTextNode(p.innerText))
    }
  })
  return speakElement.outerHTML
}

View File

@ -11,16 +11,9 @@ describe('textToSpeech', () => {
it('should create an audio file with speech marks', async () => {
const input: TextToSpeechInput = {
id: generateFakeUuid(),
text:
'《太阁立志传5 DX》清洲会议触发教程\n' +
'玩家要亲历清洲会议事件,需要位于织田家。\n' +
'清洲会议需要完成以下条件才能触发:\n' +
'本能寺发生之后,织田信长和织田信忠死亡。\n' +
'羽柴秀吉、柴田胜家、织田信雄、织田信孝为大名。\n' +
'清洲城必须为信雄的直辖城,或者清洲城主为信雄一方。\n' +
'前两个条件都很容易达成,主要是要保证清洲城主为信雄这一条件比较难办,需要玩家控制城主封地。',
languageCode: 'zh-CN',
voice: 'zh-CN-XiaochenNeural',
text: 'Marry had a little lamb',
languageCode: 'en-US',
voice: 'en-US-JennyNeural',
}
const output = await synthesizeTextToSpeech(input)
expect(output.audioUrl).to.be.a('string')