Files
omnivore/packages/api/src/utils/textToSpeech.ts

335 lines
9.8 KiB
TypeScript

import { buildLogger } from './logger'
import { createGCSFile, uploadToBucket } from './uploads'
import {
CancellationDetails,
CancellationReason,
ResultReason,
SpeechConfig,
SpeechSynthesisOutputFormat,
SpeechSynthesisResult,
SpeechSynthesizer,
} from 'microsoft-cognitiveservices-speech-sdk'
import { env } from '../env'
import { parseHTML } from 'linkedom'
export interface TextToSpeechInput {
id: string
text: string
voice?: string
languageCode?: string
textType?: 'text' | 'ssml'
rate?: number
volume?: number
complimentaryVoice?: string
}
export interface TextToSpeechOutput {
audioFileName: string
speechMarksFileName: string
}
export interface SpeechMark {
time: number
start?: number
length?: number
word: string
type: 'word' | 'bookmark'
}
const logger = buildLogger('app.dispatch')
export const synthesizeTextToSpeech = async (
input: TextToSpeechInput
): Promise<TextToSpeechOutput> => {
const audioFileName = `speech/${input.id}.mp3`
const audioFile = createGCSFile(audioFileName)
const writeStream = audioFile.createWriteStream({
resumable: true,
})
const speechConfig = SpeechConfig.fromSubscription(
env.azure.speechKey,
env.azure.speechRegion
)
const textType = input.textType || 'text'
if (textType === 'text') {
speechConfig.speechSynthesisLanguage = input.languageCode || 'en-US'
speechConfig.speechSynthesisVoiceName = input.voice || 'en-US-JennyNeural'
}
speechConfig.speechSynthesisOutputFormat =
SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3
// Create the speech synthesizer.
const synthesizer = new SpeechSynthesizer(speechConfig)
const speechMarks: SpeechMark[] = []
let timeOffset = 0
let characterOffset = 0
synthesizer.synthesizing = function (s, e) {
// convert arrayBuffer to stream and write to gcs file
writeStream.write(Buffer.from(e.result.audioData))
}
// The event synthesis completed signals that the synthesis is completed.
synthesizer.synthesisCompleted = (s, e) => {
logger.info(
`(synthesized) Reason: ${ResultReason[e.result.reason]} Audio length: ${
e.result.audioData.byteLength
}`
)
}
// The synthesis started event signals that the synthesis is started.
synthesizer.synthesisStarted = (s, e) => {
logger.info('(synthesis started)')
}
// The event signals that the service has stopped processing speech.
// This can happen when an error is encountered.
synthesizer.SynthesisCanceled = (s, e) => {
const cancellationDetails = CancellationDetails.fromResult(e.result)
let str =
'(cancel) Reason: ' + CancellationReason[cancellationDetails.reason]
if (cancellationDetails.reason === CancellationReason.Error) {
str += ': ' + e.result.errorDetails
}
logger.info(str)
}
// The unit of e.audioOffset is tick (1 tick = 100 nanoseconds), divide by 10,000 to convert to milliseconds.
synthesizer.wordBoundary = (s, e) => {
speechMarks.push({
word: e.text,
time: (timeOffset + e.audioOffset) / 10000,
start: characterOffset + e.textOffset,
length: e.wordLength,
type: 'word',
})
}
synthesizer.bookmarkReached = (s, e) => {
logger.debug(
`(Bookmark reached), Audio offset: ${
e.audioOffset / 10000
}ms, bookmark text: ${e.text}`
)
speechMarks.push({
word: e.text,
time: (timeOffset + e.audioOffset) / 10000,
type: 'bookmark',
})
}
const speakTextAsyncPromise = (
text: string
): Promise<SpeechSynthesisResult> => {
return new Promise((resolve, reject) => {
synthesizer.speakTextAsync(
text,
(result) => {
resolve(result)
},
(error) => {
reject(error)
}
)
})
}
const speakSsmlAsyncPromise = (
text: string
): Promise<SpeechSynthesisResult> => {
return new Promise((resolve, reject) => {
synthesizer.speakSsmlAsync(
text,
(result) => {
resolve(result)
},
(error) => {
reject(error)
}
)
})
}
if (textType === 'text') {
// slice the text into chunks of 5,000 characters
let currentTextChunk = ''
const textChunks = input.text.split('\n')
for (let i = 0; i < textChunks.length; i++) {
currentTextChunk += textChunks[i] + '\n'
if (currentTextChunk.length < 5000 && i < textChunks.length - 1) {
continue
}
logger.debug(`synthesizing ${currentTextChunk}`)
const result = await speakTextAsyncPromise(currentTextChunk)
timeOffset = timeOffset + result.audioDuration
characterOffset = characterOffset + currentTextChunk.length
currentTextChunk = ''
}
} else {
const document = parseHTML(input.text).document
const elements = document.querySelectorAll(
'h1, h2, h3, p, ul, ol, blockquote'
)
// convert html elements to the ssml document
for (const e of Array.from(elements)) {
const htmlElement = e as HTMLElement
if (htmlElement.innerText) {
// use complimentary voice for blockquote, hardcoded for now
const voice =
htmlElement.tagName.toLowerCase() === 'blockquote'
? input.complimentaryVoice || 'en-US-AriaNeural'
: input.voice
const ssml = htmlElementToSsml({
htmlElement: e,
language: input.languageCode,
rate: input.rate,
volume: input.volume,
voice,
})
logger.debug(`synthesizing ${ssml}`)
const result = await speakSsmlAsyncPromise(ssml)
// if (result.reason === ResultReason.Canceled) {
// synthesizer.close()
// throw new Error(result.errorDetails)
// }
timeOffset = timeOffset + result.audioDuration
// characterOffset = characterOffset + htmlElement.innerText.length
}
}
}
writeStream.end()
synthesizer.close()
logger.debug(`audio file: ${audioFileName}`)
// upload Speech Marks file to GCS
const speechMarksFileName = `speech/${input.id}.json`
await uploadToBucket(
speechMarksFileName,
Buffer.from(JSON.stringify(speechMarks))
)
return {
audioFileName,
speechMarksFileName,
}
}
export const htmlElementToSsml = ({
htmlElement,
language = 'en-US',
voice = 'en-US-JennyNeural',
rate = 1,
volume = 100,
}: {
htmlElement: Element
language?: string
voice?: string
rate?: number
volume?: number
}): string => {
const replaceElement = (newElement: Element, oldElement: Element) => {
const id = oldElement.getAttribute('data-omnivore-anchor-idx')
if (id) {
const e = htmlElement.querySelector(`[data-omnivore-anchor-idx="${id}"]`)
e?.parentNode?.replaceChild(newElement, e)
}
}
const appendBookmarkElement = (parent: Element, element: Element) => {
const id = element.getAttribute('data-omnivore-anchor-idx')
if (id) {
const bookMark = ssml.createElement('bookmark')
bookMark.setAttribute('mark', `data-omnivore-anchor-idx-${id}`)
parent.appendChild(bookMark)
}
}
const replaceWithEmphasis = (element: Element, level: string) => {
const parent = ssml.createDocumentFragment() as unknown as Element
appendBookmarkElement(parent, element)
const emphasisElement = ssml.createElement('emphasis')
emphasisElement.setAttribute('level', level)
emphasisElement.innerHTML = element.innerHTML.trim()
parent.appendChild(emphasisElement)
replaceElement(parent, element)
}
const replaceWithSentence = (element: Element) => {
const parent = ssml.createDocumentFragment() as unknown as Element
appendBookmarkElement(parent, element)
const sentenceElement = ssml.createElement('s')
sentenceElement.innerHTML = element.innerHTML.trim()
parent.appendChild(sentenceElement)
replaceElement(parent, element)
}
// create new ssml document
const ssml = parseHTML('').document
const speakElement = ssml.createElement('speak')
speakElement.setAttribute('version', '1.0')
speakElement.setAttribute('xmlns', 'http://www.w3.org/2001/10/synthesis')
speakElement.setAttribute('xml:lang', language)
const voiceElement = ssml.createElement('voice')
voiceElement.setAttribute('name', voice)
speakElement.appendChild(voiceElement)
const prosodyElement = ssml.createElement('prosody')
prosodyElement.setAttribute('rate', `${rate}`)
prosodyElement.setAttribute('volume', volume.toString())
voiceElement.appendChild(prosodyElement)
// add each paragraph to the ssml document
appendBookmarkElement(prosodyElement, htmlElement)
// replace emphasis elements with ssml
htmlElement.querySelectorAll('*').forEach((e) => {
switch (e.tagName.toLowerCase()) {
case 's':
replaceWithEmphasis(e, 'moderate')
break
case 'sub':
if (e.getAttribute('alias') === null) {
replaceWithEmphasis(e, 'moderate')
}
break
case 'i':
case 'em':
case 'q':
case 'blockquote':
case 'cite':
case 'del':
case 'strike':
case 'sup':
case 'summary':
case 'caption':
case 'figcaption':
replaceWithEmphasis(e, 'moderate')
break
case 'b':
case 'strong':
case 'dt':
case 'dfn':
case 'u':
case 'mark':
case 'th':
case 'title':
case 'var':
replaceWithEmphasis(e, 'moderate')
break
case 'li':
replaceWithSentence(e)
break
default: {
const parent = ssml.createDocumentFragment() as unknown as Element
appendBookmarkElement(parent, e)
const text = (e as HTMLElement).innerText.trim()
const textElement = ssml.createTextNode(text)
parent.appendChild(textElement)
replaceElement(parent, e)
}
}
})
prosodyElement.appendChild(htmlElement)
return speakElement.outerHTML.replace(/&nbsp;|\n/g, '')
}