Add SSML-to-speech synthesis and a test
This commit is contained in:
@ -17,6 +17,9 @@ export interface TextToSpeechInput {
|
||||
text: string
|
||||
voice?: string
|
||||
languageCode?: string
|
||||
textType?: 'text' | 'ssml'
|
||||
rate?: number
|
||||
volume?: number
|
||||
}
|
||||
|
||||
export interface TextToSpeechOutput {
|
||||
@ -47,8 +50,11 @@ export const synthesizeTextToSpeech = async (
|
||||
env.azure.speechKey,
|
||||
env.azure.speechRegion
|
||||
)
|
||||
speechConfig.speechSynthesisLanguage = input.languageCode || 'en-US'
|
||||
speechConfig.speechSynthesisVoiceName = input.voice || 'en-US-JennyNeural'
|
||||
const textType = input.textType || 'text'
|
||||
if (textType === 'text') {
|
||||
speechConfig.speechSynthesisLanguage = input.languageCode || 'en-US'
|
||||
speechConfig.speechSynthesisVoiceName = input.voice || 'en-US-JennyNeural'
|
||||
}
|
||||
speechConfig.speechSynthesisOutputFormat =
|
||||
SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3
|
||||
|
||||
@ -129,19 +135,59 @@ export const synthesizeTextToSpeech = async (
|
||||
)
|
||||
})
|
||||
}
|
||||
// slice the text into chunks of 5,000 characters
|
||||
let currentTextChunk = ''
|
||||
const textChunks = input.text.split('\n')
|
||||
for (let i = 0; i < textChunks.length; i++) {
|
||||
currentTextChunk += textChunks[i] + '\n'
|
||||
if (currentTextChunk.length < 5000 && i < textChunks.length - 1) {
|
||||
continue
|
||||
|
||||
const speakSsmlAsyncPromise = (
|
||||
text: string
|
||||
): Promise<SpeechSynthesisResult> => {
|
||||
return new Promise((resolve, reject) => {
|
||||
synthesizer.speakSsmlAsync(
|
||||
text,
|
||||
(result) => {
|
||||
resolve(result)
|
||||
},
|
||||
(error) => {
|
||||
synthesizer.close()
|
||||
reject(error)
|
||||
}
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
if (textType === 'text') {
|
||||
// slice the text into chunks of 5,000 characters
|
||||
let currentTextChunk = ''
|
||||
const textChunks = input.text.split('\n')
|
||||
for (let i = 0; i < textChunks.length; i++) {
|
||||
currentTextChunk += textChunks[i] + '\n'
|
||||
if (currentTextChunk.length < 5000 && i < textChunks.length - 1) {
|
||||
continue
|
||||
}
|
||||
logger.debug(`synthesizing ${currentTextChunk}`)
|
||||
const result = await speakTextAsyncPromise(currentTextChunk)
|
||||
timeOffset = timeOffset + result.audioDuration
|
||||
characterOffset = characterOffset + currentTextChunk.length
|
||||
currentTextChunk = ''
|
||||
}
|
||||
} else {
|
||||
const document = parseHTML(input.text).document
|
||||
const elements = document.querySelectorAll('h1, h2, h3, p, li')
|
||||
// convert html elements to the ssml document
|
||||
for (const e of Array.from(elements)) {
|
||||
const htmlElement = e as HTMLElement
|
||||
if (htmlElement.innerText) {
|
||||
const result = await speakSsmlAsyncPromise(
|
||||
htmlElementToSsml(
|
||||
htmlElement,
|
||||
input.languageCode,
|
||||
input.voice,
|
||||
input.rate,
|
||||
input.volume
|
||||
)
|
||||
)
|
||||
timeOffset = timeOffset + result.audioDuration
|
||||
characterOffset = characterOffset + htmlElement.innerText.length
|
||||
}
|
||||
}
|
||||
logger.debug(`synthesizing ${currentTextChunk}`)
|
||||
const result = await speakTextAsyncPromise(currentTextChunk)
|
||||
timeOffset = timeOffset + result.audioDuration
|
||||
characterOffset = characterOffset + currentTextChunk.length
|
||||
currentTextChunk = ''
|
||||
}
|
||||
writeStream.end()
|
||||
synthesizer.close()
|
||||
@ -164,15 +210,13 @@ export const synthesizeTextToSpeech = async (
|
||||
}
|
||||
}
|
||||
|
||||
export const htmlToSsml = (
|
||||
html: string,
|
||||
export const htmlElementToSsml = (
|
||||
htmlElement: HTMLElement,
|
||||
language = 'en-US',
|
||||
voice = 'en-US-JennyNeural',
|
||||
rate = 100,
|
||||
rate = 1,
|
||||
volume = 100
|
||||
): string => {
|
||||
const document = parseHTML(html).document
|
||||
const paragraphs = document.querySelectorAll('p')
|
||||
// create new ssml document
|
||||
const ssml = parseHTML('').document
|
||||
const speakElement = ssml.createElement('speak')
|
||||
@ -183,20 +227,18 @@ export const htmlToSsml = (
|
||||
voiceElement.setAttribute('name', voice)
|
||||
speakElement.appendChild(voiceElement)
|
||||
const prosodyElement = ssml.createElement('prosody')
|
||||
prosodyElement.setAttribute('rate', `${rate}%`)
|
||||
prosodyElement.setAttribute('rate', `${rate}`)
|
||||
prosodyElement.setAttribute('volume', volume.toString())
|
||||
voiceElement.appendChild(prosodyElement)
|
||||
// add each paragraph to the ssml document
|
||||
paragraphs.forEach((p) => {
|
||||
const id = p.getAttribute('data-omnivore-anchor-idx')
|
||||
if (id) {
|
||||
const text = p.innerText
|
||||
const bookMark = ssml.createElement('bookmark')
|
||||
bookMark.setAttribute('mark', `data-omnivore-anchor-idx-${id}`)
|
||||
bookMark.innerText = text
|
||||
prosodyElement.appendChild(bookMark)
|
||||
}
|
||||
})
|
||||
const id = htmlElement.getAttribute('data-omnivore-anchor-idx')
|
||||
if (id) {
|
||||
const text = htmlElement.innerText
|
||||
const bookMark = ssml.createElement('bookmark')
|
||||
bookMark.setAttribute('mark', `data-omnivore-anchor-idx-${id}`)
|
||||
prosodyElement.appendChild(bookMark)
|
||||
prosodyElement.appendChild(ssml.createTextNode(text))
|
||||
}
|
||||
|
||||
return speakElement.outerHTML
|
||||
}
|
||||
|
||||
@ -1,23 +1,38 @@
|
||||
import 'mocha'
|
||||
import {
|
||||
htmlElementToSsml,
|
||||
synthesizeTextToSpeech,
|
||||
TextToSpeechInput,
|
||||
} from '../../src/utils/textToSpeech'
|
||||
import { expect } from 'chai'
|
||||
import { generateFakeUuid } from '../util'
|
||||
import { parseHTML } from 'linkedom'
|
||||
|
||||
describe('textToSpeech', () => {
|
||||
describe('createAudioWithSpeechMarks', () => {
|
||||
describe('synthesizeTextToSpeech', () => {
|
||||
it('should create an audio file with speech marks', async () => {
|
||||
const input: TextToSpeechInput = {
|
||||
id: generateFakeUuid(),
|
||||
text: 'Marry had a little lamb',
|
||||
languageCode: 'en-US',
|
||||
voice: 'en-US-JennyNeural',
|
||||
textType: 'text',
|
||||
}
|
||||
const output = await synthesizeTextToSpeech(input)
|
||||
expect(output.audioUrl).to.be.a('string')
|
||||
expect(output.speechMarksUrl).to.be.a('string')
|
||||
})
|
||||
})
|
||||
|
||||
describe('htmlElementToSsml', () => {
|
||||
it('should convert Html Element to SSML', async () => {
|
||||
const htmlElement = parseHTML(
|
||||
`<p data-omnivore-anchor-idx="1">Marry had a little lamb</p>`
|
||||
).document.documentElement
|
||||
const ssml = htmlElementToSsml(htmlElement)
|
||||
expect(ssml).to.equal(
|
||||
`<speak xml:lang="en-US" xmlns="http://www.w3.org/2001/10/synthesis" version="1.0"><voice name="en-US-JennyNeural"><prosody volume="100" rate="1"><bookmark mark="data-omnivore-anchor-idx-1"></bookmark>Marry had a little lamb</prosody></voice></speak>`
|
||||
)
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user