Convert HTML to utterances

This commit is contained in:
Hongbo Wu
2022-11-07 19:30:20 +08:00
parent abc9dd49be
commit 2e47b0879c
3 changed files with 37 additions and 11 deletions

View File

@ -344,6 +344,7 @@ export const enqueueTextToSpeech = async ({
bucket = env.fileUpload.gcsUploadBucket,
queue = 'omnivore-demo-text-to-speech-queue',
location = env.gcp.location,
isUltraRealisticVoice = false,
}: {
userId: string
speechId: string
@ -354,6 +355,7 @@ export const enqueueTextToSpeech = async ({
textType?: 'text' | 'ssml'
queue?: string
location?: string
isUltraRealisticVoice?: boolean
}): Promise<string> => {
const { GOOGLE_CLOUD_PROJECT } = process.env
const payload = {
@ -362,6 +364,7 @@ export const enqueueTextToSpeech = async ({
voice,
bucket,
textType,
isUltraRealisticVoice,
}
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore

View File

@ -167,6 +167,8 @@ export const textToSpeechHandler = Sentry.GCPFunction.wrapHttpFunction(
}) as NodeJS.WriteStream
// synthesize text to speech
const startTime = Date.now()
// temporary solution to use realistic text to speech
input.isUltraRealisticVoice = true
const { speechMarks } = await synthesizeTextToSpeech({
...input,
textType: 'html',

View File

@ -7,6 +7,7 @@ import axios from 'axios'
import ffmpegPath from '@ffmpeg-installer/ffmpeg'
import ffmpeg from 'fluent-ffmpeg'
import { PassThrough } from 'stream'
import { htmlToSpeechFile } from './htmlToSsml'
ffmpeg.setFfmpegPath(ffmpegPath.path)
@ -15,15 +16,24 @@ interface PlayHtConvertResponse {
payload: string[]
}
/**
 * Transcodes a WAV audio stream to MP3 with ffmpeg and pipes the result
 * into `outputStream`.
 *
 * Encoding settings: `libmp3lame` codec, 32k audio bitrate, 2 channels.
 *
 * @param inputStream - stream carrying the WAV data to transcode
 * @param outputStream - stream that receives the MP3 data; it is ended once
 *   transcoding finishes, and destroyed with the error if transcoding fails
 */
const streamWavToMp3 = (
  inputStream: PassThrough,
  outputStream: PassThrough
) => {
  ffmpeg(inputStream)
    .inputFormat('wav')
    .format('mp3')
    .audioBitrate('32k')
    .audioChannels(2)
    .audioCodec('libmp3lame')
    .on('error', (err) => {
      // Throwing from inside an event listener cannot be caught by the caller
      // of streamWavToMp3; hand the failure to whoever consumes the output
      // stream instead.
      outputStream.destroy(err)
    })
    .on('end', () => {
      console.debug('transcoding finished')
      // NOTE(review): likely redundant with `{ end: true }` on the pipe below;
      // kept to preserve the existing behavior.
      outputStream.end()
    })
    .pipe(outputStream, { end: true })
}
export class RealisticTextToSpeech implements TextToSpeech {
@ -38,7 +48,7 @@ export class RealisticTextToSpeech implements TextToSpeech {
}
const inputStream = new PassThrough()
const outputStream = input.audioStream
const outputStream = input.audioStream as PassThrough
const HEADERS = {
Authorization: apiKey,
@ -46,9 +56,19 @@ export class RealisticTextToSpeech implements TextToSpeech {
'Content-Type': 'application/json',
}
const speechFile = htmlToSpeechFile({
title: '',
content: input.text,
options: {
primaryVoice: input.voice,
secondaryVoice: input.secondaryVoice,
language: input.language,
},
})
const content = speechFile.utterances.map((u) => u.text)
const data = {
voice: input.voice,
content: [input.text],
content,
}
// get the download url first
@ -70,7 +90,8 @@ export class RealisticTextToSpeech implements TextToSpeech {
// timeout after 1 hour
const timeout = 60 * 60 * 1000
const startTime = Date.now()
while (true) {
let audioData: Buffer | undefined
while (!audioData) {
if (Date.now() - startTime > timeout) {
throw new Error('Timeout when polling the download url')
}
@ -85,8 +106,9 @@ export class RealisticTextToSpeech implements TextToSpeech {
})
// write the audio file to the input stream
inputStream.end(downloadResponse.data)
break
// eslint-disable-next-line @typescript-eslint/no-unsafe-argument
audioData = Buffer.from(downloadResponse.data, 'binary')
inputStream.end(audioData)
} catch (e) {
// ignore error
console.debug('checking status of audio file', downloadUrl)
@ -94,11 +116,10 @@ export class RealisticTextToSpeech implements TextToSpeech {
}
// transcode the audio file to mp3
if (outputStream) {
streamWavToMp3(inputStream, outputStream as PassThrough)
}
streamWavToMp3(inputStream, outputStream)
return {
audioData,
speechMarks: [],
}
}