From 2e47b0879cd95eeefb6346484c40175bee2a8b38 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 7 Nov 2022 19:30:20 +0800 Subject: [PATCH] Convert HTML to utterances --- packages/api/src/utils/createTask.ts | 3 ++ packages/text-to-speech/src/index.ts | 2 + .../src/realisticTextToSpeech.ts | 43 ++++++++++++++----- 3 files changed, 37 insertions(+), 11 deletions(-) diff --git a/packages/api/src/utils/createTask.ts b/packages/api/src/utils/createTask.ts index c8102fa02..b61de5165 100644 --- a/packages/api/src/utils/createTask.ts +++ b/packages/api/src/utils/createTask.ts @@ -344,6 +344,7 @@ export const enqueueTextToSpeech = async ({ bucket = env.fileUpload.gcsUploadBucket, queue = 'omnivore-demo-text-to-speech-queue', location = env.gcp.location, + isUltraRealisticVoice = false, }: { userId: string speechId: string @@ -354,6 +355,7 @@ export const enqueueTextToSpeech = async ({ textType?: 'text' | 'ssml' queue?: string location?: string + isUltraRealisticVoice?: boolean }): Promise => { const { GOOGLE_CLOUD_PROJECT } = process.env const payload = { @@ -362,6 +364,7 @@ export const enqueueTextToSpeech = async ({ voice, bucket, textType, + isUltraRealisticVoice, } // eslint-disable-next-line @typescript-eslint/ban-ts-comment // @ts-ignore diff --git a/packages/text-to-speech/src/index.ts b/packages/text-to-speech/src/index.ts index 21f7b07ba..a57a4a098 100644 --- a/packages/text-to-speech/src/index.ts +++ b/packages/text-to-speech/src/index.ts @@ -167,6 +167,8 @@ export const textToSpeechHandler = Sentry.GCPFunction.wrapHttpFunction( }) as NodeJS.WriteStream // synthesize text to speech const startTime = Date.now() + // temporary solution to use realistic text to speech + input.isUltraRealisticVoice = true const { speechMarks } = await synthesizeTextToSpeech({ ...input, textType: 'html', diff --git a/packages/text-to-speech/src/realisticTextToSpeech.ts b/packages/text-to-speech/src/realisticTextToSpeech.ts index 295067d03..4b5c5d4d2 100644 --- a/packages/text-to-speech/src/realisticTextToSpeech.ts +++ b/packages/text-to-speech/src/realisticTextToSpeech.ts @@ -7,6 +7,7 @@ import axios from 'axios' import ffmpegPath from '@ffmpeg-installer/ffmpeg' import ffmpeg from 'fluent-ffmpeg' import { PassThrough } from 'stream' +import { htmlToSpeechFile } from './htmlToSsml' ffmpeg.setFfmpegPath(ffmpegPath.path) @@ -15,15 +16,24 @@ interface PlayHtConvertResponse { payload: string[] } -const streamWavToMp3 = (inputStream: PassThrough, outputSteam: PassThrough) => { +const streamWavToMp3 = ( + inputStream: PassThrough, + outputStream: PassThrough +) => { ffmpeg(inputStream) + .inputFormat('wav') + .format('mp3') + .audioBitrate('32k') + .audioChannels(2) + .audioCodec('libmp3lame') .on('error', (err) => { throw err }) .on('end', () => { - outputSteam.end() + console.debug('transcoding finished') + outputStream.end() }) - .pipe(outputSteam, { end: true }) + .pipe(outputStream, { end: true }) } export class RealisticTextToSpeech implements TextToSpeech { @@ -38,7 +48,7 @@ export class RealisticTextToSpeech implements TextToSpeech { } const inputStream = new PassThrough() - const outputStream = input.audioStream + const outputStream = input.audioStream as PassThrough const HEADERS = { Authorization: apiKey, @@ -46,9 +56,19 @@ export class RealisticTextToSpeech implements TextToSpeech { 'Content-Type': 'application/json', } + const speechFile = htmlToSpeechFile({ + title: '', + content: input.text, + options: { + primaryVoice: input.voice, + secondaryVoice: input.secondaryVoice, + language: input.language, + }, + }) + const content = speechFile.utterances.map((u) => u.text) const data = { voice: input.voice, - content: [input.text], + content, } // get the download url first @@ -70,7 +90,8 @@ export class RealisticTextToSpeech implements TextToSpeech { // timeout after 1 hour const timeout = 60 * 60 * 1000 const startTime = Date.now() - while (true) { + let audioData: Buffer | undefined + while (!audioData) { if (Date.now() - startTime > timeout) { throw new Error('Timeout when polling the download url') } @@ -85,8 +106,9 @@ export class RealisticTextToSpeech implements TextToSpeech { }) // write the audio file to the input stream - inputStream.end(downloadResponse.data) - break + // eslint-disable-next-line @typescript-eslint/no-unsafe-argument + audioData = Buffer.from(downloadResponse.data, 'binary') + inputStream.end(audioData) } catch (e) { // ignore error console.debug('checking status of audio file', downloadUrl) @@ -94,11 +116,10 @@ export class RealisticTextToSpeech implements TextToSpeech { } // transcode the audio file to mp3 - if (outputStream) { - streamWavToMp3(inputStream, outputStream as PassThrough) - } + streamWavToMp3(inputStream, outputStream) return { + audioData, speechMarks: [], } }