From 2e47b0879cd95eeefb6346484c40175bee2a8b38 Mon Sep 17 00:00:00 2001
From: Hongbo Wu <hongbo@omnivore.app>
Date: Mon, 7 Nov 2022 19:30:20 +0800
Subject: [PATCH] Convert HTML to utterances

---
 packages/api/src/utils/createTask.ts          |  3 ++
 packages/text-to-speech/src/index.ts          |  2 +
 .../src/realisticTextToSpeech.ts              | 43 ++++++++++++++-----
 3 files changed, 37 insertions(+), 11 deletions(-)
diff --git a/packages/api/src/utils/createTask.ts b/packages/api/src/utils/createTask.ts
index c8102fa02..b61de5165 100644
--- a/packages/api/src/utils/createTask.ts
+++ b/packages/api/src/utils/createTask.ts
@@ -344,6 +344,7 @@ export const enqueueTextToSpeech = async ({
   bucket = env.fileUpload.gcsUploadBucket,
   queue = 'omnivore-demo-text-to-speech-queue',
   location = env.gcp.location,
+  isUltraRealisticVoice = false,
 }: {
   userId: string
   speechId: string
@@ -354,6 +355,7 @@ export const enqueueTextToSpeech = async ({
   textType?: 'text' | 'ssml'
   queue?: string
   location?: string
+  isUltraRealisticVoice?: boolean
 }): Promise<string> => {
   const { GOOGLE_CLOUD_PROJECT } = process.env
   const payload = {
@@ -362,6 +364,7 @@ export const enqueueTextToSpeech = async ({
     voice,
     bucket,
     textType,
+    isUltraRealisticVoice,
   }
   // eslint-disable-next-line @typescript-eslint/ban-ts-comment
   // @ts-ignore
diff --git a/packages/text-to-speech/src/index.ts b/packages/text-to-speech/src/index.ts
index 21f7b07ba..a57a4a098 100644
--- a/packages/text-to-speech/src/index.ts
+++ b/packages/text-to-speech/src/index.ts
@@ -167,6 +167,8 @@ export const textToSpeechHandler = Sentry.GCPFunction.wrapHttpFunction(
       }) as NodeJS.WriteStream
       // synthesize text to speech
       const startTime = Date.now()
+      // temporary: force the ultra-realistic voice for every request, overriding any caller-supplied isUltraRealisticVoice
+      input.isUltraRealisticVoice = true
       const { speechMarks } = await synthesizeTextToSpeech({
         ...input,
         textType: 'html',
diff --git a/packages/text-to-speech/src/realisticTextToSpeech.ts b/packages/text-to-speech/src/realisticTextToSpeech.ts
index 295067d03..4b5c5d4d2 100644
--- a/packages/text-to-speech/src/realisticTextToSpeech.ts
+++ b/packages/text-to-speech/src/realisticTextToSpeech.ts
@@ -7,6 +7,7 @@ import axios from 'axios'
 import ffmpegPath from '@ffmpeg-installer/ffmpeg'
 import ffmpeg from 'fluent-ffmpeg'
 import { PassThrough } from 'stream'
+import { htmlToSpeechFile } from './htmlToSsml'
 
 ffmpeg.setFfmpegPath(ffmpegPath.path)
 
@@ -15,15 +16,24 @@ interface PlayHtConvertResponse {
   payload: string[]
 }
 
-const streamWavToMp3 = (inputStream: PassThrough, outputSteam: PassThrough) => {
+const streamWavToMp3 = (
+  inputStream: PassThrough,
+  outputStream: PassThrough
+) => {
   ffmpeg(inputStream)
+    .inputFormat('wav')
+    .format('mp3')
+    .audioBitrate('32k')
+    .audioChannels(2)
+    .audioCodec('libmp3lame')
     .on('error', (err) => {
       throw err
     })
     .on('end', () => {
-      outputSteam.end()
+      console.debug('transcoding finished')
+      outputStream.end()
     })
-    .pipe(outputSteam, { end: true })
+    .pipe(outputStream, { end: true })
 }
 
 export class RealisticTextToSpeech implements TextToSpeech {
@@ -38,7 +48,7 @@ export class RealisticTextToSpeech implements TextToSpeech {
     }
 
     const inputStream = new PassThrough()
-    const outputStream = input.audioStream
+    const outputStream = input.audioStream as PassThrough
 
     const HEADERS = {
       Authorization: apiKey,
@@ -46,9 +56,19 @@ export class RealisticTextToSpeech implements TextToSpeech {
       'Content-Type': 'application/json',
     }
 
+    const speechFile = htmlToSpeechFile({
+      title: '',
+      content: input.text,
+      options: {
+        primaryVoice: input.voice,
+        secondaryVoice: input.secondaryVoice,
+        language: input.language,
+      },
+    })
+    const content = speechFile.utterances.map((u) => u.text)
     const data = {
       voice: input.voice,
-      content: [input.text],
+      content,
     }
 
     // get the download url first
@@ -70,7 +90,8 @@ export class RealisticTextToSpeech implements TextToSpeech {
     // timeout after 1 hour
     const timeout = 60 * 60 * 1000
     const startTime = Date.now()
-    while (true) {
+    let audioData: Buffer | undefined
+    while (!audioData) {
       if (Date.now() - startTime > timeout) {
         throw new Error('Timeout when polling the download url')
       }
@@ -85,8 +106,9 @@ export class RealisticTextToSpeech implements TextToSpeech {
         })
 
         // write the audio file to the input stream
-        inputStream.end(downloadResponse.data)
-        break
+        // eslint-disable-next-line @typescript-eslint/no-unsafe-argument
+        audioData = Buffer.from(downloadResponse.data, 'binary')
+        inputStream.end(audioData)
       } catch (e) {
         // ignore error
         console.debug('checking status of audio file', downloadUrl)
@@ -94,11 +116,10 @@ export class RealisticTextToSpeech implements TextToSpeech {
     }
 
     // transcode the audio file to mp3
-    if (outputStream) {
-      streamWavToMp3(inputStream, outputStream as PassThrough)
-    }
+    streamWavToMp3(inputStream, outputStream)
 
     return {
+      audioData,
       speechMarks: [],
     }
   }