Convert HTML to utterances

2022-11-07 19:30:20 +08:00
parent abc9dd49be
commit 2e47b0879c
3 changed files with 37 additions and 11 deletions
--- a/packages/api/src/utils/createTask.ts
+++ b/packages/api/src/utils/createTask.ts
@ -344,6 +344,7 @@ export const enqueueTextToSpeech = async ({
  bucket = env.fileUpload.gcsUploadBucket,
  queue = 'omnivore-demo-text-to-speech-queue',
  location = env.gcp.location,
+  isUltraRealisticVoice = false,
 }: {
  userId: string
  speechId: string
@ -354,6 +355,7 @@ export const enqueueTextToSpeech = async ({
  textType?: 'text' | 'ssml'
  queue?: string
  location?: string
+  isUltraRealisticVoice?: boolean
 }): Promise<string> => {
  const { GOOGLE_CLOUD_PROJECT } = process.env
  const payload = {
@ -362,6 +364,7 @@ export const enqueueTextToSpeech = async ({
    voice,
    bucket,
    textType,
+    isUltraRealisticVoice,
  }
  // eslint-disable-next-line @typescript-eslint/ban-ts-comment
  // @ts-ignore
--- a/packages/text-to-speech/src/index.ts
+++ b/packages/text-to-speech/src/index.ts
@ -167,6 +167,8 @@ export const textToSpeechHandler = Sentry.GCPFunction.wrapHttpFunction(
      }) as NodeJS.WriteStream
      // synthesize text to speech
      const startTime = Date.now()
+      // temporary solution to use realistic text to speech
+      input.isUltraRealisticVoice = true
      const { speechMarks } = await synthesizeTextToSpeech({
        ...input,
        textType: 'html',
--- a/packages/text-to-speech/src/realisticTextToSpeech.ts
+++ b/packages/text-to-speech/src/realisticTextToSpeech.ts
@ -7,6 +7,7 @@ import axios from 'axios'
 import ffmpegPath from '@ffmpeg-installer/ffmpeg'
 import ffmpeg from 'fluent-ffmpeg'
 import { PassThrough } from 'stream'
+import { htmlToSpeechFile } from './htmlToSsml'

 ffmpeg.setFfmpegPath(ffmpegPath.path)

@ -15,15 +16,24 @@ interface PlayHtConvertResponse {
  payload: string[]
 }

-const streamWavToMp3 = (inputStream: PassThrough, outputSteam: PassThrough) => {
+const streamWavToMp3 = (
+  inputStream: PassThrough,
+  outputStream: PassThrough
+) => {
  ffmpeg(inputStream)
+    .inputFormat('wav')
+    .format('mp3')
+    .audioBitrate('32k')
+    .audioChannels(2)
+    .audioCodec('libmp3lame')
    .on('error', (err) => {
      throw err
    })
    .on('end', () => {
-      outputSteam.end()
+      console.debug('transcoding finished')
+      outputStream.end()
    })
-    .pipe(outputSteam, { end: true })
+    .pipe(outputStream, { end: true })
 }

 export class RealisticTextToSpeech implements TextToSpeech {
@ -38,7 +48,7 @@ export class RealisticTextToSpeech implements TextToSpeech {
    }

    const inputStream = new PassThrough()
-    const outputStream = input.audioStream
+    const outputStream = input.audioStream as PassThrough

    const HEADERS = {
      Authorization: apiKey,
@ -46,9 +56,19 @@ export class RealisticTextToSpeech implements TextToSpeech {
      'Content-Type': 'application/json',
    }

+    const speechFile = htmlToSpeechFile({
+      title: '',
+      content: input.text,
+      options: {
+        primaryVoice: input.voice,
+        secondaryVoice: input.secondaryVoice,
+        language: input.language,
+      },
+    })
+    const content = speechFile.utterances.map((u) => u.text)
    const data = {
      voice: input.voice,
-      content: [input.text],
+      content,
    }

    // get the download url first
@ -70,7 +90,8 @@ export class RealisticTextToSpeech implements TextToSpeech {
    // timeout after 1 hour
    const timeout = 60 * 60 * 1000
    const startTime = Date.now()
-    while (true) {
+    let audioData: Buffer | undefined
+    while (!audioData) {
      if (Date.now() - startTime > timeout) {
        throw new Error('Timeout when polling the download url')
      }
@ -85,8 +106,9 @@ export class RealisticTextToSpeech implements TextToSpeech {
        })

        // write the audio file to the input stream
-        inputStream.end(downloadResponse.data)
-        break
+        // eslint-disable-next-line @typescript-eslint/no-unsafe-argument
+        audioData = Buffer.from(downloadResponse.data, 'binary')
+        inputStream.end(audioData)
      } catch (e) {
        // ignore error
        console.debug('checking status of audio file', downloadUrl)
@ -94,11 +116,10 @@ export class RealisticTextToSpeech implements TextToSpeech {
    }

    // transcode the audio file to mp3
-    if (outputStream) {
-      streamWavToMp3(inputStream, outputStream as PassThrough)
-    }
+    streamWavToMp3(inputStream, outputStream)

    return {
+      audioData,
      speechMarks: [],
    }
  }