map realistic voice to openai voice in tts-handler

2024-07-04 16:24:35 +08:00
parent 8c2a59eb1f
commit a841f5748e
3 changed files with 74 additions and 117 deletions
--- a/packages/text-to-speech/src/index.ts
+++ b/packages/text-to-speech/src/index.ts
@ -13,7 +13,6 @@ import * as jwt from 'jsonwebtoken'
 import { AzureTextToSpeech } from './azureTextToSpeech'
 import { endSsml, htmlToSpeechFile, startSsml } from './htmlToSsml'
 import { OpenAITextToSpeech } from './openaiTextToSpeech'
-import { RealisticTextToSpeech } from './realisticTextToSpeech'
 import {
  SpeechMark,
  TextToSpeechInput,
@ -59,11 +58,7 @@ Sentry.GCPFunction.init({
 const MAX_CHARACTER_COUNT = 50000
 const storage = new Storage()

-const textToSpeechHandlers = [
-  new OpenAITextToSpeech(),
-  new AzureTextToSpeech(),
-  new RealisticTextToSpeech(),
-]
+const textToSpeechHandlers = [new OpenAITextToSpeech(), new AzureTextToSpeech()]

 const synthesizeTextToSpeech = async (
  input: TextToSpeechInput
--- a/packages/text-to-speech/src/openaiTextToSpeech.ts
+++ b/packages/text-to-speech/src/openaiTextToSpeech.ts
@ -1,19 +1,82 @@
+import axios from 'axios'
+import { stripEmojis } from './htmlToSsml'
 import {
  TextToSpeech,
  TextToSpeechInput,
  TextToSpeechOutput,
 } from './textToSpeech'
-import axios from 'axios'
-import { stripEmojis } from './htmlToSsml'

-const OPEN_AI_VOICES = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']
+const OPENAI_VOICE_PREFIX = 'openai-'
+
+// Legacy realistic voice name -> OpenAI voice that now serves it.
+// Keyed by the realistic *name* because that is what the removed realistic
+// handler matched `input.voice` against. A Map (not an object literal) is
+// used so names such as `toString` cannot resolve through a prototype.
+const REALISTIC_TO_OPENAI_VOICE: ReadonlyMap<string, string> = new Map([
+  ['Antoni', 'echo'],
+  ['Serena', 'alloy'],
+  ['Daniel', 'onyx'],
+  ['Dorothy', 'fable'],
+  ['Michael', 'onyx'],
+  ['Matilda', 'shimmer'],
+  ['Rachel', 'nova'],
+  ['Bella', 'alloy'],
+  ['Elli', 'shimmer'],
+  ['Josh', 'echo'],
+  ['Arnold', 'nova'],
+  ['Adam', 'fable'],
+])
+
+/**
+ * Resolves the voice name carried by a text-to-speech request to an OpenAI
+ * voice id.
+ *
+ * - `openai-<voice>` names are returned with the prefix stripped.
+ * - Legacy realistic voice names (e.g. `Antoni`) are mapped to the OpenAI
+ *   voice that replaces them.
+ *
+ * @param name voice name from the text-to-speech input
+ * @returns the OpenAI voice id, or `undefined` when `name` is missing, empty,
+ *   or not a known voice
+ */
+const getVoiceId = (name: string | undefined): string | undefined => {
+  if (!name) {
+    return undefined
+  }
+
+  if (name.startsWith(OPENAI_VOICE_PREFIX)) {
+    return name.substring(OPENAI_VOICE_PREFIX.length)
+  }
+
+  // Anything else is treated as a legacy realistic voice name; the result is
+  // always an OpenAI voice, never the old provider's voice id.
+  return REALISTIC_TO_OPENAI_VOICE.get(name)
+}

 export class OpenAITextToSpeech implements TextToSpeech {
  synthesizeTextToSpeech = async (
    input: TextToSpeechInput
  ): Promise<TextToSpeechOutput> => {
    const apiKey = process.env.OPENAI_API_KEY
-    const voice = input.voice?.substring('openai-'.length)
+    const voice = getVoiceId(input.voice)

    if (!apiKey) {
      throw new Error('API credentials not set')
@ -51,6 +114,12 @@ export class OpenAITextToSpeech implements TextToSpeech {
    if (input.voice?.startsWith('openai-')) {
      return true
    }
+
+    // Use OpenAI voice for ultra realistic voice
+    if (input.isUltraRealisticVoice) {
+      return true
+    }
+
    return false
  }
 }
--- a/packages/text-to-speech/src/realisticTextToSpeech.ts
+++ b/packages/text-to-speech/src/realisticTextToSpeech.ts
@ -1,107 +0,0 @@
-import {
-  TextToSpeech,
-  TextToSpeechInput,
-  TextToSpeechOutput,
-} from './textToSpeech'
-import axios from 'axios'
-import { stripEmojis } from './htmlToSsml'
-
-const getRealisticVoiceId = (name: string | undefined) => {
-  const voiceList = [
-    {
-      voiceId: 'ErXwobaYiN019PkySvjV',
-      name: 'Antoni',
-    },
-    {
-      voiceId: 'pMsXgVXv3BLzUgSXRplE',
-      name: 'Serena',
-    },
-    {
-      voiceId: 'onwK4e9ZLuTAKqWW03F9',
-      name: 'Daniel',
-    },
-    {
-      voiceId: 'ThT5KcBeYPX3keUQqHPh',
-      name: 'Dorothy',
-    },
-    {
-      voiceId: 'flq6f7yk4E4fJM5XTYuZ',
-      name: 'Michael',
-    },
-    {
-      voiceId: 'XrExE9yKIg1WjnnlVkGX',
-      name: 'Matilda',
-    },
-    {
-      voiceId: '21m00Tcm4TlvDq8ikWAM',
-      name: 'Rachel',
-    },
-    {
-      voiceId: 'EXAVITQu4vr4xnSDxMaL',
-      name: 'Bella',
-    },
-    {
-      voiceId: 'MF3mGyEYCl7XYWbV9V6O',
-      name: 'Elli',
-    },
-    {
-      voiceId: 'TxGEqnHWrfWFTfGW9XjX',
-      name: 'Josh',
-    },
-    {
-      voiceId: 'VR6AewLTigWG4xSOukaG',
-      name: 'Arnold',
-    },
-    {
-      voiceId: 'pNInz6obpgDQGcFmaJgB',
-      name: 'Adam',
-    },
-  ]
-  return voiceList.find((voice) => voice.name === name)?.voiceId
-}
-
-export class RealisticTextToSpeech implements TextToSpeech {
-  synthesizeTextToSpeech = async (
-    input: TextToSpeechInput
-  ): Promise<TextToSpeechOutput> => {
-    const voiceId = getRealisticVoiceId(input.voice)
-    const apiKey = process.env.REALISTIC_VOICE_API_KEY
-    const apiEndpoint = process.env.REALISTIC_VOICE_API_ENDPOINT
-
-    if (!apiEndpoint || !apiKey || !voiceId) {
-      throw new Error('API credentials not set')
-    }
-
-    const HEADERS = {
-      'xi-api-key': apiKey,
-      voice_id: voiceId,
-      'Content-Type': 'application/json',
-    }
-
-    const requestUrl = `${apiEndpoint}${voiceId}`
-    const response = await axios.post<Buffer>(
-      requestUrl,
-      {
-        text: stripEmojis(input.text),
-      },
-      {
-        headers: HEADERS,
-        responseType: 'arraybuffer',
-      }
-    )
-
-    if (response.data.length === 0) {
-      console.log('No payload returned: ', response)
-      throw new Error('No payload returned')
-    }
-
-    return {
-      speechMarks: [],
-      audioData: response.data,
-    }
-  }
-
-  use(input: TextToSpeechInput): boolean {
-    return !!input.isUltraRealisticVoice
-  }
-}