/** Prefix marking a requested voice as a native OpenAI voice, e.g. `openai-alloy`. */
const OPENAI_VOICE_PREFIX = 'openai-'

/**
 * Legacy "ultra realistic" voice names mapped to the OpenAI voice that now
 * serves them. Each pairing follows the replacement chosen for that voice's
 * former provider ID, so existing user voice preferences keep working.
 *
 * A Map (rather than a plain object) is used so that names such as
 * `toString` or `constructor` cannot accidentally match inherited keys.
 */
const REALISTIC_TO_OPENAI_VOICE: ReadonlyMap<string, string> = new Map([
  ['Antoni', 'echo'],
  ['Serena', 'alloy'],
  ['Daniel', 'onyx'],
  ['Dorothy', 'fable'],
  ['Michael', 'onyx'],
  ['Matilda', 'shimmer'],
  ['Rachel', 'nova'],
  ['Bella', 'alloy'],
  ['Elli', 'shimmer'],
  ['Josh', 'echo'],
  ['Arnold', 'nova'],
  ['Adam', 'fable'],
])

/**
 * Resolves the voice requested by a client to an OpenAI voice id.
 *
 * @param name - The requested voice: either `openai-<voice>` or the name of a
 *   legacy realistic voice (e.g. `Antoni`).
 * @returns The OpenAI voice id, or `undefined` when `name` is empty/missing
 *   or is not a known voice.
 */
const getVoiceId = (name: string | undefined): string | undefined => {
  if (!name) {
    return undefined
  }

  // Native OpenAI voices are passed through with the prefix stripped.
  if (name.startsWith(OPENAI_VOICE_PREFIX)) {
    return name.substring(OPENAI_VOICE_PREFIX.length)
  }

  // Map a legacy realistic voice name to its OpenAI replacement. The lookup
  // key is the legacy name and the result is the OpenAI voice; the previous
  // implementation compared against OpenAI names and returned the old
  // provider's opaque voice ID, so legacy names never resolved.
  return REALISTIC_TO_OPENAI_VOICE.get(name)
}
voiceId: 'flq6f7yk4E4fJM5XTYuZ', - name: 'Michael', - }, - { - voiceId: 'XrExE9yKIg1WjnnlVkGX', - name: 'Matilda', - }, - { - voiceId: '21m00Tcm4TlvDq8ikWAM', - name: 'Rachel', - }, - { - voiceId: 'EXAVITQu4vr4xnSDxMaL', - name: 'Bella', - }, - { - voiceId: 'MF3mGyEYCl7XYWbV9V6O', - name: 'Elli', - }, - { - voiceId: 'TxGEqnHWrfWFTfGW9XjX', - name: 'Josh', - }, - { - voiceId: 'VR6AewLTigWG4xSOukaG', - name: 'Arnold', - }, - { - voiceId: 'pNInz6obpgDQGcFmaJgB', - name: 'Adam', - }, - ] - return voiceList.find((voice) => voice.name === name)?.voiceId -} - -export class RealisticTextToSpeech implements TextToSpeech { - synthesizeTextToSpeech = async ( - input: TextToSpeechInput - ): Promise => { - const voiceId = getRealisticVoiceId(input.voice) - const apiKey = process.env.REALISTIC_VOICE_API_KEY - const apiEndpoint = process.env.REALISTIC_VOICE_API_ENDPOINT - - if (!apiEndpoint || !apiKey || !voiceId) { - throw new Error('API credentials not set') - } - - const HEADERS = { - 'xi-api-key': apiKey, - voice_id: voiceId, - 'Content-Type': 'application/json', - } - - const requestUrl = `${apiEndpoint}${voiceId}` - const response = await axios.post( - requestUrl, - { - text: stripEmojis(input.text), - }, - { - headers: HEADERS, - responseType: 'arraybuffer', - } - ) - - if (response.data.length === 0) { - console.log('No payload returned: ', response) - throw new Error('No payload returned') - } - - return { - speechMarks: [], - audioData: response.data, - } - } - - use(input: TextToSpeechInput): boolean { - return !!input.isUltraRealisticVoice - } -}