From e086a14ac22f02635828819e66efd15a3de91b7b Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 15 Aug 2022 21:58:31 +0800 Subject: [PATCH] Convert time to ms --- packages/api/src/utils/textToSpeech.ts | 17 ++++++++++------- packages/api/test/utils/textToSpeech.test.ts | 1 + 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/packages/api/src/utils/textToSpeech.ts b/packages/api/src/utils/textToSpeech.ts index 9a1a3ad1a..b3de66131 100644 --- a/packages/api/src/utils/textToSpeech.ts +++ b/packages/api/src/utils/textToSpeech.ts @@ -50,9 +50,10 @@ export const synthesizeTextToSpeech = async ( // Create the speech synthesizer. const synthesizer = new sdk.SpeechSynthesizer(speechConfig) const speechMarks: SpeechMark[] = [] + let timeOffset = 0 + let characterOffset = 0 synthesizer.synthesizing = function (s, e) { - logger.debug(`synthesizing ${e.result.audioData.byteLength} bytes`) // convert arrayBuffer to stream and write to gcs file writeStream.write(Buffer.from(e.result.audioData)) } @@ -84,11 +85,12 @@ export const synthesizeTextToSpeech = async ( logger.info(str) } + // The unit of e.audioOffset is ticks (1 tick = 100 nanoseconds); divide by 10,000 to convert to milliseconds.
synthesizer.wordBoundary = function (s, e) { speechMarks.push({ word: e.text, - time: e.audioOffset, - start: e.textOffset, + time: (timeOffset + e.audioOffset) / 10000, + start: characterOffset + e.textOffset, length: e.wordLength, }) } @@ -110,16 +112,17 @@ export const synthesizeTextToSpeech = async ( }) } // slice the text into chunks of 1,000 characters - const textChunks = input.text.match(/.{1,1000}/g) || [] + const textChunks = input.text.match(/(.|[\r\n]){1,1000}/g) || [] for (const textChunk of textChunks) { - console.debug(`synthesizing ${textChunk}`) - await speakTextAsyncPromise(textChunk) + logger.debug(`synthesizing ${textChunk}`) + const result = await speakTextAsyncPromise(textChunk) + timeOffset = timeOffset + result.audioDuration + characterOffset = characterOffset + textChunk.length } writeStream.end() synthesizer.close() logger.debug(`audio file: ${audioFile}`) - logger.debug(`speechMarks: ${speechMarks}`) return { audioUrl: getFilePublicUrl(audioFile), diff --git a/packages/api/test/utils/textToSpeech.test.ts b/packages/api/test/utils/textToSpeech.test.ts index 4909192a5..fbc685a4e 100644 --- a/packages/api/test/utils/textToSpeech.test.ts +++ b/packages/api/test/utils/textToSpeech.test.ts @@ -81,6 +81,7 @@ describe('textToSpeech', () => { const output = await synthesizeTextToSpeech(input) expect(output.audioUrl).to.be.a('string') expect(output.speechMarks).to.be.a('array') + console.log(output.speechMarks) }) }) })