From e086a14ac22f02635828819e66efd15a3de91b7b Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 15 Aug 2022 21:58:31 +0800 Subject: [PATCH] Convert time to ms --- packages/api/src/utils/textToSpeech.ts | 17 ++++++++++------- packages/api/test/utils/textToSpeech.test.ts | 1 + 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/packages/api/src/utils/textToSpeech.ts b/packages/api/src/utils/textToSpeech.ts index 9a1a3ad1a..b3de66131 100644 --- a/packages/api/src/utils/textToSpeech.ts +++ b/packages/api/src/utils/textToSpeech.ts @@ -50,9 +50,10 @@ export const synthesizeTextToSpeech = async ( // Create the speech synthesizer. const synthesizer = new sdk.SpeechSynthesizer(speechConfig) const speechMarks: SpeechMark[] = [] + let timeOffset = 0 + let characterOffset = 0 synthesizer.synthesizing = function (s, e) { - logger.debug(`synthesizing ${e.result.audioData.byteLength} bytes`) // convert arrayBuffer to stream and write to gcs file writeStream.write(Buffer.from(e.result.audioData)) } @@ -84,11 +85,12 @@ export const synthesizeTextToSpeech = async ( logger.info(str) } + // The unit of e.audioOffset is ticks (1 tick = 100 nanoseconds); divide by 10,000 to convert to milliseconds.
synthesizer.wordBoundary = function (s, e) { speechMarks.push({ word: e.text, - time: e.audioOffset, - start: e.textOffset, + time: (timeOffset + e.audioOffset) / 10000, + start: characterOffset + e.textOffset, length: e.wordLength, }) } @@ -110,16 +112,17 @@ export const synthesizeTextToSpeech = async ( }) } // slice the text into chunks of 1,000 characters - const textChunks = input.text.match(/.{1,1000}/g) || [] + const textChunks = input.text.match(/(.|[\r\n]){1,1000}/g) || [] for (const textChunk of textChunks) { - console.debug(`synthesizing ${textChunk}`) - await speakTextAsyncPromise(textChunk) + logger.debug(`synthesizing ${textChunk}`) + const result = await speakTextAsyncPromise(textChunk) + timeOffset = timeOffset + result.audioDuration + characterOffset = characterOffset + textChunk.length } writeStream.end() synthesizer.close() logger.debug(`audio file: ${audioFile}`) - logger.debug(`speechMarks: ${speechMarks}`) return { audioUrl: getFilePublicUrl(audioFile), diff --git a/packages/api/test/utils/textToSpeech.test.ts b/packages/api/test/utils/textToSpeech.test.ts index 4909192a5..fbc685a4e 100644 --- a/packages/api/test/utils/textToSpeech.test.ts +++ b/packages/api/test/utils/textToSpeech.test.ts @@ -81,6 +81,7 @@ describe('textToSpeech', () => { const output = await synthesizeTextToSpeech(input) expect(output.audioUrl).to.be.a('string') expect(output.speechMarks).to.be.a('array') + console.log(output.speechMarks) }) }) })