Add function to parse HTML to SSML

This commit is contained in:
Hongbo Wu
2022-08-16 22:15:44 +08:00
parent def8f28138
commit 447e413605
4 changed files with 101 additions and 103 deletions

View File

@ -327,14 +327,12 @@ export const enqueueSyncWithIntegration = async (
export const enqueueTextToSpeech = async (
userId: string,
pageId: string,
text: string
pageId: string
): Promise<string> => {
const { GOOGLE_CLOUD_PROJECT } = process.env
const payload = {
userId,
pageId,
text,
}
// If there is no Google Cloud Project Id exposed, it means that we are in local environment

View File

@ -276,6 +276,48 @@ export const parsePreparedContent = async (
})
article.content = article.dom.outerHTML
}
const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [
'omnivore-highlight-id',
'data-twitter-tweet-id',
'data-instagram-id',
]
// Get the top level element?
const pageNode = article.dom.firstElementChild as HTMLElement
console.log('pageNode: ', pageNode)
const nodesToVisitStack: [HTMLElement] = [pageNode]
const visitedNodeList = []
while (nodesToVisitStack.length > 0) {
const currentNode = nodesToVisitStack.pop()
console.log('currentNode: ', currentNode?.nodeType)
if (
currentNode?.nodeType !== 1 ||
// Avoiding dynamic elements from being counted as anchor-allowed elements
ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES.some((attrib) =>
currentNode.hasAttribute(attrib)
)
) {
continue
}
visitedNodeList.push(currentNode)
;[].slice
.call(currentNode.childNodes)
.reverse()
.forEach(function (node) {
nodesToVisitStack.push(node)
})
}
visitedNodeList.shift()
visitedNodeList.forEach((node, index) => {
// start from index 1, index 0 reserved for anchor unknown.
node.setAttribute('data-omnivore-anchor-idx', (index + 1).toString())
})
console.log('article content:', article.dom.outerHTML)
article.content = article.dom.outerHTML
}
const newWindow = parseHTML('')

View File

@ -10,13 +10,12 @@ import {
SpeechSynthesizer,
} from 'microsoft-cognitiveservices-speech-sdk'
import { env } from '../env'
import { parseHTML } from 'linkedom'
export interface TextToSpeechInput {
id: string
text: string
voice?: string
textType?: 'text' | 'ssml'
engine?: 'standard' | 'neural'
languageCode?: string
}
@ -27,16 +26,14 @@ export interface TextToSpeechOutput {
export interface SpeechMark {
time: number
start: number
length: number
start?: number
length?: number
word: string
type: 'word' | 'bookmark'
}
const logger = buildLogger('app.dispatch')
// // create a new AWS Polly client
// const client = new AWS.Polly()
export const synthesizeTextToSpeech = async (
input: TextToSpeechInput
): Promise<TextToSpeechOutput> => {
@ -69,10 +66,9 @@ export const synthesizeTextToSpeech = async (
// The event synthesis completed signals that the synthesis is completed.
synthesizer.synthesisCompleted = (s, e) => {
logger.info(
'(synthesized) Reason: ' +
ResultReason[e.result.reason] +
' Audio length: ' +
`(synthesized) Reason: ${ResultReason[e.result.reason]} Audio length: ${
e.result.audioData.byteLength
}`
)
}
@ -100,6 +96,20 @@ export const synthesizeTextToSpeech = async (
time: (timeOffset + e.audioOffset) / 10000,
start: characterOffset + e.textOffset,
length: e.wordLength,
type: 'word',
})
}
synthesizer.bookmarkReached = (s, e) => {
logger.info(
`(Bookmark reached), Audio offset: ${
e.audioOffset / 10000
}ms, bookmark text: ${e.text}`
)
speechMarks.push({
word: e.text,
time: (timeOffset + e.audioOffset) / 10000,
type: 'bookmark',
})
}
@ -138,84 +148,39 @@ export const synthesizeTextToSpeech = async (
}
}
// export const createAudio = async (
// input: TextToSpeechInput
// ): Promise<Buffer> => {
// const { text, voice, textType, engine, languageCode } = input
// const params: SynthesizeSpeechInput = {
// OutputFormat: 'ogg_vorbis',
// Text: text,
// TextType: textType || 'text',
// VoiceId: voice || 'Joanna',
// Engine: engine || 'neural',
// LanguageCode: languageCode || 'en-US',
// }
// try {
// const data = await client.synthesizeSpeech(params).promise()
// return data.AudioStream as Buffer
// } catch (error) {
// logger.error('Unable to create audio file', { error })
// throw error
// }
// }
/**
 * Converts parsed article HTML into an SSML document for speech synthesis.
 *
 * Each <p> element carrying a `data-omnivore-anchor-idx` attribute is emitted
 * as a <bookmark mark="data-omnivore-anchor-idx-{id}"/> marker immediately
 * followed by the paragraph's text, so the synthesizer's bookmarkReached
 * event fires right before the paragraph is spoken.
 *
 * @param html - article HTML (already annotated with anchor indexes)
 * @param language - BCP-47 language tag for the <speak> element
 * @param voice - synthesis voice name
 * @param rate - prosody rate, as a percentage
 * @param volume - prosody volume
 * @returns the serialized SSML string
 */
export const htmlToSsml = (
  html: string,
  language = 'en-US',
  voice = 'en-US-JennyNeural',
  rate = 100,
  volume = 100
): string => {
  const document = parseHTML(html).document
  const paragraphs = document.querySelectorAll('p')
  // Build the SSML skeleton: <speak><voice><prosody>…</prosody></voice></speak>
  const ssml = parseHTML('').document
  const speakElement = ssml.createElement('speak')
  speakElement.setAttribute('version', '1.0')
  speakElement.setAttribute('xmlns', 'http://www.w3.org/2001/10/synthesis')
  speakElement.setAttribute('xml:lang', language)
  const voiceElement = ssml.createElement('voice')
  voiceElement.setAttribute('name', voice)
  speakElement.appendChild(voiceElement)
  const prosodyElement = ssml.createElement('prosody')
  prosodyElement.setAttribute('rate', `${rate}%`)
  prosodyElement.setAttribute('volume', volume.toString())
  voiceElement.appendChild(prosodyElement)
  // Add each anchored paragraph to the SSML document.
  paragraphs.forEach((p) => {
    const id = p.getAttribute('data-omnivore-anchor-idx')
    if (id) {
      // SSML <bookmark> is an empty marker element: text nested INSIDE it is
      // not synthesized. Emit the bookmark first, then the paragraph text as
      // a following sibling so the text is actually spoken and the bookmark
      // marks the position just before it.
      const bookMark = ssml.createElement('bookmark')
      bookMark.setAttribute('mark', `data-omnivore-anchor-idx-${id}`)
      prosodyElement.appendChild(bookMark)
      // A text node is escaped on serialization (&, <, >), keeping the SSML
      // well-formed when the paragraph contains markup characters.
      prosodyElement.appendChild(ssml.createTextNode(p.innerText))
    }
  })
  return speakElement.outerHTML
}

View File

@ -11,16 +11,9 @@ describe('textToSpeech', () => {
it('should create an audio file with speech marks', async () => {
const input: TextToSpeechInput = {
id: generateFakeUuid(),
text:
'《太阁立志传5 DX》清洲会议触发教程\n' +
'玩家要亲历清洲会议事件,需要位于织田家。\n' +
'清洲会议需要完成以下条件才能触发:\n' +
'本能寺发生之后,织田信长和织田信忠死亡。\n' +
'羽柴秀吉、柴田胜家、织田信雄、织田信孝为大名。\n' +
'清洲城必须为信雄的直辖城,或者清洲城主为信雄一方。\n' +
'前两个条件都很容易达成,主要是要保证清洲城主为信雄这一条件比较难办,需要玩家控制城主封地。',
languageCode: 'zh-CN',
voice: 'zh-CN-XiaochenNeural',
text: 'Marry had a little lamb',
languageCode: 'en-US',
voice: 'en-US-JennyNeural',
}
const output = await synthesizeTextToSpeech(input)
expect(output.audioUrl).to.be.a('string')