diff --git a/packages/api/src/utils/createTask.ts b/packages/api/src/utils/createTask.ts
index 41174d891..c69d5a986 100644
--- a/packages/api/src/utils/createTask.ts
+++ b/packages/api/src/utils/createTask.ts
@@ -327,14 +327,12 @@ export const enqueueSyncWithIntegration = async (
 
 export const enqueueTextToSpeech = async (
   userId: string,
-  pageId: string,
-  text: string
+  pageId: string
 ): Promise => {
   const { GOOGLE_CLOUD_PROJECT } = process.env
   const payload = {
     userId,
     pageId,
-    text,
   }
 
   // If there is no Google Cloud Project Id exposed, it means that we are in local environment
diff --git a/packages/api/src/utils/parser.ts b/packages/api/src/utils/parser.ts
index 8900fc5d3..2e9997e58 100644
--- a/packages/api/src/utils/parser.ts
+++ b/packages/api/src/utils/parser.ts
@@ -276,6 +276,48 @@ export const parsePreparedContent = async (
     })
     article.content = article.dom.outerHTML
   }
+
+  const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [
+    'omnivore-highlight-id',
+    'data-twitter-tweet-id',
+    'data-instagram-id',
+  ]
+
+  // Get the top level element?
+  const pageNode = article.dom.firstElementChild as HTMLElement
+  console.log('pageNode: ', pageNode)
+  const nodesToVisitStack: [HTMLElement] = [pageNode]
+  const visitedNodeList = []
+
+  while (nodesToVisitStack.length > 0) {
+    const currentNode = nodesToVisitStack.pop()
+    console.log('currentNode: ', currentNode?.nodeType)
+    if (
+      currentNode?.nodeType !== 1 ||
+      // Avoiding dynamic elements from being counted as anchor-allowed elements
+      ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES.some((attrib) =>
+        currentNode.hasAttribute(attrib)
+      )
+    ) {
+      continue
+    }
+    visitedNodeList.push(currentNode)
+    ;[].slice
+      .call(currentNode.childNodes)
+      .reverse()
+      .forEach(function (node) {
+        nodesToVisitStack.push(node)
+      })
+  }
+
+  visitedNodeList.shift()
+  visitedNodeList.forEach((node, index) => {
+    // start from index 1, index 0 reserved for anchor unknown.
+    node.setAttribute('data-omnivore-anchor-idx', (index + 1).toString())
+  })
+
+  console.log('article content:', article.dom.outerHTML)
+  article.content = article.dom.outerHTML
 }
 
 const newWindow = parseHTML('')
diff --git a/packages/api/src/utils/textToSpeech.ts b/packages/api/src/utils/textToSpeech.ts
index a4daa9743..53aefe881 100644
--- a/packages/api/src/utils/textToSpeech.ts
+++ b/packages/api/src/utils/textToSpeech.ts
@@ -10,13 +10,12 @@ import {
   SpeechSynthesizer,
 } from 'microsoft-cognitiveservices-speech-sdk'
 import { env } from '../env'
+import { parseHTML } from 'linkedom'
 
 export interface TextToSpeechInput {
   id: string
   text: string
   voice?: string
-  textType?: 'text' | 'ssml'
-  engine?: 'standard' | 'neural'
   languageCode?: string
 }
 
@@ -27,16 +26,14 @@ export interface TextToSpeechOutput {
 
 export interface SpeechMark {
   time: number
-  start: number
-  length: number
+  start?: number
+  length?: number
   word: string
+  type: 'word' | 'bookmark'
 }
 
 const logger = buildLogger('app.dispatch')
 
-// // create a new AWS Polly client
-// const client = new AWS.Polly()
-
 export const synthesizeTextToSpeech = async (
   input: TextToSpeechInput
 ): Promise => {
@@ -69,10 +66,9 @@ export const synthesizeTextToSpeech = async (
   // The event synthesis completed signals that the synthesis is completed.
   synthesizer.synthesisCompleted = (s, e) => {
     logger.info(
-      '(synthesized) Reason: ' +
-        ResultReason[e.result.reason] +
-        ' Audio length: ' +
+      `(synthesized) Reason: ${ResultReason[e.result.reason]} Audio length: ${
         e.result.audioData.byteLength
+      }`
     )
   }
 
@@ -100,6 +96,20 @@ export const synthesizeTextToSpeech = async (
       time: (timeOffset + e.audioOffset) / 10000,
       start: characterOffset + e.textOffset,
       length: e.wordLength,
+      type: 'word',
+    })
+  }
+
+  synthesizer.bookmarkReached = (s, e) => {
+    logger.info(
+      `(Bookmark reached), Audio offset: ${
+        e.audioOffset / 10000
+      }ms, bookmark text: ${e.text}`
+    )
+    speechMarks.push({
+      word: e.text,
+      time: (timeOffset + e.audioOffset) / 10000,
+      type: 'bookmark',
     })
   }
 
@@ -138,84 +148,39 @@ export const synthesizeTextToSpeech = async (
   }
 }
 
-// export const createAudio = async (
-//   input: TextToSpeechInput
-// ): Promise => {
-//   const { text, voice, textType, engine, languageCode } = input
-//   const params: SynthesizeSpeechInput = {
-//     OutputFormat: 'ogg_vorbis',
-//     Text: text,
-//     TextType: textType || 'text',
-//     VoiceId: voice || 'Joanna',
-//     Engine: engine || 'neural',
-//     LanguageCode: languageCode || 'en-US',
-//   }
-//   try {
-//     const data = await client.synthesizeSpeech(params).promise()
-//     return data.AudioStream as Buffer
-//   } catch (error) {
-//     logger.error('Unable to create audio file', { error })
-//     throw error
-//   }
-// }
+export const htmlToSsml = (
+  html: string,
+  language = 'en-US',
+  voice = 'en-US-JennyNeural',
+  rate = 100,
+  volume = 100
+): string => {
+  const document = parseHTML(html).document
+  const paragraphs = document.querySelectorAll('p')
+  // create new ssml document
+  const ssml = parseHTML('').document
+  const speakElement = ssml.createElement('speak')
+  speakElement.setAttribute('version', '1.0')
+  speakElement.setAttribute('xmlns', 'http://www.w3.org/2001/10/synthesis')
+  speakElement.setAttribute('xml:lang', language)
+  const voiceElement = ssml.createElement('voice')
+  voiceElement.setAttribute('name', voice)
+  speakElement.appendChild(voiceElement)
+  const prosodyElement = ssml.createElement('prosody')
+  prosodyElement.setAttribute('rate', `${rate}%`)
+  prosodyElement.setAttribute('volume', volume.toString())
+  voiceElement.appendChild(prosodyElement)
+  // add each paragraph to the ssml document
+  paragraphs.forEach((p) => {
+    const id = p.getAttribute('data-omnivore-anchor-idx')
+    if (id) {
+      const text = p.innerText
+      const bookMark = ssml.createElement('bookmark')
+      bookMark.setAttribute('mark', `data-omnivore-anchor-idx-${id}`)
+      bookMark.innerText = text
+      prosodyElement.appendChild(bookMark)
+    }
+  })
-
-// export const createSpeechMarks = async (
-//   input: TextToSpeechInput
-// ): Promise => {
-//   const { text, voice, textType, engine, languageCode } = input
-//   const params: SynthesizeSpeechInput = {
-//     OutputFormat: 'json',
-//     Text: text,
-//     TextType: textType || 'text',
-//     VoiceId: voice || 'Joanna',
-//     Engine: engine || 'neural',
-//     SpeechMarkTypes: ['word'],
-//     LanguageCode: languageCode || 'en-US',
-//   }
-//   try {
-//     const data = await client.synthesizeSpeech(params).promise()
-//     return (data.AudioStream as Buffer).toString()
-//   } catch (error) {
-//     logger.error('Unable to create speech marks', { error })
-//     throw error
-//   }
-// }
-//
-// export const createAudioWithSpeechMarks = async (
-//   input: TextToSpeechInput
-// ): Promise => {
-//   try {
-//     const audio = await createAudio(input)
-//     // upload audio to google cloud storage
-//     const filePath = `speech/${input.id}.ogg`
-//
-//     logger.info('start uploading...', { filePath })
-//     await uploadToBucket(filePath, audio, {
-//       contentType: 'audio/ogg',
-//       public: true,
-//     })
-//
-//     // get public url for audio file
-//     const publicUrl = getFilePublicUrl(filePath)
-//     logger.info('upload complete', { publicUrl })
-//
-//     const speechMarks = await createSpeechMarks(input)
-//     return {
-//       audioUrl: publicUrl,
-//       speechMarks,
-//     }
-//   } catch (error) {
-//     logger.error('Unable to create audio with speech marks', error)
-//     throw error
-//   }
-// }
-
-// export const htmlToSsml = (
-//   html: string,
-//   language = 'en-US',
-//   voice = 'en-US-JennyNeural',
-//   rate = 100,
-//   volume = 100
-// ): string => {
-//   return `${html}`
-// }
+
+  return speakElement.outerHTML
+}
diff --git a/packages/api/test/utils/textToSpeech.test.ts b/packages/api/test/utils/textToSpeech.test.ts
index 48a65abe0..89d8f5edf 100644
--- a/packages/api/test/utils/textToSpeech.test.ts
+++ b/packages/api/test/utils/textToSpeech.test.ts
@@ -11,16 +11,9 @@ describe('textToSpeech', () => {
   it('should create an audio file with speech marks', async () => {
     const input: TextToSpeechInput = {
       id: generateFakeUuid(),
-      text:
-        '《太阁立志传5 DX》清洲会议触发教程\n' +
-        '玩家要亲历清洲会议事件,需要位于织田家。\n' +
-        '清洲会议需要完成以下条件才能触发:\n' +
-        '本能寺发生之后,织田信长和织田信忠死亡。\n' +
-        '羽柴秀吉、柴田胜家、织田信雄、织田信孝为大名。\n' +
-        '清洲城必须为信雄的直辖城,或者清洲城主为信雄一方。\n' +
-        '前两个条件都很容易达成,主要是要保证清洲城主为信雄这一条件比较难办,需要玩家控制城主封地。',
-      languageCode: 'zh-CN',
-      voice: 'zh-CN-XiaochenNeural',
+      text: 'Marry had a little lamb',
+      languageCode: 'en-US',
+      voice: 'en-US-JennyNeural',
     }
     const output = await synthesizeTextToSpeech(input)
     expect(output.audioUrl).to.be.a('string')