From 8c8734d15327fc47bcc2ddfe219398a0a121ecb3 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 29 Aug 2022 22:09:09 +0800 Subject: [PATCH 1/5] Re-enable auto synthesis in the backend --- packages/api/src/services/speech.ts | 61 ++++++++++++++--------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/packages/api/src/services/speech.ts b/packages/api/src/services/speech.ts index f127fa39e..8d390c374 100644 --- a/packages/api/src/services/speech.ts +++ b/packages/api/src/services/speech.ts @@ -21,39 +21,38 @@ export const shouldSynthesize = async ( userId: string, page: Page ): Promise => { - return Promise.resolve(false) - // if (page.pageType === PageType.File || !page.content) { - // // we don't synthesize files for now - // return false - // } + if (page.pageType === PageType.File || !page.content) { + // we don't synthesize files for now + return false + } - // if (process.env.TEXT_TO_SPEECH_BETA_TEST) { - // return true - // } + if (process.env.TEXT_TO_SPEECH_BETA_TEST) { + return true + } - // const [recentListenedPage, count] = (await searchPages( - // { - // dateFilters: [ - // { - // field: 'listenedAt', - // startDate: new Date(Date.now() - 30 * 24 * 60 * 60 * 1000), - // }, - // ], - // sort: { - // by: SortBy.LISTENED, - // order: SortOrder.DESCENDING, - // }, - // size: 1, - // }, - // userId - // )) || [[], 0] - // if (count === 0) { - // return false - // } - // return ( - // !!recentListenedPage[0].listenedAt && - // page.savedAt < recentListenedPage[0].listenedAt - // ) + const [recentListenedPage, count] = (await searchPages( + { + dateFilters: [ + { + field: 'listenedAt', + startDate: new Date(Date.now() - 30 * 24 * 60 * 60 * 1000), + }, + ], + sort: { + by: SortBy.LISTENED, + order: SortOrder.DESCENDING, + }, + size: 1, + }, + userId + )) || [[], 0] + if (count === 0) { + return false + } + return ( + !!recentListenedPage[0].listenedAt && + page.savedAt < recentListenedPage[0].listenedAt + ) } export const synthesize = async (page: Page, speech: Speech): Promise => { From d085c86bb60077ef41c46ccd27f283bb3fff3fde Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 29 Aug 2022 22:23:07 +0800 Subject: [PATCH 2/5] Enqueue text to speech tasks --- packages/api/src/routers/text_to_speech.ts | 15 ++++++-- packages/api/src/services/speech.ts | 45 ---------------------- 2 files changed, 12 insertions(+), 48 deletions(-) diff --git a/packages/api/src/routers/text_to_speech.ts b/packages/api/src/routers/text_to_speech.ts index 70e636e0b..dab71ac4e 100644 --- a/packages/api/src/routers/text_to_speech.ts +++ b/packages/api/src/routers/text_to_speech.ts @@ -9,9 +9,10 @@ import { getPageById } from '../elastic/pages' import { Speech, SpeechState } from '../entity/speech' import { buildLogger } from '../utils/logger' import { getClaimsByToken } from '../utils/auth' -import { shouldSynthesize, synthesize } from '../services/speech' +import { shouldSynthesize } from '../services/speech' import { readPushSubscription } from '../datalayer/pubsub' import { AppDataSource } from '../server' +import { enqueueTextToSpeech } from '../utils/createTask' const logger = buildLogger('app.dispatch') @@ -62,8 +63,16 @@ export function textToSpeechRouter() { state: SpeechState.INITIALIZED, voice: 'en-US-JennyNeural', }) - await synthesize(page, speech) - logger.info('page synthesized') + // enqueue a task to convert text to speech + const taskName = await enqueueTextToSpeech({ + userId, + speechId: speech.id, + text: page.content, + voice: speech.voice, + priority: 'low', + }) + logger.info('Start Text to speech task', { taskName }) + return res.status(202).send('Text to speech task started') } res.status(200).send('Page should not synthesize') diff --git a/packages/api/src/services/speech.ts b/packages/api/src/services/speech.ts index 8d390c374..380a14a36 100644 --- a/packages/api/src/services/speech.ts +++ b/packages/api/src/services/speech.ts @@ -1,16 +1,6 @@ -import { getRepository } from '../entity/utils' -import { Speech, SpeechState } from '../entity/speech' import { searchPages } from '../elastic/pages' import { Page, PageType } from '../elastic/types' import { SortBy, SortOrder } from '../utils/search' -import { synthesizeTextToSpeech } from '../utils/textToSpeech' - -export const setSpeechFailure = async (id: string) => { - // update state - await getRepository(Speech).update(id, { - state: SpeechState.FAILED, - }) -} /* * We should not synthesize the page when: @@ -54,38 +44,3 @@ export const shouldSynthesize = async ( page.savedAt < recentListenedPage[0].listenedAt ) } - -export const synthesize = async (page: Page, speech: Speech): Promise => { - try { - if (page.pageType === PageType.File || !page.content) { - // we don't synthesize files for now - return - } - - console.log('Start synthesizing', { pageId: page.id, speechId: speech.id }) - const startTime = Date.now() - const speechOutput = await synthesizeTextToSpeech({ - id: speech.id, - text: page.content, - languageCode: page.language, - voice: speech.voice, - textType: 'ssml', - }) - console.log('Synthesized article', { - audioFileName: speechOutput.audioFileName, - speechMarksFileName: speechOutput.speechMarksFileName, - duration: Date.now() - startTime, - }) - - // set state to completed - await getRepository(Speech).update(speech.id, { - audioFileName: speechOutput.audioFileName, - speechMarksFileName: speechOutput.speechMarksFileName, - state: SpeechState.COMPLETED, - }) - } catch (error) { - console.log('Error synthesize article', error) - await setSpeechFailure(speech.id) - throw error - } -} From 7353c328d974647cd34790a999782f9886cbb8f3 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 29 Aug 2022 22:29:40 +0800 Subject: [PATCH 3/5] Remove unused text-to-speech code --- packages/api/src/utils/textToSpeech.ts | 334 ------------------ .../api/test/utils/data/text-to-speech.html | 1 - packages/api/test/utils/textToSpeech.test.ts | 44 --- 3 files changed, 379 deletions(-) delete mode 100644 packages/api/src/utils/textToSpeech.ts delete mode 100644 packages/api/test/utils/data/text-to-speech.html delete mode 100644 packages/api/test/utils/textToSpeech.test.ts diff --git a/packages/api/src/utils/textToSpeech.ts b/packages/api/src/utils/textToSpeech.ts deleted file mode 100644 index c0327ab63..000000000 --- a/packages/api/src/utils/textToSpeech.ts +++ /dev/null @@ -1,334 +0,0 @@ -import { buildLogger } from './logger' -import { createGCSFile, uploadToBucket } from './uploads' -import { - CancellationDetails, - CancellationReason, - ResultReason, - SpeechConfig, - SpeechSynthesisOutputFormat, - SpeechSynthesisResult, - SpeechSynthesizer, -} from 'microsoft-cognitiveservices-speech-sdk' -import { env } from '../env' -import { parseHTML } from 'linkedom' - -export interface TextToSpeechInput { - id: string - text: string - voice?: string - languageCode?: string - textType?: 'text' | 'ssml' - rate?: number - volume?: number - complimentaryVoice?: string -} - -export interface TextToSpeechOutput { - audioFileName: string - speechMarksFileName: string -} - -export interface SpeechMark { - time: number - start?: number - length?: number - word: string - type: 'word' | 'bookmark' -} - -const logger = buildLogger('app.dispatch') - -export const synthesizeTextToSpeech = async ( - input: TextToSpeechInput -): Promise => { - const audioFileName = `speech/${input.id}.mp3` - const audioFile = createGCSFile(audioFileName) - const writeStream = audioFile.createWriteStream({ - resumable: true, - }) - const speechConfig = SpeechConfig.fromSubscription( - env.azure.speechKey, - env.azure.speechRegion - ) - const textType = input.textType || 'text' - if (textType === 'text') { - speechConfig.speechSynthesisLanguage = input.languageCode || 'en-US' - speechConfig.speechSynthesisVoiceName = input.voice || 'en-US-JennyNeural' - } - speechConfig.speechSynthesisOutputFormat = - SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3 - - // Create the speech synthesizer. - const synthesizer = new SpeechSynthesizer(speechConfig) - const speechMarks: SpeechMark[] = [] - let timeOffset = 0 - let characterOffset = 0 - - synthesizer.synthesizing = function (s, e) { - // convert arrayBuffer to stream and write to gcs file - writeStream.write(Buffer.from(e.result.audioData)) - } - - // The event synthesis completed signals that the synthesis is completed. - synthesizer.synthesisCompleted = (s, e) => { - logger.info( - `(synthesized) Reason: ${ResultReason[e.result.reason]} Audio length: ${ - e.result.audioData.byteLength - }` - ) - } - - // The synthesis started event signals that the synthesis is started. - synthesizer.synthesisStarted = (s, e) => { - logger.info('(synthesis started)') - } - - // The event signals that the service has stopped processing speech. - // This can happen when an error is encountered. - synthesizer.SynthesisCanceled = (s, e) => { - const cancellationDetails = CancellationDetails.fromResult(e.result) - let str = - '(cancel) Reason: ' + CancellationReason[cancellationDetails.reason] - if (cancellationDetails.reason === CancellationReason.Error) { - str += ': ' + e.result.errorDetails - } - logger.info(str) - } - - // The unit of e.audioOffset is tick (1 tick = 100 nanoseconds), divide by 10,000 to convert to milliseconds. - synthesizer.wordBoundary = (s, e) => { - speechMarks.push({ - word: e.text, - time: (timeOffset + e.audioOffset) / 10000, - start: characterOffset + e.textOffset, - length: e.wordLength, - type: 'word', - }) - } - - synthesizer.bookmarkReached = (s, e) => { - logger.debug( - `(Bookmark reached), Audio offset: ${ - e.audioOffset / 10000 - }ms, bookmark text: ${e.text}` - ) - speechMarks.push({ - word: e.text, - time: (timeOffset + e.audioOffset) / 10000, - type: 'bookmark', - }) - } - - const speakTextAsyncPromise = ( - text: string - ): Promise => { - return new Promise((resolve, reject) => { - synthesizer.speakTextAsync( - text, - (result) => { - resolve(result) - }, - (error) => { - reject(error) - } - ) - }) - } - - const speakSsmlAsyncPromise = ( - text: string - ): Promise => { - return new Promise((resolve, reject) => { - synthesizer.speakSsmlAsync( - text, - (result) => { - resolve(result) - }, - (error) => { - reject(error) - } - ) - }) - } - - if (textType === 'text') { - // slice the text into chunks of 5,000 characters - let currentTextChunk = '' - const textChunks = input.text.split('\n') - for (let i = 0; i < textChunks.length; i++) { - currentTextChunk += textChunks[i] + '\n' - if (currentTextChunk.length < 5000 && i < textChunks.length - 1) { - continue - } - logger.debug(`synthesizing ${currentTextChunk}`) - const result = await speakTextAsyncPromise(currentTextChunk) - timeOffset = timeOffset + result.audioDuration - characterOffset = characterOffset + currentTextChunk.length - currentTextChunk = '' - } - } else { - const document = parseHTML(input.text).document - const elements = document.querySelectorAll( - 'h1, h2, h3, p, ul, ol, blockquote' - ) - // convert html elements to the ssml document - for (const e of Array.from(elements)) { - const htmlElement = e as HTMLElement - if (htmlElement.innerText) { - // use complimentary voice for blockquote, hardcoded for now - const voice = - htmlElement.tagName.toLowerCase() === 'blockquote' - ? input.complimentaryVoice || 'en-US-AriaNeural' - : input.voice - const ssml = htmlElementToSsml({ - htmlElement: e, - language: input.languageCode, - rate: input.rate, - volume: input.volume, - voice, - }) - logger.debug(`synthesizing ${ssml}`) - const result = await speakSsmlAsyncPromise(ssml) - // if (result.reason === ResultReason.Canceled) { - // synthesizer.close() - // throw new Error(result.errorDetails) - // } - timeOffset = timeOffset + result.audioDuration - // characterOffset = characterOffset + htmlElement.innerText.length - } - } - } - writeStream.end() - synthesizer.close() - - logger.debug(`audio file: ${audioFileName}`) - - // upload Speech Marks file to GCS - const speechMarksFileName = `speech/${input.id}.json` - await uploadToBucket( - speechMarksFileName, - Buffer.from(JSON.stringify(speechMarks)) - ) - - return { - audioFileName, - speechMarksFileName, - } -} - -export const htmlElementToSsml = ({ - htmlElement, - language = 'en-US', - voice = 'en-US-JennyNeural', - rate = 1, - volume = 100, -}: { - htmlElement: Element - language?: string - voice?: string - rate?: number - volume?: number -}): string => { - const replaceElement = (newElement: Element, oldElement: Element) => { - const id = oldElement.getAttribute('data-omnivore-anchor-idx') - if (id) { - const e = htmlElement.querySelector(`[data-omnivore-anchor-idx="${id}"]`) - e?.parentNode?.replaceChild(newElement, e) - } - } - - const appendBookmarkElement = (parent: Element, element: Element) => { - const id = element.getAttribute('data-omnivore-anchor-idx') - if (id) { - const bookMark = ssml.createElement('bookmark') - bookMark.setAttribute('mark', `data-omnivore-anchor-idx-${id}`) - parent.appendChild(bookMark) - } - } - - const replaceWithEmphasis = (element: Element, level: string) => { - const parent = ssml.createDocumentFragment() as unknown as Element - appendBookmarkElement(parent, element) - const emphasisElement = ssml.createElement('emphasis') - emphasisElement.setAttribute('level', level) - emphasisElement.innerHTML = element.innerHTML.trim() - parent.appendChild(emphasisElement) - replaceElement(parent, element) - } - - const replaceWithSentence = (element: Element) => { - const parent = ssml.createDocumentFragment() as unknown as Element - appendBookmarkElement(parent, element) - const sentenceElement = ssml.createElement('s') - sentenceElement.innerHTML = element.innerHTML.trim() - parent.appendChild(sentenceElement) - replaceElement(parent, element) - } - - // create new ssml document - const ssml = parseHTML('').document - const speakElement = ssml.createElement('speak') - speakElement.setAttribute('version', '1.0') - speakElement.setAttribute('xmlns', 'http://www.w3.org/2001/10/synthesis') - speakElement.setAttribute('xml:lang', language) - const voiceElement = ssml.createElement('voice') - voiceElement.setAttribute('name', voice) - speakElement.appendChild(voiceElement) - const prosodyElement = ssml.createElement('prosody') - prosodyElement.setAttribute('rate', `${rate}`) - prosodyElement.setAttribute('volume', volume.toString()) - voiceElement.appendChild(prosodyElement) - // add each paragraph to the ssml document - appendBookmarkElement(prosodyElement, htmlElement) - // replace emphasis elements with ssml - htmlElement.querySelectorAll('*').forEach((e) => { - switch (e.tagName.toLowerCase()) { - case 's': - replaceWithEmphasis(e, 'moderate') - break - case 'sub': - if (e.getAttribute('alias') === null) { - replaceWithEmphasis(e, 'moderate') - } - break - case 'i': - case 'em': - case 'q': - case 'blockquote': - case 'cite': - case 'del': - case 'strike': - case 'sup': - case 'summary': - case 'caption': - case 'figcaption': - replaceWithEmphasis(e, 'moderate') - break - case 'b': - case 'strong': - case 'dt': - case 'dfn': - case 'u': - case 'mark': - case 'th': - case 'title': - case 'var': - replaceWithEmphasis(e, 'moderate') - break - case 'li': - replaceWithSentence(e) - break - default: { - const parent = ssml.createDocumentFragment() as unknown as Element - appendBookmarkElement(parent, e) - const text = (e as HTMLElement).innerText.trim() - const textElement = ssml.createTextNode(text) - parent.appendChild(textElement) - replaceElement(parent, e) - } - } - }) - prosodyElement.appendChild(htmlElement) - - return speakElement.outerHTML.replace(/ |\n/g, '') -} diff --git a/packages/api/test/utils/data/text-to-speech.html b/packages/api/test/utils/data/text-to-speech.html deleted file mode 100644 index 65245fe69..000000000 --- a/packages/api/test/utils/data/text-to-speech.html +++ /dev/null @@ -1 +0,0 @@ -

An Instinct for Dragons is a book by University of Central Florida anthropologist, David E. Jones, in which he seeks to explain the universality of dragon images in the folklore of human societies. In the introduction, Jones conducts a survey of dragon myths from cultures around the world and argues that certain aspects of dragons or dragon-like mythical creatures are found very widely. He claims that even the Inuit have a reptilian dragon-like monster, even though (living in a frigid environment unsuited for cold-blooded animals) they had never seen an actual reptile.

Jones then argues against the common hypothesis that dragon myths might be motivated by primitive discoveries of dinosaur fossils (he argues that there are widespread traits of dragons in folklore which are not observable from fossils), and claims that the common traits of dragons seem to be an amalgam of the principal predators of our ancestral hominids, which he names as the raptors, great cats (especially leopards) and pythons.

The hypothesis to which Jones conforms is that over millions of years of evolution, members of a species will evolve an instinctive fear of their predators, and he proposes ways in which these fearful images may be merged in artistic or cultural expression to create the dragon image and, perhaps, other kinds of hybrid monster.

Finally he suggests sociological reasons for why such images may be perceived differently at different stages of a culture to try to explain why Chinese dragons are considered basically good and representative of government, but the great majority (although not all) European dragons are evil and often represent chaos.

Reception

Jones' theory was opposed in an article by Paul Jordan-Smith in the Spring 2002 issue of Western Folklore and by other authors. Jordan-Smith criticized the lack of evidence given to prove why dragon myths could not have been passed from culture to culture. He also notes that it cannot be demonstrated that the fears of ancestral hominids are coded into the human brain. He concludes his review by writing "One is tempted to say, as Dorothy Parker once did, that this is a book not to be tossed aside lightly but thrown violently. But no, it is not worth spending even that much energy on."[1]

D. Ogden writes that Jones' ideas "might offer pause for thought given the universality of dragon-slaying narratives". He adds, though, that the compound cat, snake, raptor creature imagined by Jones is mostly the Western stereotype based on mediaeval imagery, and that Jones has sought out similar images in a way that lacks rigor. In particular, Ogden notes that the dragons of Graeco-Roman myth do not fit with Jones's prototype, typically lacking one or more of the hybrid components (with the exception of Typhon, who, however, combines many more animals than Jones's three).[2]

References

  1. ^ Jordan-Smith, Paul (2002). "Review: An Instinct for Dragons". Western Folklore. JSTOR .
  2. ^ Ogden, Daniel (2013). Drakon: Dragon Myth and Serpent Cult in the Greek and Roman Worlds. Oxford University Press. pp. 24–25.
diff --git a/packages/api/test/utils/textToSpeech.test.ts b/packages/api/test/utils/textToSpeech.test.ts deleted file mode 100644 index e78492cca..000000000 --- a/packages/api/test/utils/textToSpeech.test.ts +++ /dev/null @@ -1,44 +0,0 @@ -import 'mocha' -import { - htmlElementToSsml, - synthesizeTextToSpeech, - TextToSpeechInput, -} from '../../src/utils/textToSpeech' -import { expect } from 'chai' -import { generateFakeUuid } from '../util' -import { parseHTML } from 'linkedom' -import fs from 'fs' - -describe('textToSpeech', () => { - const load = (path: string): string => { - return fs.readFileSync(path, 'utf8') - } - - describe('synthesizeTextToSpeech', () => { - xit('should create an audio file with speech marks', async () => { - const html = load('./test/utils/data/text-to-speech.html') - const input: TextToSpeechInput = { - id: generateFakeUuid(), - text: html, - languageCode: 'en-US', - voice: 'en-US-JennyNeural', - textType: 'ssml', - } - const output = await synthesizeTextToSpeech(input) - expect(output.audioFileName).to.be.a('string') - expect(output.speechMarksFileName).to.be.a('string') - }) - }) - - describe('htmlElementToSsml', () => { - it('should convert Html Element to SSML', async () => { - const htmlElement = parseHTML( - `

Marry had a little lamb

` - ).document.documentElement - const ssml = htmlElementToSsml({ htmlElement }) - expect(ssml).to.equal( - `

Marry had a little lamb

` - ) - }) - }) -}) From fe30beafe6baa331cbc89177dd508e38d1807c5d Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 29 Aug 2022 22:59:09 +0800 Subject: [PATCH 4/5] Throw error if synthesis is canceled --- packages/text-to-speech/src/index.ts | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/packages/text-to-speech/src/index.ts b/packages/text-to-speech/src/index.ts index 2ff849856..41964b09d 100644 --- a/packages/text-to-speech/src/index.ts +++ b/packages/text-to-speech/src/index.ts @@ -19,6 +19,7 @@ import axios from 'axios' import * as jwt from 'jsonwebtoken' import * as dotenv from 'dotenv' // see https://github.com/motdotla/dotenv#how-do-i-use-dotenv-with-import import { htmlToSsml, ssmlItemText } from './htmlToSsml' + dotenv.config() interface TextToSpeechInput { @@ -227,10 +228,11 @@ const synthesizeTextToSpeech = async ( const ssml = ssmlItemText(ssmlItem) console.debug(`synthesizing ${ssml}`) const result = await speakSsmlAsyncPromise(ssml) - // if (result.reason === ResultReason.Canceled) { - // synthesizer.close() - // throw new Error(result.errorDetails) - // } + if (result.reason === ResultReason.Canceled) { + writeStream.end() + synthesizer.close() + throw new Error(result.errorDetails) + } timeOffset = timeOffset + result.audioDuration // characterOffset = characterOffset + htmlElement.innerText.length } From 9b736cbf1c81531a064dfa6c68249ee32598b9f9 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Tue, 30 Aug 2022 11:07:19 +0800 Subject: [PATCH 5/5] Add entrypoint for text-to-speech cloud function --- packages/text-to-speech/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/text-to-speech/package.json b/packages/text-to-speech/package.json index fa9db0f24..0e839aa7a 100644 --- a/packages/text-to-speech/package.json +++ b/packages/text-to-speech/package.json @@ -16,7 +16,7 @@ "build": "tsc", "start": "functions-framework --source=build/src/ --target=textToSpeechHandler", "dev": "concurrently \"tsc -w\" \"nodemon --watch ./build/ --exec npm run start\"", - "gcloud-deploy": "gcloud functions deploy text-to-speech --gen2 --trigger-http --allow-unauthenticated --region=us-west2 --runtime nodejs14", + "gcloud-deploy": "gcloud functions deploy text-to-speech --gen2 --entry-point=textToSpeechHandler --trigger-http --allow-unauthenticated --region=us-west2 --runtime nodejs14", "deploy": "yarn build && yarn gcloud-deploy" }, "devDependencies": {