Merge pull request #1155 from omnivore-app/enable-auto-synthesis

enable auto synthesis
This commit is contained in:
Jackson Harper
2022-08-30 18:25:36 +08:00
committed by GitHub
7 changed files with 50 additions and 464 deletions

View File

@ -9,9 +9,10 @@ import { getPageById } from '../elastic/pages'
import { Speech, SpeechState } from '../entity/speech'
import { buildLogger } from '../utils/logger'
import { getClaimsByToken } from '../utils/auth'
import { shouldSynthesize, synthesize } from '../services/speech'
import { shouldSynthesize } from '../services/speech'
import { readPushSubscription } from '../datalayer/pubsub'
import { AppDataSource } from '../server'
import { enqueueTextToSpeech } from '../utils/createTask'
const logger = buildLogger('app.dispatch')
@ -62,8 +63,16 @@ export function textToSpeechRouter() {
state: SpeechState.INITIALIZED,
voice: 'en-US-JennyNeural',
})
await synthesize(page, speech)
logger.info('page synthesized')
// enqueue a task to convert text to speech
const taskName = await enqueueTextToSpeech({
userId,
speechId: speech.id,
text: page.content,
voice: speech.voice,
priority: 'low',
})
logger.info('Start Text to speech task', { taskName })
return res.status(202).send('Text to speech task started')
}
res.status(200).send('Page should not synthesize')

View File

@ -1,16 +1,6 @@
import { getRepository } from '../entity/utils'
import { Speech, SpeechState } from '../entity/speech'
import { searchPages } from '../elastic/pages'
import { Page, PageType } from '../elastic/types'
import { SortBy, SortOrder } from '../utils/search'
import { synthesizeTextToSpeech } from '../utils/textToSpeech'
// Mark the given speech record as failed so downstream consumers can
// detect and surface the unsuccessful synthesis.
export const setSpeechFailure = async (id: string) => {
  const speechRepository = getRepository(Speech)
  await speechRepository.update(id, { state: SpeechState.FAILED })
}
/*
* We should not synthesize the page when:
@ -21,72 +11,36 @@ export const shouldSynthesize = async (
userId: string,
page: Page
): Promise<boolean> => {
return Promise.resolve(false)
// if (page.pageType === PageType.File || !page.content) {
// // we don't synthesize files for now
// return false
// }
// if (process.env.TEXT_TO_SPEECH_BETA_TEST) {
// return true
// }
// const [recentListenedPage, count] = (await searchPages(
// {
// dateFilters: [
// {
// field: 'listenedAt',
// startDate: new Date(Date.now() - 30 * 24 * 60 * 60 * 1000),
// },
// ],
// sort: {
// by: SortBy.LISTENED,
// order: SortOrder.DESCENDING,
// },
// size: 1,
// },
// userId
// )) || [[], 0]
// if (count === 0) {
// return false
// }
// return (
// !!recentListenedPage[0].listenedAt &&
// page.savedAt < recentListenedPage[0].listenedAt
// )
}
export const synthesize = async (page: Page, speech: Speech): Promise<void> => {
try {
if (page.pageType === PageType.File || !page.content) {
// we don't synthesize files for now
return
}
console.log('Start synthesizing', { pageId: page.id, speechId: speech.id })
const startTime = Date.now()
const speechOutput = await synthesizeTextToSpeech({
id: speech.id,
text: page.content,
languageCode: page.language,
voice: speech.voice,
textType: 'ssml',
})
console.log('Synthesized article', {
audioFileName: speechOutput.audioFileName,
speechMarksFileName: speechOutput.speechMarksFileName,
duration: Date.now() - startTime,
})
// set state to completed
await getRepository(Speech).update(speech.id, {
audioFileName: speechOutput.audioFileName,
speechMarksFileName: speechOutput.speechMarksFileName,
state: SpeechState.COMPLETED,
})
} catch (error) {
console.log('Error synthesize article', error)
await setSpeechFailure(speech.id)
throw error
if (page.pageType === PageType.File || !page.content) {
// we don't synthesize files for now
return false
}
if (process.env.TEXT_TO_SPEECH_BETA_TEST) {
return true
}
const [recentListenedPage, count] = (await searchPages(
{
dateFilters: [
{
field: 'listenedAt',
startDate: new Date(Date.now() - 30 * 24 * 60 * 60 * 1000),
},
],
sort: {
by: SortBy.LISTENED,
order: SortOrder.DESCENDING,
},
size: 1,
},
userId
)) || [[], 0]
if (count === 0) {
return false
}
return (
!!recentListenedPage[0].listenedAt &&
page.savedAt < recentListenedPage[0].listenedAt
)
}

View File

@ -1,334 +0,0 @@
import { buildLogger } from './logger'
import { createGCSFile, uploadToBucket } from './uploads'
import {
CancellationDetails,
CancellationReason,
ResultReason,
SpeechConfig,
SpeechSynthesisOutputFormat,
SpeechSynthesisResult,
SpeechSynthesizer,
} from 'microsoft-cognitiveservices-speech-sdk'
import { env } from '../env'
import { parseHTML } from 'linkedom'
// Parameters for a single text-to-speech synthesis request.
export interface TextToSpeechInput {
  // identifier used to name the output files in GCS (speech/<id>.mp3 / .json)
  id: string
  // plain text or an HTML document to be converted to SSML, per textType
  text: string
  voice?: string
  languageCode?: string
  textType?: 'text' | 'ssml'
  // speaking rate applied to the SSML prosody element
  rate?: number
  // volume applied to the SSML prosody element
  volume?: number
  // alternate voice used for blockquote elements in the ssml path
  complimentaryVoice?: string
}

// GCS file names produced by a successful synthesis run.
export interface TextToSpeechOutput {
  audioFileName: string
  speechMarksFileName: string
}

// A timing mark captured during synthesis: either a spoken word boundary or
// a bookmark inserted for data-omnivore-anchor-idx tracking.
export interface SpeechMark {
  // offset from the start of the audio, in milliseconds
  time: number
  // character offset of the word within the source text (word marks only)
  start?: number
  // word length in characters (word marks only)
  length?: number
  word: string
  type: 'word' | 'bookmark'
}

// module-level logger shared by the synthesis helpers below
const logger = buildLogger('app.dispatch')
/**
 * Synthesize the input (plain text or an HTML document converted to SSML)
 * with the Azure Cognitive Services Speech SDK, streaming the MP3 audio to a
 * GCS file and collecting word/bookmark speech marks along the way.
 *
 * Fix: a synthesis result with reason ResultReason.Canceled was previously
 * ignored (the check was commented out), so Azure-side failures were
 * silently swallowed and the speech was reported as complete with truncated
 * or empty audio. Cancelled results now close the stream/synthesizer and
 * throw, matching the handling in the cloud-function variant of this code.
 *
 * @param input - synthesis parameters (id, text, voice, language, textType…)
 * @returns GCS file names of the generated audio and speech-marks JSON
 * @throws Error when the Azure synthesis request is cancelled or fails
 */
export const synthesizeTextToSpeech = async (
  input: TextToSpeechInput
): Promise<TextToSpeechOutput> => {
  const audioFileName = `speech/${input.id}.mp3`
  const audioFile = createGCSFile(audioFileName)
  const writeStream = audioFile.createWriteStream({
    resumable: true,
  })
  const speechConfig = SpeechConfig.fromSubscription(
    env.azure.speechKey,
    env.azure.speechRegion
  )
  const textType = input.textType || 'text'
  // For plain text the language/voice is configured globally; for SSML the
  // voice is embedded in each generated document instead.
  if (textType === 'text') {
    speechConfig.speechSynthesisLanguage = input.languageCode || 'en-US'
    speechConfig.speechSynthesisVoiceName = input.voice || 'en-US-JennyNeural'
  }
  speechConfig.speechSynthesisOutputFormat =
    SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3
  // Create the speech synthesizer.
  const synthesizer = new SpeechSynthesizer(speechConfig)
  const speechMarks: SpeechMark[] = []
  // Offsets accumulate across chunked synthesis calls so that marks stay
  // relative to the start of the whole audio file.
  let timeOffset = 0
  let characterOffset = 0
  synthesizer.synthesizing = function (s, e) {
    // convert arrayBuffer to stream and write to gcs file
    writeStream.write(Buffer.from(e.result.audioData))
  }
  // The event synthesis completed signals that the synthesis is completed.
  synthesizer.synthesisCompleted = (s, e) => {
    logger.info(
      `(synthesized) Reason: ${ResultReason[e.result.reason]} Audio length: ${
        e.result.audioData.byteLength
      }`
    )
  }
  // The synthesis started event signals that the synthesis is started.
  synthesizer.synthesisStarted = (s, e) => {
    logger.info('(synthesis started)')
  }
  // The event signals that the service has stopped processing speech.
  // This can happen when an error is encountered.
  synthesizer.SynthesisCanceled = (s, e) => {
    const cancellationDetails = CancellationDetails.fromResult(e.result)
    let str =
      '(cancel) Reason: ' + CancellationReason[cancellationDetails.reason]
    if (cancellationDetails.reason === CancellationReason.Error) {
      str += ': ' + e.result.errorDetails
    }
    logger.info(str)
  }
  // The unit of e.audioOffset is tick (1 tick = 100 nanoseconds), divide by 10,000 to convert to milliseconds.
  synthesizer.wordBoundary = (s, e) => {
    speechMarks.push({
      word: e.text,
      time: (timeOffset + e.audioOffset) / 10000,
      start: characterOffset + e.textOffset,
      length: e.wordLength,
      type: 'word',
    })
  }
  synthesizer.bookmarkReached = (s, e) => {
    logger.debug(
      `(Bookmark reached), Audio offset: ${
        e.audioOffset / 10000
      }ms, bookmark text: ${e.text}`
    )
    speechMarks.push({
      word: e.text,
      time: (timeOffset + e.audioOffset) / 10000,
      type: 'bookmark',
    })
  }
  // Promise wrappers around the SDK's callback-style speak APIs.
  const speakTextAsyncPromise = (
    text: string
  ): Promise<SpeechSynthesisResult> => {
    return new Promise((resolve, reject) => {
      synthesizer.speakTextAsync(
        text,
        (result) => {
          resolve(result)
        },
        (error) => {
          reject(error)
        }
      )
    })
  }
  const speakSsmlAsyncPromise = (
    text: string
  ): Promise<SpeechSynthesisResult> => {
    return new Promise((resolve, reject) => {
      synthesizer.speakSsmlAsync(
        text,
        (result) => {
          resolve(result)
        },
        (error) => {
          reject(error)
        }
      )
    })
  }
  if (textType === 'text') {
    // slice the text into chunks of 5,000 characters
    let currentTextChunk = ''
    const textChunks = input.text.split('\n')
    for (let i = 0; i < textChunks.length; i++) {
      currentTextChunk += textChunks[i] + '\n'
      if (currentTextChunk.length < 5000 && i < textChunks.length - 1) {
        continue
      }
      logger.debug(`synthesizing ${currentTextChunk}`)
      const result = await speakTextAsyncPromise(currentTextChunk)
      // A cancelled result means Azure failed: stop and surface the error
      // instead of silently producing incomplete audio.
      if (result.reason === ResultReason.Canceled) {
        writeStream.end()
        synthesizer.close()
        throw new Error(result.errorDetails)
      }
      timeOffset = timeOffset + result.audioDuration
      characterOffset = characterOffset + currentTextChunk.length
      currentTextChunk = ''
    }
  } else {
    const document = parseHTML(input.text).document
    const elements = document.querySelectorAll(
      'h1, h2, h3, p, ul, ol, blockquote'
    )
    // convert html elements to the ssml document
    for (const e of Array.from(elements)) {
      const htmlElement = e as HTMLElement
      if (htmlElement.innerText) {
        // use complimentary voice for blockquote, hardcoded for now
        const voice =
          htmlElement.tagName.toLowerCase() === 'blockquote'
            ? input.complimentaryVoice || 'en-US-AriaNeural'
            : input.voice
        const ssml = htmlElementToSsml({
          htmlElement: e,
          language: input.languageCode,
          rate: input.rate,
          volume: input.volume,
          voice,
        })
        logger.debug(`synthesizing ${ssml}`)
        const result = await speakSsmlAsyncPromise(ssml)
        // Previously commented out: without this check a cancelled result
        // was treated as success and the failure never reached the caller.
        if (result.reason === ResultReason.Canceled) {
          writeStream.end()
          synthesizer.close()
          throw new Error(result.errorDetails)
        }
        timeOffset = timeOffset + result.audioDuration
        // characterOffset = characterOffset + htmlElement.innerText.length
      }
    }
  }
  writeStream.end()
  synthesizer.close()
  logger.debug(`audio file: ${audioFileName}`)
  // upload Speech Marks file to GCS
  const speechMarksFileName = `speech/${input.id}.json`
  await uploadToBucket(
    speechMarksFileName,
    Buffer.from(JSON.stringify(speechMarks))
  )
  return {
    audioFileName,
    speechMarksFileName,
  }
}
/**
 * Convert an HTML element into an SSML <speak> document for Azure TTS.
 *
 * Emphasis-like HTML tags become SSML <emphasis> elements, list items become
 * sentences (<s>), and every element carrying a data-omnivore-anchor-idx
 * attribute gets a <bookmark> so playback position can be mapped back to the
 * source document via the synthesizer's bookmarkReached event.
 *
 * @param htmlElement - the source element (expected to carry
 *   data-omnivore-anchor-idx attributes for bookmarking)
 * @param language - xml:lang for the <speak> root (default 'en-US')
 * @param voice - SSML voice name (default 'en-US-JennyNeural')
 * @param rate - prosody rate (default 1)
 * @param volume - prosody volume (default 100)
 * @returns the serialized <speak> markup with &nbsp; and newlines stripped
 */
export const htmlElementToSsml = ({
  htmlElement,
  language = 'en-US',
  voice = 'en-US-JennyNeural',
  rate = 1,
  volume = 100,
}: {
  htmlElement: Element
  language?: string
  voice?: string
  rate?: number
  volume?: number
}): string => {
  // Swap newElement in for oldElement, located by its anchor-idx attribute
  // (lookup is by id because oldElement may be a detached copy).
  const replaceElement = (newElement: Element, oldElement: Element) => {
    const id = oldElement.getAttribute('data-omnivore-anchor-idx')
    if (id) {
      const e = htmlElement.querySelector(`[data-omnivore-anchor-idx="${id}"]`)
      e?.parentNode?.replaceChild(newElement, e)
    }
  }
  // Emit a <bookmark mark="data-omnivore-anchor-idx-N"> before an element's
  // content so audio offsets can be tied back to the source anchor.
  const appendBookmarkElement = (parent: Element, element: Element) => {
    const id = element.getAttribute('data-omnivore-anchor-idx')
    if (id) {
      const bookMark = ssml.createElement('bookmark')
      bookMark.setAttribute('mark', `data-omnivore-anchor-idx-${id}`)
      parent.appendChild(bookMark)
    }
  }
  // Replace element with bookmark + <emphasis level="..."> wrapping its HTML.
  const replaceWithEmphasis = (element: Element, level: string) => {
    const parent = ssml.createDocumentFragment() as unknown as Element
    appendBookmarkElement(parent, element)
    const emphasisElement = ssml.createElement('emphasis')
    emphasisElement.setAttribute('level', level)
    emphasisElement.innerHTML = element.innerHTML.trim()
    parent.appendChild(emphasisElement)
    replaceElement(parent, element)
  }
  // Replace element with bookmark + <s> (sentence) wrapping its HTML.
  const replaceWithSentence = (element: Element) => {
    const parent = ssml.createDocumentFragment() as unknown as Element
    appendBookmarkElement(parent, element)
    const sentenceElement = ssml.createElement('s')
    sentenceElement.innerHTML = element.innerHTML.trim()
    parent.appendChild(sentenceElement)
    replaceElement(parent, element)
  }
  // create new ssml document
  const ssml = parseHTML('').document
  const speakElement = ssml.createElement('speak')
  speakElement.setAttribute('version', '1.0')
  speakElement.setAttribute('xmlns', 'http://www.w3.org/2001/10/synthesis')
  speakElement.setAttribute('xml:lang', language)
  const voiceElement = ssml.createElement('voice')
  voiceElement.setAttribute('name', voice)
  speakElement.appendChild(voiceElement)
  const prosodyElement = ssml.createElement('prosody')
  prosodyElement.setAttribute('rate', `${rate}`)
  prosodyElement.setAttribute('volume', volume.toString())
  voiceElement.appendChild(prosodyElement)
  // add each paragraph to the ssml document
  appendBookmarkElement(prosodyElement, htmlElement)
  // replace emphasis elements with ssml
  htmlElement.querySelectorAll('*').forEach((e) => {
    switch (e.tagName.toLowerCase()) {
      case 's':
        replaceWithEmphasis(e, 'moderate')
        break
      case 'sub':
        // <sub> without an alias is treated as plain emphasis; with an alias
        // it is presumably already a valid SSML substitution — left as-is.
        if (e.getAttribute('alias') === null) {
          replaceWithEmphasis(e, 'moderate')
        }
        break
      case 'i':
      case 'em':
      case 'q':
      case 'blockquote':
      case 'cite':
      case 'del':
      case 'strike':
      case 'sup':
      case 'summary':
      case 'caption':
      case 'figcaption':
        replaceWithEmphasis(e, 'moderate')
        break
      case 'b':
      case 'strong':
      case 'dt':
      case 'dfn':
      case 'u':
      case 'mark':
      case 'th':
      case 'title':
      case 'var':
        replaceWithEmphasis(e, 'moderate')
        break
      case 'li':
        replaceWithSentence(e)
        break
      default: {
        // Any other tag is flattened to its bookmark plus bare text content.
        const parent = ssml.createDocumentFragment() as unknown as Element
        appendBookmarkElement(parent, e)
        const text = (e as HTMLElement).innerText.trim()
        const textElement = ssml.createTextNode(text)
        parent.appendChild(textElement)
        replaceElement(parent, e)
      }
    }
  })
  prosodyElement.appendChild(htmlElement)
  // strip non-breaking spaces and newlines from the serialized markup
  return speakElement.outerHTML.replace(/&nbsp;|\n/g, '')
}

File diff suppressed because one or more lines are too long

View File

@ -1,44 +0,0 @@
import 'mocha'
import {
htmlElementToSsml,
synthesizeTextToSpeech,
TextToSpeechInput,
} from '../../src/utils/textToSpeech'
import { expect } from 'chai'
import { generateFakeUuid } from '../util'
import { parseHTML } from 'linkedom'
import fs from 'fs'
describe('textToSpeech', () => {
  // Read a test fixture from disk as UTF-8 text.
  const load = (path: string): string => {
    return fs.readFileSync(path, 'utf8')
  }
  describe('synthesizeTextToSpeech', () => {
    // Skipped (xit): exercises the real Azure TTS + GCS upload path, so it
    // cannot run in CI without live credentials.
    xit('should create an audio file with speech marks', async () => {
      const html = load('./test/utils/data/text-to-speech.html')
      const input: TextToSpeechInput = {
        id: generateFakeUuid(),
        text: html,
        languageCode: 'en-US',
        voice: 'en-US-JennyNeural',
        textType: 'ssml',
      }
      const output = await synthesizeTextToSpeech(input)
      expect(output.audioFileName).to.be.a('string')
      expect(output.speechMarksFileName).to.be.a('string')
    })
  })
  describe('htmlElementToSsml', () => {
    // Pins the exact serialized SSML, including the bookmark derived from
    // the data-omnivore-anchor-idx attribute and the default voice/prosody.
    it('should convert Html Element to SSML', async () => {
      const htmlElement = parseHTML(
        `<p data-omnivore-anchor-idx="1">Marry had a little lamb</p>`
      ).document.documentElement
      const ssml = htmlElementToSsml({ htmlElement })
      expect(ssml).to.equal(
        `<speak xml:lang="en-US" xmlns="http://www.w3.org/2001/10/synthesis" version="1.0"><voice name="en-US-JennyNeural"><prosody volume="100" rate="1"><bookmark mark="data-omnivore-anchor-idx-1"></bookmark><p data-omnivore-anchor-idx="1">Marry had a little lamb</p></prosody></voice></speak>`
      )
    })
  })
})

View File

@ -16,7 +16,7 @@
"build": "tsc",
"start": "functions-framework --source=build/src/ --target=textToSpeechHandler",
"dev": "concurrently \"tsc -w\" \"nodemon --watch ./build/ --exec npm run start\"",
"gcloud-deploy": "gcloud functions deploy text-to-speech --gen2 --trigger-http --allow-unauthenticated --region=us-west2 --runtime nodejs14",
"gcloud-deploy": "gcloud functions deploy text-to-speech --gen2 --entry-point=textToSpeechHandler --trigger-http --allow-unauthenticated --region=us-west2 --runtime nodejs14",
"deploy": "yarn build && yarn gcloud-deploy"
},
"devDependencies": {

View File

@ -19,6 +19,7 @@ import axios from 'axios'
import * as jwt from 'jsonwebtoken'
import * as dotenv from 'dotenv' // see https://github.com/motdotla/dotenv#how-do-i-use-dotenv-with-import
import { htmlToSsml, ssmlItemText } from './htmlToSsml'
dotenv.config()
interface TextToSpeechInput {
@ -227,10 +228,11 @@ const synthesizeTextToSpeech = async (
const ssml = ssmlItemText(ssmlItem)
console.debug(`synthesizing ${ssml}`)
const result = await speakSsmlAsyncPromise(ssml)
// if (result.reason === ResultReason.Canceled) {
// synthesizer.close()
// throw new Error(result.errorDetails)
// }
if (result.reason === ResultReason.Canceled) {
writeStream.end()
synthesizer.close()
throw new Error(result.errorDetails)
}
timeOffset = timeOffset + result.audioDuration
// characterOffset = characterOffset + htmlElement.innerText.length
}