Merge pull request #1155 from omnivore-app/enable-auto-synthesis
enable auto synthesis
This commit is contained in:
@ -9,9 +9,10 @@ import { getPageById } from '../elastic/pages'
|
||||
import { Speech, SpeechState } from '../entity/speech'
|
||||
import { buildLogger } from '../utils/logger'
|
||||
import { getClaimsByToken } from '../utils/auth'
|
||||
import { shouldSynthesize, synthesize } from '../services/speech'
|
||||
import { shouldSynthesize } from '../services/speech'
|
||||
import { readPushSubscription } from '../datalayer/pubsub'
|
||||
import { AppDataSource } from '../server'
|
||||
import { enqueueTextToSpeech } from '../utils/createTask'
|
||||
|
||||
const logger = buildLogger('app.dispatch')
|
||||
|
||||
@ -62,8 +63,16 @@ export function textToSpeechRouter() {
|
||||
state: SpeechState.INITIALIZED,
|
||||
voice: 'en-US-JennyNeural',
|
||||
})
|
||||
await synthesize(page, speech)
|
||||
logger.info('page synthesized')
|
||||
// enqueue a task to convert text to speech
|
||||
const taskName = await enqueueTextToSpeech({
|
||||
userId,
|
||||
speechId: speech.id,
|
||||
text: page.content,
|
||||
voice: speech.voice,
|
||||
priority: 'low',
|
||||
})
|
||||
logger.info('Start Text to speech task', { taskName })
|
||||
return res.status(202).send('Text to speech task started')
|
||||
}
|
||||
|
||||
res.status(200).send('Page should not synthesize')
|
||||
|
||||
@ -1,16 +1,6 @@
|
||||
import { getRepository } from '../entity/utils'
|
||||
import { Speech, SpeechState } from '../entity/speech'
|
||||
import { searchPages } from '../elastic/pages'
|
||||
import { Page, PageType } from '../elastic/types'
|
||||
import { SortBy, SortOrder } from '../utils/search'
|
||||
import { synthesizeTextToSpeech } from '../utils/textToSpeech'
|
||||
|
||||
/**
 * Flags a speech record as failed.
 *
 * @param id - identifier of the Speech row whose state is set to FAILED
 */
export const setSpeechFailure = async (id: string) => {
  const failedState = { state: SpeechState.FAILED }
  // persist the new state on the matching Speech row
  await getRepository(Speech).update(id, failedState)
}
|
||||
|
||||
/*
|
||||
* We should not synthesize the page when:
|
||||
@ -21,72 +11,36 @@ export const shouldSynthesize = async (
|
||||
userId: string,
|
||||
page: Page
|
||||
): Promise<boolean> => {
|
||||
return Promise.resolve(false)
|
||||
// if (page.pageType === PageType.File || !page.content) {
|
||||
// // we don't synthesize files for now
|
||||
// return false
|
||||
// }
|
||||
|
||||
// if (process.env.TEXT_TO_SPEECH_BETA_TEST) {
|
||||
// return true
|
||||
// }
|
||||
|
||||
// const [recentListenedPage, count] = (await searchPages(
|
||||
// {
|
||||
// dateFilters: [
|
||||
// {
|
||||
// field: 'listenedAt',
|
||||
// startDate: new Date(Date.now() - 30 * 24 * 60 * 60 * 1000),
|
||||
// },
|
||||
// ],
|
||||
// sort: {
|
||||
// by: SortBy.LISTENED,
|
||||
// order: SortOrder.DESCENDING,
|
||||
// },
|
||||
// size: 1,
|
||||
// },
|
||||
// userId
|
||||
// )) || [[], 0]
|
||||
// if (count === 0) {
|
||||
// return false
|
||||
// }
|
||||
// return (
|
||||
// !!recentListenedPage[0].listenedAt &&
|
||||
// page.savedAt < recentListenedPage[0].listenedAt
|
||||
// )
|
||||
}
|
||||
|
||||
export const synthesize = async (page: Page, speech: Speech): Promise<void> => {
|
||||
try {
|
||||
if (page.pageType === PageType.File || !page.content) {
|
||||
// we don't synthesize files for now
|
||||
return
|
||||
}
|
||||
|
||||
console.log('Start synthesizing', { pageId: page.id, speechId: speech.id })
|
||||
const startTime = Date.now()
|
||||
const speechOutput = await synthesizeTextToSpeech({
|
||||
id: speech.id,
|
||||
text: page.content,
|
||||
languageCode: page.language,
|
||||
voice: speech.voice,
|
||||
textType: 'ssml',
|
||||
})
|
||||
console.log('Synthesized article', {
|
||||
audioFileName: speechOutput.audioFileName,
|
||||
speechMarksFileName: speechOutput.speechMarksFileName,
|
||||
duration: Date.now() - startTime,
|
||||
})
|
||||
|
||||
// set state to completed
|
||||
await getRepository(Speech).update(speech.id, {
|
||||
audioFileName: speechOutput.audioFileName,
|
||||
speechMarksFileName: speechOutput.speechMarksFileName,
|
||||
state: SpeechState.COMPLETED,
|
||||
})
|
||||
} catch (error) {
|
||||
console.log('Error synthesize article', error)
|
||||
await setSpeechFailure(speech.id)
|
||||
throw error
|
||||
if (page.pageType === PageType.File || !page.content) {
|
||||
// we don't synthesize files for now
|
||||
return false
|
||||
}
|
||||
|
||||
if (process.env.TEXT_TO_SPEECH_BETA_TEST) {
|
||||
return true
|
||||
}
|
||||
|
||||
const [recentListenedPage, count] = (await searchPages(
|
||||
{
|
||||
dateFilters: [
|
||||
{
|
||||
field: 'listenedAt',
|
||||
startDate: new Date(Date.now() - 30 * 24 * 60 * 60 * 1000),
|
||||
},
|
||||
],
|
||||
sort: {
|
||||
by: SortBy.LISTENED,
|
||||
order: SortOrder.DESCENDING,
|
||||
},
|
||||
size: 1,
|
||||
},
|
||||
userId
|
||||
)) || [[], 0]
|
||||
if (count === 0) {
|
||||
return false
|
||||
}
|
||||
return (
|
||||
!!recentListenedPage[0].listenedAt &&
|
||||
page.savedAt < recentListenedPage[0].listenedAt
|
||||
)
|
||||
}
|
||||
|
||||
@ -1,334 +0,0 @@
|
||||
import { buildLogger } from './logger'
|
||||
import { createGCSFile, uploadToBucket } from './uploads'
|
||||
import {
|
||||
CancellationDetails,
|
||||
CancellationReason,
|
||||
ResultReason,
|
||||
SpeechConfig,
|
||||
SpeechSynthesisOutputFormat,
|
||||
SpeechSynthesisResult,
|
||||
SpeechSynthesizer,
|
||||
} from 'microsoft-cognitiveservices-speech-sdk'
|
||||
import { env } from '../env'
|
||||
import { parseHTML } from 'linkedom'
|
||||
|
||||
/**
 * Parameters accepted by `synthesizeTextToSpeech`.
 */
export interface TextToSpeechInput {
  /** Identifier used to name the generated files (`speech/<id>.mp3`, `speech/<id>.json`). */
  id: string
  /** Content to read aloud: plain text, or HTML when `textType` is `'ssml'`. */
  text: string
  /** Voice name; the synthesizer falls back to `en-US-JennyNeural` when omitted. */
  voice?: string
  /** Language code; the synthesizer falls back to `en-US` when omitted. */
  languageCode?: string
  /** How `text` is interpreted; treated as `'text'` when omitted. */
  textType?: 'text' | 'ssml'
  /** Prosody rate forwarded to the SSML conversion (which defaults it to 1). */
  rate?: number
  /** Prosody volume forwarded to the SSML conversion (which defaults it to 100). */
  volume?: number
  /** Voice used for blockquote elements; `en-US-AriaNeural` when omitted. */
  complimentaryVoice?: string
}
|
||||
|
||||
/**
 * Names of the files produced by `synthesizeTextToSpeech`.
 */
export interface TextToSpeechOutput {
  /** Name of the synthesized audio file (`speech/<id>.mp3`). */
  audioFileName: string
  /** Name of the JSON file holding the collected speech marks (`speech/<id>.json`). */
  speechMarksFileName: string
}
|
||||
|
||||
/**
 * A timing marker collected while synthesizing, either for a spoken word
 * or for a bookmark placed in the SSML.
 */
export interface SpeechMark {
  /** Offset into the audio in milliseconds (SDK ticks divided by 10,000). */
  time: number
  /** Character offset of the word in the input; only set for word marks. */
  start?: number
  /** Length of the word; only set for word marks. */
  length?: number
  /** The spoken word, or the bookmark text for bookmark marks. */
  word: string
  /** Kind of marker. */
  type: 'word' | 'bookmark'
}
|
||||
|
||||
const logger = buildLogger('app.dispatch')
|
||||
|
||||
/**
 * Synthesizes `input.text` to an MP3 file and uploads the collected speech
 * marks as a JSON file next to it.
 *
 * Plain text is sent to the speech service in chunks of roughly 5,000
 * characters (split on newlines). For `'ssml'` input the text is parsed as
 * HTML and every `h1, h2, h3, p, ul, ol, blockquote` element with text is
 * converted to its own SSML document and synthesized in turn.
 *
 * @param input - text, voice and formatting options, see `TextToSpeechInput`
 * @returns the names of the audio file and of the speech marks file
 * @throws Error when the speech service reports a canceled synthesis, or
 *   whatever error a synthesis call or the speech marks upload rejects with
 */
export const synthesizeTextToSpeech = async (
  input: TextToSpeechInput
): Promise<TextToSpeechOutput> => {
  const audioFileName = `speech/${input.id}.mp3`
  const audioFile = createGCSFile(audioFileName)
  const writeStream = audioFile.createWriteStream({
    resumable: true,
  })
  const speechConfig = SpeechConfig.fromSubscription(
    env.azure.speechKey,
    env.azure.speechRegion
  )
  const textType = input.textType || 'text'
  if (textType === 'text') {
    // for SSML input the language and voice are carried by the SSML itself
    speechConfig.speechSynthesisLanguage = input.languageCode || 'en-US'
    speechConfig.speechSynthesisVoiceName = input.voice || 'en-US-JennyNeural'
  }
  speechConfig.speechSynthesisOutputFormat =
    SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3

  // Create the speech synthesizer.
  const synthesizer = new SpeechSynthesizer(speechConfig)
  const speechMarks: SpeechMark[] = []
  // Offsets accumulated over the chunks already synthesized, so marks of
  // later chunks are positioned relative to the whole audio/text.
  let timeOffset = 0
  let characterOffset = 0

  synthesizer.synthesizing = function (s, e) {
    // convert arrayBuffer to stream and write to gcs file
    writeStream.write(Buffer.from(e.result.audioData))
  }

  // The event synthesis completed signals that the synthesis is completed.
  synthesizer.synthesisCompleted = (s, e) => {
    logger.info(
      `(synthesized) Reason: ${ResultReason[e.result.reason]} Audio length: ${
        e.result.audioData.byteLength
      }`
    )
  }

  // The synthesis started event signals that the synthesis is started.
  synthesizer.synthesisStarted = (s, e) => {
    logger.info('(synthesis started)')
  }

  // The event signals that the service has stopped processing speech.
  // This can happen when an error is encountered.
  synthesizer.SynthesisCanceled = (s, e) => {
    const cancellationDetails = CancellationDetails.fromResult(e.result)
    let str =
      '(cancel) Reason: ' + CancellationReason[cancellationDetails.reason]
    if (cancellationDetails.reason === CancellationReason.Error) {
      str += ': ' + e.result.errorDetails
    }
    logger.info(str)
  }

  // The unit of e.audioOffset is tick (1 tick = 100 nanoseconds), divide by 10,000 to convert to milliseconds.
  synthesizer.wordBoundary = (s, e) => {
    speechMarks.push({
      word: e.text,
      time: (timeOffset + e.audioOffset) / 10000,
      start: characterOffset + e.textOffset,
      length: e.wordLength,
      type: 'word',
    })
  }

  synthesizer.bookmarkReached = (s, e) => {
    logger.debug(
      `(Bookmark reached), Audio offset: ${
        e.audioOffset / 10000
      }ms, bookmark text: ${e.text}`
    )
    speechMarks.push({
      word: e.text,
      time: (timeOffset + e.audioOffset) / 10000,
      type: 'bookmark',
    })
  }

  // A canceled synthesis is delivered through the success callback, so it
  // has to be turned into an error explicitly; otherwise a failed run would
  // still be reported as a finished audio file.
  const ensureNotCanceled = (
    result: SpeechSynthesisResult
  ): SpeechSynthesisResult => {
    if (result.reason === ResultReason.Canceled) {
      throw new Error(result.errorDetails)
    }
    return result
  }

  // Promise wrappers around the callback based SDK calls.
  const speakTextAsyncPromise = (
    text: string
  ): Promise<SpeechSynthesisResult> => {
    return new Promise((resolve, reject) => {
      synthesizer.speakTextAsync(
        text,
        (result) => {
          resolve(result)
        },
        (error) => {
          reject(error)
        }
      )
    })
  }

  const speakSsmlAsyncPromise = (
    text: string
  ): Promise<SpeechSynthesisResult> => {
    return new Promise((resolve, reject) => {
      synthesizer.speakSsmlAsync(
        text,
        (result) => {
          resolve(result)
        },
        (error) => {
          reject(error)
        }
      )
    })
  }

  try {
    if (textType === 'text') {
      // slice the text into chunks of 5,000 characters
      let currentTextChunk = ''
      const textChunks = input.text.split('\n')
      for (let i = 0; i < textChunks.length; i++) {
        currentTextChunk += textChunks[i] + '\n'
        // keep accumulating lines until the chunk is big enough or the
        // last line has been reached
        if (currentTextChunk.length < 5000 && i < textChunks.length - 1) {
          continue
        }
        logger.debug(`synthesizing ${currentTextChunk}`)
        const result = ensureNotCanceled(
          await speakTextAsyncPromise(currentTextChunk)
        )
        timeOffset = timeOffset + result.audioDuration
        characterOffset = characterOffset + currentTextChunk.length
        currentTextChunk = ''
      }
    } else {
      const document = parseHTML(input.text).document
      const elements = document.querySelectorAll(
        'h1, h2, h3, p, ul, ol, blockquote'
      )
      // convert html elements to the ssml document
      for (const e of Array.from(elements)) {
        const htmlElement = e as HTMLElement
        if (htmlElement.innerText) {
          // use complimentary voice for blockquote, hardcoded for now
          const voice =
            htmlElement.tagName.toLowerCase() === 'blockquote'
              ? input.complimentaryVoice || 'en-US-AriaNeural'
              : input.voice
          const ssml = htmlElementToSsml({
            htmlElement: e,
            language: input.languageCode,
            rate: input.rate,
            volume: input.volume,
            voice,
          })
          logger.debug(`synthesizing ${ssml}`)
          const result = ensureNotCanceled(await speakSsmlAsyncPromise(ssml))
          timeOffset = timeOffset + result.audioDuration
          // characterOffset = characterOffset + htmlElement.innerText.length
        }
      }
    }
  } finally {
    // release the upload stream and the synthesizer on success and on
    // failure alike, so an error does not leak either of them
    writeStream.end()
    synthesizer.close()
  }

  logger.debug(`audio file: ${audioFileName}`)

  // upload Speech Marks file to GCS
  const speechMarksFileName = `speech/${input.id}.json`
  await uploadToBucket(
    speechMarksFileName,
    Buffer.from(JSON.stringify(speechMarks))
  )

  return {
    audioFileName,
    speechMarksFileName,
  }
}
|
||||
|
||||
/**
 * Wraps an HTML element in an SSML document
 * (`<speak><voice><prosody>…</prosody></voice></speak>`) and returns it
 * serialized as a string.
 *
 * Descendants of `htmlElement` are rewritten in place: emphasis-like tags
 * become `<emphasis level="moderate">`, `li` becomes `<s>`, and any other
 * tag is replaced by its trimmed text. Elements that carry a
 * `data-omnivore-anchor-idx` attribute get a `<bookmark>` placed in front
 * of their replacement; elements without the attribute are left untouched.
 * Note that `htmlElement` itself is mutated and moved into the new document.
 *
 * @param htmlElement - element to convert
 * @param language - value of the `xml:lang` attribute, default `en-US`
 * @param voice - voice name, default `en-US-JennyNeural`
 * @param rate - prosody rate, default 1
 * @param volume - prosody volume, default 100
 * @returns the serialized SSML with `&nbsp;` entities and newlines removed
 */
export const htmlElementToSsml = ({
  htmlElement,
  language = 'en-US',
  voice = 'en-US-JennyNeural',
  rate = 1,
  volume = 100,
}: {
  htmlElement: Element
  language?: string
  voice?: string
  rate?: number
  volume?: number
}): string => {
  // tags that are read with moderate emphasis
  const emphasisTags = new Set([
    's',
    'i',
    'em',
    'q',
    'blockquote',
    'cite',
    'del',
    'strike',
    'sup',
    'summary',
    'caption',
    'figcaption',
    'b',
    'strong',
    'dt',
    'dfn',
    'u',
    'mark',
    'th',
    'title',
    'var',
  ])

  // create new ssml document
  const ssml = parseHTML('').document

  // Swaps the element identified by the anchor index of `oldElement` for
  // `newElement`. The element is looked up again by its anchor index because
  // earlier replacements may have re-created it from its parent's innerHTML.
  const replaceElement = (newElement: Element, oldElement: Element) => {
    const id = oldElement.getAttribute('data-omnivore-anchor-idx')
    if (id) {
      const e = htmlElement.querySelector(`[data-omnivore-anchor-idx="${id}"]`)
      e?.parentNode?.replaceChild(newElement, e)
    }
  }

  // Appends a bookmark named after the anchor index of `element` to `parent`.
  const appendBookmarkElement = (parent: Element, element: Element) => {
    const id = element.getAttribute('data-omnivore-anchor-idx')
    if (id) {
      const bookMark = ssml.createElement('bookmark')
      bookMark.setAttribute('mark', `data-omnivore-anchor-idx-${id}`)
      parent.appendChild(bookMark)
    }
  }

  // Replaces `element` with a bookmark followed by an emphasis element.
  const replaceWithEmphasis = (element: Element, level: string) => {
    const parent = ssml.createDocumentFragment() as unknown as Element
    appendBookmarkElement(parent, element)
    const emphasisElement = ssml.createElement('emphasis')
    emphasisElement.setAttribute('level', level)
    emphasisElement.innerHTML = element.innerHTML.trim()
    parent.appendChild(emphasisElement)
    replaceElement(parent, element)
  }

  // Replaces `element` with a bookmark followed by a sentence element.
  const replaceWithSentence = (element: Element) => {
    const parent = ssml.createDocumentFragment() as unknown as Element
    appendBookmarkElement(parent, element)
    const sentenceElement = ssml.createElement('s')
    sentenceElement.innerHTML = element.innerHTML.trim()
    parent.appendChild(sentenceElement)
    replaceElement(parent, element)
  }

  // Replaces `element` with a bookmark followed by its trimmed text.
  const replaceWithText = (element: Element) => {
    const parent = ssml.createDocumentFragment() as unknown as Element
    appendBookmarkElement(parent, element)
    const text = (element as HTMLElement).innerText.trim()
    const textElement = ssml.createTextNode(text)
    parent.appendChild(textElement)
    replaceElement(parent, element)
  }

  const speakElement = ssml.createElement('speak')
  speakElement.setAttribute('version', '1.0')
  speakElement.setAttribute('xmlns', 'http://www.w3.org/2001/10/synthesis')
  speakElement.setAttribute('xml:lang', language)
  const voiceElement = ssml.createElement('voice')
  voiceElement.setAttribute('name', voice)
  speakElement.appendChild(voiceElement)
  const prosodyElement = ssml.createElement('prosody')
  prosodyElement.setAttribute('rate', `${rate}`)
  prosodyElement.setAttribute('volume', volume.toString())
  voiceElement.appendChild(prosodyElement)
  // add each paragraph to the ssml document
  appendBookmarkElement(prosodyElement, htmlElement)
  // replace emphasis elements with ssml
  htmlElement.querySelectorAll('*').forEach((e) => {
    const tag = e.tagName.toLowerCase()
    if (tag === 'sub') {
      // a <sub alias="…"> is already valid SSML and is kept as it is
      if (e.getAttribute('alias') === null) {
        replaceWithEmphasis(e, 'moderate')
      }
      return
    }
    if (emphasisTags.has(tag)) {
      replaceWithEmphasis(e, 'moderate')
      return
    }
    if (tag === 'li') {
      replaceWithSentence(e)
      return
    }
    replaceWithText(e)
  })
  prosodyElement.appendChild(htmlElement)

  // Strip non-breaking spaces (as entity or as character) and newlines only;
  // regular spaces must survive, they separate attributes and words.
  return speakElement.outerHTML.replace(/&nbsp;|\u00a0|\n/g, '')
}
|
||||
File diff suppressed because one or more lines are too long
@ -1,44 +0,0 @@
|
||||
import 'mocha'
|
||||
import {
|
||||
htmlElementToSsml,
|
||||
synthesizeTextToSpeech,
|
||||
TextToSpeechInput,
|
||||
} from '../../src/utils/textToSpeech'
|
||||
import { expect } from 'chai'
|
||||
import { generateFakeUuid } from '../util'
|
||||
import { parseHTML } from 'linkedom'
|
||||
import fs from 'fs'
|
||||
|
||||
// Tests for the text to speech helpers.
describe('textToSpeech', () => {
  // reads a fixture file as utf8 text
  const load = (path: string): string => fs.readFileSync(path, 'utf8')

  describe('synthesizeTextToSpeech', () => {
    // skipped (xit): this test calls the real synthesizeTextToSpeech
    xit('should create an audio file with speech marks', async () => {
      const fixtureHtml = load('./test/utils/data/text-to-speech.html')
      const request: TextToSpeechInput = {
        id: generateFakeUuid(),
        text: fixtureHtml,
        languageCode: 'en-US',
        voice: 'en-US-JennyNeural',
        textType: 'ssml',
      }

      const { audioFileName, speechMarksFileName } =
        await synthesizeTextToSpeech(request)

      expect(audioFileName).to.be.a('string')
      expect(speechMarksFileName).to.be.a('string')
    })
  })

  describe('htmlElementToSsml', () => {
    it('should convert Html Element to SSML', async () => {
      const parsed = parseHTML(
        `<p data-omnivore-anchor-idx="1">Marry had a little lamb</p>`
      )
      const htmlElement = parsed.document.documentElement

      const actual = htmlElementToSsml({ htmlElement })

      expect(actual).to.equal(
        `<speak xml:lang="en-US" xmlns="http://www.w3.org/2001/10/synthesis" version="1.0"><voice name="en-US-JennyNeural"><prosody volume="100" rate="1"><bookmark mark="data-omnivore-anchor-idx-1"></bookmark><p data-omnivore-anchor-idx="1">Marry had a little lamb</p></prosody></voice></speak>`
      )
    })
  })
})
|
||||
@ -16,7 +16,7 @@
|
||||
"build": "tsc",
|
||||
"start": "functions-framework --source=build/src/ --target=textToSpeechHandler",
|
||||
"dev": "concurrently \"tsc -w\" \"nodemon --watch ./build/ --exec npm run start\"",
|
||||
"gcloud-deploy": "gcloud functions deploy text-to-speech --gen2 --trigger-http --allow-unauthenticated --region=us-west2 --runtime nodejs14",
|
||||
"gcloud-deploy": "gcloud functions deploy text-to-speech --gen2 --entry-point=textToSpeechHandler --trigger-http --allow-unauthenticated --region=us-west2 --runtime nodejs14",
|
||||
"deploy": "yarn build && yarn gcloud-deploy"
|
||||
},
|
||||
"devDependencies": {
|
||||
|
||||
@ -19,6 +19,7 @@ import axios from 'axios'
|
||||
import * as jwt from 'jsonwebtoken'
|
||||
import * as dotenv from 'dotenv' // see https://github.com/motdotla/dotenv#how-do-i-use-dotenv-with-import
|
||||
import { htmlToSsml, ssmlItemText } from './htmlToSsml'
|
||||
|
||||
dotenv.config()
|
||||
|
||||
interface TextToSpeechInput {
|
||||
@ -227,10 +228,11 @@ const synthesizeTextToSpeech = async (
|
||||
const ssml = ssmlItemText(ssmlItem)
|
||||
console.debug(`synthesizing ${ssml}`)
|
||||
const result = await speakSsmlAsyncPromise(ssml)
|
||||
// if (result.reason === ResultReason.Canceled) {
|
||||
// synthesizer.close()
|
||||
// throw new Error(result.errorDetails)
|
||||
// }
|
||||
if (result.reason === ResultReason.Canceled) {
|
||||
writeStream.end()
|
||||
synthesizer.close()
|
||||
throw new Error(result.errorDetails)
|
||||
}
|
||||
timeOffset = timeOffset + result.audioDuration
|
||||
// characterOffset = characterOffset + htmlElement.innerText.length
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user