Split utterance into chunks of 256 chars

This commit is contained in:
Hongbo Wu
2022-10-03 17:23:13 +08:00
parent 0aa17eb6dc
commit ae4c01f2d3
2 changed files with 57 additions and 48 deletions

View File

@ -255,7 +255,7 @@ export const stripEmojis = (text: string): string => {
return text.replace(emojiRegex, '').replace(/\s+/g, ' ')
}
const textToUtterance = ({
const textToUtterances = ({
tokenizer,
idx,
textItems,
@ -269,32 +269,51 @@ const textToUtterance = ({
wordOffset: number
voice?: string
isHtml?: boolean
}): Utterance => {
const text = textItems.join('')
let textWithWordOffset = text
if (isHtml) {
try {
textWithWordOffset = htmlToText(text, { wordwrap: false })
} catch (err) {
console.error(
'Unable to convert HTML to text, html:',
}): Utterance[] => {
let text = textItems.join('')
if (!isHtml) {
// for title
const wordCount = tokenizer.tokenize(text).length
return [
{
idx,
text,
', error:',
err
)
textWithWordOffset =
parseHTML(text).document.documentElement.textContent ?? text
console.info('Converted HTML to text:', textWithWordOffset)
wordOffset,
wordCount,
voice,
},
]
}
const utterances: Utterance[] = []
try {
text = htmlToText(text, { wordwrap: false })
} catch (err) {
console.error(
'Unable to convert HTML to text, html:',
text,
', error:',
err
)
text = parseHTML(text).document.documentElement.textContent ?? text
console.info('Converted HTML to text:', text)
}
// split text into chunks of 256 characters to stream faster without breaking on words
const textChunks = text.match(/.{1,256}(?= |$)/g)
if (textChunks) {
for (const chunk of textChunks) {
const wordCount = tokenizer.tokenize(chunk).length
utterances.push({
idx,
text: chunk,
wordOffset,
wordCount,
voice,
})
wordOffset += wordCount
}
}
const wordCount = tokenizer.tokenize(textWithWordOffset).length
return {
idx,
text,
wordOffset,
wordCount,
voice,
}
return utterances
}
export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
@ -331,13 +350,13 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
let wordOffset = 0
if (title) {
// the first utterance is the title
const titleUtterance = textToUtterance({
const titleUtterance = textToUtterances({
tokenizer,
idx: '',
textItems: [cleanText(title)], // title could contain HTML entity names like &amp; or emoji
wordOffset,
isHtml: false,
})
})[0]
utterances.push(titleUtterance)
wordOffset += titleUtterance.wordCount
}
@ -351,7 +370,7 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
// use paragraph as anchor
const idx = i.toString()
i = emitElement(textItems, node, true)
const utterance = textToUtterance({
const newUtterances = textToUtterances({
tokenizer,
idx,
textItems,
@ -359,8 +378,9 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
voice:
node.nodeName === 'BLOCKQUOTE' ? options.secondaryVoice : undefined,
})
utterance.wordCount > 0 && utterances.push(utterance)
wordOffset += utterance.wordCount
const wordCount = newUtterances.reduce((acc, u) => acc + u.wordCount, 0)
wordCount > 0 && utterances.push(...newUtterances)
wordOffset += wordCount
}
}

View File

@ -138,28 +138,17 @@ export const synthesizeTextToSpeech = async (
}
}
// for ssml
let audioData: Buffer = Buffer.from([])
// split ssml into chunks of 2000 characters to stream faster
// both within limit & without breaking on words and bookmarks <bookmark mark="1"/>
const ssmlChunks = input.text.match(/.{1,2000}(?= |$)(?! mark=)/g)
if (ssmlChunks) {
for (const ssmlChunk of ssmlChunks) {
const startSsmlChunk = startSsml(ssmlOptions)
const ssml = `${startSsmlChunk}${ssmlChunk}${endSsml()}`
// set the text offset to be the end of SSML start tag
wordOffset -= startSsmlChunk.length
const result = await speakSsmlAsyncPromise(ssml)
if (result.reason === ResultReason.Canceled) {
throw new Error(result.errorDetails)
}
timeOffset = timeOffset + result.audioDuration
wordOffset = wordOffset + ssmlChunk.length
audioData = Buffer.concat([audioData, Buffer.from(result.audioData)])
}
const startSsmlTag = startSsml(ssmlOptions)
const ssml = `${startSsmlTag}${input.text}${endSsml()}`
// set the text offset to be the end of SSML start tag
wordOffset -= startSsmlTag.length
const result = await speakSsmlAsyncPromise(ssml)
if (result.reason === ResultReason.Canceled) {
throw new Error(result.errorDetails)
}
return {
audioData,
audioData: Buffer.from(result.audioData),
speechMarks,
}
} catch (error) {