Split utterance into chunks of 256 chars
This commit is contained in:
@ -255,7 +255,7 @@ export const stripEmojis = (text: string): string => {
|
||||
return text.replace(emojiRegex, '').replace(/\s+/g, ' ')
|
||||
}
|
||||
|
||||
const textToUtterance = ({
|
||||
const textToUtterances = ({
|
||||
tokenizer,
|
||||
idx,
|
||||
textItems,
|
||||
@ -269,32 +269,51 @@ const textToUtterance = ({
|
||||
wordOffset: number
|
||||
voice?: string
|
||||
isHtml?: boolean
|
||||
}): Utterance => {
|
||||
const text = textItems.join('')
|
||||
let textWithWordOffset = text
|
||||
if (isHtml) {
|
||||
try {
|
||||
textWithWordOffset = htmlToText(text, { wordwrap: false })
|
||||
} catch (err) {
|
||||
console.error(
|
||||
'Unable to convert HTML to text, html:',
|
||||
}): Utterance[] => {
|
||||
let text = textItems.join('')
|
||||
if (!isHtml) {
|
||||
// for title
|
||||
const wordCount = tokenizer.tokenize(text).length
|
||||
return [
|
||||
{
|
||||
idx,
|
||||
text,
|
||||
', error:',
|
||||
err
|
||||
)
|
||||
textWithWordOffset =
|
||||
parseHTML(text).document.documentElement.textContent ?? text
|
||||
console.info('Converted HTML to text:', textWithWordOffset)
|
||||
wordOffset,
|
||||
wordCount,
|
||||
voice,
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
const utterances: Utterance[] = []
|
||||
try {
|
||||
text = htmlToText(text, { wordwrap: false })
|
||||
} catch (err) {
|
||||
console.error(
|
||||
'Unable to convert HTML to text, html:',
|
||||
text,
|
||||
', error:',
|
||||
err
|
||||
)
|
||||
text = parseHTML(text).document.documentElement.textContent ?? text
|
||||
console.info('Converted HTML to text:', text)
|
||||
}
|
||||
// split text into chunks of 256 characters to stream faster without breaking on words
|
||||
const textChunks = text.match(/.{1,256}(?= |$)/g)
|
||||
if (textChunks) {
|
||||
for (const chunk of textChunks) {
|
||||
const wordCount = tokenizer.tokenize(chunk).length
|
||||
utterances.push({
|
||||
idx,
|
||||
text: chunk,
|
||||
wordOffset,
|
||||
wordCount,
|
||||
voice,
|
||||
})
|
||||
wordOffset += wordCount
|
||||
}
|
||||
}
|
||||
const wordCount = tokenizer.tokenize(textWithWordOffset).length
|
||||
return {
|
||||
idx,
|
||||
text,
|
||||
wordOffset,
|
||||
wordCount,
|
||||
voice,
|
||||
}
|
||||
return utterances
|
||||
}
|
||||
|
||||
export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
|
||||
@ -331,13 +350,13 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
|
||||
let wordOffset = 0
|
||||
if (title) {
|
||||
// first utterances is the title
|
||||
const titleUtterance = textToUtterance({
|
||||
const titleUtterance = textToUtterances({
|
||||
tokenizer,
|
||||
idx: '',
|
||||
textItems: [cleanText(title)], // title could have HTML entity names like & or emoji
|
||||
wordOffset,
|
||||
isHtml: false,
|
||||
})
|
||||
})[0]
|
||||
utterances.push(titleUtterance)
|
||||
wordOffset += titleUtterance.wordCount
|
||||
}
|
||||
@ -351,7 +370,7 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
|
||||
// use paragraph as anchor
|
||||
const idx = i.toString()
|
||||
i = emitElement(textItems, node, true)
|
||||
const utterance = textToUtterance({
|
||||
const newUtterances = textToUtterances({
|
||||
tokenizer,
|
||||
idx,
|
||||
textItems,
|
||||
@ -359,8 +378,9 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
|
||||
voice:
|
||||
node.nodeName === 'BLOCKQUOTE' ? options.secondaryVoice : undefined,
|
||||
})
|
||||
utterance.wordCount > 0 && utterances.push(utterance)
|
||||
wordOffset += utterance.wordCount
|
||||
const wordCount = newUtterances.reduce((acc, u) => acc + u.wordCount, 0)
|
||||
wordCount > 0 && utterances.push(...newUtterances)
|
||||
wordOffset += wordCount
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -138,28 +138,17 @@ export const synthesizeTextToSpeech = async (
|
||||
}
|
||||
}
|
||||
// for ssml
|
||||
let audioData: Buffer = Buffer.from([])
|
||||
// split ssml into chunks of 2000 characters to stream faster
|
||||
// both within limit & without breaking on words and bookmarks <bookmark mark="1"/>
|
||||
const ssmlChunks = input.text.match(/.{1,2000}(?= |$)(?! mark=)/g)
|
||||
if (ssmlChunks) {
|
||||
for (const ssmlChunk of ssmlChunks) {
|
||||
const startSsmlChunk = startSsml(ssmlOptions)
|
||||
const ssml = `${startSsmlChunk}${ssmlChunk}${endSsml()}`
|
||||
// set the text offset to be the end of SSML start tag
|
||||
wordOffset -= startSsmlChunk.length
|
||||
const result = await speakSsmlAsyncPromise(ssml)
|
||||
if (result.reason === ResultReason.Canceled) {
|
||||
throw new Error(result.errorDetails)
|
||||
}
|
||||
timeOffset = timeOffset + result.audioDuration
|
||||
wordOffset = wordOffset + ssmlChunk.length
|
||||
audioData = Buffer.concat([audioData, Buffer.from(result.audioData)])
|
||||
}
|
||||
const startSsmlTag = startSsml(ssmlOptions)
|
||||
const ssml = `${startSsmlTag}${input.text}${endSsml()}`
|
||||
// set the text offset to be the end of SSML start tag
|
||||
wordOffset -= startSsmlTag.length
|
||||
const result = await speakSsmlAsyncPromise(ssml)
|
||||
if (result.reason === ResultReason.Canceled) {
|
||||
throw new Error(result.errorDetails)
|
||||
}
|
||||
|
||||
return {
|
||||
audioData,
|
||||
audioData: Buffer.from(result.audioData),
|
||||
speechMarks,
|
||||
}
|
||||
} catch (error) {
|
||||
|
||||
Reference in New Issue
Block a user