Replace SentenceTokenizerNew with SentenceTokenizer

This commit is contained in:
Hongbo Wu
2023-04-18 09:33:48 +08:00
parent c697b0cb4d
commit c63ecc94fb

View File

@ -262,14 +262,14 @@ export const stripEmojis = (text: string): string => {
}
const textToUtterances = ({
tokenizer,
wordTokenizer,
idx,
textItems,
wordOffset,
voice,
isHtml = true,
}: {
tokenizer: WordPunctTokenizer
wordTokenizer: WordPunctTokenizer
idx: string
textItems: string[]
wordOffset: number
@ -284,7 +284,7 @@ const textToUtterances = ({
idx,
text,
wordOffset,
wordCount: tokenizer.tokenize(text).length,
wordCount: wordTokenizer.tokenize(text).length,
voice,
},
]
@ -318,7 +318,7 @@ const textToUtterances = ({
const nextText = currentText + sentence
if (nextText.length > MAX_CHARS) {
if (currentText.length > 0) {
const wordCount = tokenizer.tokenize(currentText).length
const wordCount = wordTokenizer.tokenize(currentText).length
utterances.push({
idx,
text: currentText,
@ -329,7 +329,7 @@ const textToUtterances = ({
wordOffset += wordCount
currentText = sentence
} else {
const wordCount = tokenizer.tokenize(sentence).length
const wordCount = wordTokenizer.tokenize(sentence).length
utterances.push({
idx,
text: sentence,
@ -347,7 +347,7 @@ const textToUtterances = ({
idx,
text: currentText,
wordOffset,
wordCount: tokenizer.tokenize(currentText).length,
wordCount: wordTokenizer.tokenize(currentText).length,
voice,
})
}
@ -385,13 +385,13 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
}
}
const tokenizer = new WordPunctTokenizer()
const wordTokenizer = new WordPunctTokenizer()
const utterances: Utterance[] = []
let wordOffset = 0
if (title) {
// the first utterance is the title
const titleUtterance = textToUtterances({
tokenizer,
wordTokenizer,
idx: '',
textItems: [stripEmojis(title)], // title could have emoji
wordOffset,
@ -412,7 +412,7 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
const idx = i.toString()
i = emitElement(textItems, node, true)
const newUtterances = textToUtterances({
tokenizer,
wordTokenizer,
idx,
textItems,
wordOffset,