From 14b7ed5252705220302b0114d5cdab208f936878 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Thu, 20 Apr 2023 19:44:08 +0800 Subject: [PATCH] Fall back to the old sentence tokenizer if the new one fails --- packages/text-to-speech/src/htmlToSsml.ts | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/packages/text-to-speech/src/htmlToSsml.ts b/packages/text-to-speech/src/htmlToSsml.ts index 8e03d6ff2..41e780011 100644 --- a/packages/text-to-speech/src/htmlToSsml.ts +++ b/packages/text-to-speech/src/htmlToSsml.ts @@ -1,6 +1,10 @@ import { htmlToText } from 'html-to-text' import { parseHTML } from 'linkedom' -import { SentenceTokenizerNew, WordPunctTokenizer } from 'natural' +import { + SentenceTokenizer, + SentenceTokenizerNew, + WordPunctTokenizer, +} from 'natural' // this code needs to be kept in sync with the // frontend code in: useReadingProgressAnchor @@ -304,8 +308,17 @@ const textToUtterances = ({ console.info('Converted HTML to text:', text) } const MAX_CHARS = 256 - const sentenceTokenizer = new SentenceTokenizerNew() - const sentences = sentenceTokenizer.tokenize(text) + let sentences: string[] = [] + try { + // use new sentence tokenizer + const sentenceTokenizer = new SentenceTokenizerNew() + sentences = sentenceTokenizer.tokenize(text) + } catch (err) { + console.debug('Unable to tokenize sentences, text:', text, ', error:', err) + // fallback to old sentence tokenizer + const sentenceTokenizer = new SentenceTokenizer() + sentences = sentenceTokenizer.tokenize(text) + } let currentText = '' // split text to max 256 chars per utterance and // use nlp lib to detect sentences and