From 8a86f19e61847c20cef9568a8f621c4fb844c35d Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 17 Apr 2023 18:56:17 +0800 Subject: [PATCH] Upgrade natural nodejs module and use a new sentence tokenizer based on parsing --- packages/text-to-speech/package.json | 2 +- packages/text-to-speech/src/htmlToSsml.ts | 7 +++---- yarn.lock | 20 ++++++++++++++++---- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/packages/text-to-speech/package.json b/packages/text-to-speech/package.json index 2c4e8591b..6c40d7faf 100644 --- a/packages/text-to-speech/package.json +++ b/packages/text-to-speech/package.json @@ -42,7 +42,7 @@ "jsonwebtoken": "^8.5.1", "linkedom": "^0.14.12", "microsoft-cognitiveservices-speech-sdk": "^1.22.0", - "natural": "^5.2.3", + "natural": "^6.2.0", "redis": "^4.3.1", "underscore": "^1.13.4" } diff --git a/packages/text-to-speech/src/htmlToSsml.ts b/packages/text-to-speech/src/htmlToSsml.ts index e6c1207d6..d3bfa63c2 100644 --- a/packages/text-to-speech/src/htmlToSsml.ts +++ b/packages/text-to-speech/src/htmlToSsml.ts @@ -1,6 +1,6 @@ -import { parseHTML } from 'linkedom' -import { SentenceTokenizer, WordPunctTokenizer } from 'natural' import { htmlToText } from 'html-to-text' +import { parseHTML } from 'linkedom' +import { SentenceTokenizerNew, WordPunctTokenizer } from 'natural' // this code needs to be kept in sync with the // frontend code in: useReadingProgressAnchor @@ -303,9 +303,8 @@ const textToUtterances = ({ text = parseHTML(text).document.documentElement.textContent ?? text console.info('Converted HTML to text:', text) } - const MAX_CHARS = 256 - const sentenceTokenizer = new SentenceTokenizer() + const sentenceTokenizer = new SentenceTokenizerNew() const sentences = sentenceTokenizer.tokenize(text) let currentText = '' // split text to max 256 chars per utterance and diff --git a/yarn.lock b/yarn.lock index 2154a36d6..d0d6c0934 100644 --- a/yarn.lock +++ b/yarn.lock @@ -9470,6 +9470,11 @@ addressparser@^1.0.1: resolved "https://registry.yarnpkg.com/addressparser/-/addressparser-1.0.1.tgz#47afbe1a2a9262191db6838e4fd1d39b40821746" integrity sha512-aQX7AISOMM7HFE0iZ3+YnD07oIeJqWGVnJ+ZIKaBZAk03ftmVYVqsGas/rbXKR21n4D/hKCSHypvcyOkds/xzg== +afinn-165-financialmarketnews@^3.0.0: + version "3.0.0" + resolved "https://registry.yarnpkg.com/afinn-165-financialmarketnews/-/afinn-165-financialmarketnews-3.0.0.tgz#cf422577775bf94f9bc156f3f001a1f29338c3d8" + integrity sha512-0g9A1S3ZomFIGDTzZ0t6xmv4AuokBvBmpes8htiyHpH7N4xDmvSQL6UxL/Zcs2ypRb3VwgCscaD8Q3zEawKYhw== + afinn-165@^1.0.2: version "1.0.4" resolved "https://registry.yarnpkg.com/afinn-165/-/afinn-165-1.0.4.tgz#3abf6b8922dd5db84d84e0abd155924381dd73a4" @@ -20594,14 +20599,16 @@ natural-compare@^1.4.0: resolved "https://registry.yarnpkg.com/natural-compare/-/natural-compare-1.4.0.tgz#4abebfeed7541f2c27acfb29bdbbd15c8d5ba4f7" integrity sha1-Sr6/7tdUHywnrPspvbvRXI1bpPc= -natural@^5.2.3: - version "5.2.3" - resolved "https://registry.yarnpkg.com/natural/-/natural-5.2.3.tgz#bfd9b9710139313edf6ec3172435e48998884535" - integrity sha512-fsGGpbU15YBc2oQCEsi0t7ZeF3VmKyxDhgWucQTPk4zaDFzeZtquRbZt4xlznN2ZUlH88215HcThMYaDHFM48Q== +natural@^6.2.0: + version "6.2.0" + resolved "https://registry.yarnpkg.com/natural/-/natural-6.2.0.tgz#74a45c66336e5c35057aba0fafb0f5351c81a201" + integrity sha512-/+ceiLjldGcMgCtryGJV6jS2IslPeLE+bwjXry9jOFNl586J1rV5egDa4X8nnPCZmzCeXrE4uHajpkuB/ILH2Q== dependencies: afinn-165 "^1.0.2" + afinn-165-financialmarketnews "^3.0.0" apparatus "^0.0.10" safe-stable-stringify "^2.2.0" + stopwords-iso "^1.1.0" sylvester "^0.0.12" underscore "^1.9.1" wordnet-db "^3.1.11" @@ -25446,6 +25453,11 @@ statuses@2.0.1: resolved "https://registry.yarnpkg.com/statuses/-/statuses-1.5.0.tgz#161c7dac177659fd9811f43771fa99381478628c" integrity sha1-Fhx9rBd2Wf2YEfQ3cfqZOBR4Yow= +stopwords-iso@^1.1.0: + version "1.1.0" + resolved "https://registry.yarnpkg.com/stopwords-iso/-/stopwords-iso-1.1.0.tgz#dc303db6b0842d4290bc1339b4eaf37b94219395" + integrity sha512-I6GPS/E0zyieHehMRPQcqkiBMJKGgLta+1hREixhoLPqEA0AlVFiC43dl8uPpmkkeRdDMzYRWFWk5/l9x7nmNg== + store2@^2.12.0: version "2.13.2" resolved "https://registry.yarnpkg.com/store2/-/store2-2.13.2.tgz#01ad8802ca5b445b9c316b55e72645c13a3cd7e3"