Upgrade the natural Node.js module from 5.2.3 to 6.2.0 and switch to its new parsing-based sentence tokenizer (SentenceTokenizerNew)

This commit is contained in:
Hongbo Wu
2023-04-17 18:56:17 +08:00
parent 45bb8971db
commit 8a86f19e61
3 changed files with 20 additions and 9 deletions

View File

@ -42,7 +42,7 @@
"jsonwebtoken": "^8.5.1",
"linkedom": "^0.14.12",
"microsoft-cognitiveservices-speech-sdk": "^1.22.0",
"natural": "^5.2.3",
"natural": "^6.2.0",
"redis": "^4.3.1",
"underscore": "^1.13.4"
}

View File

@ -1,6 +1,6 @@
import { parseHTML } from 'linkedom'
import { SentenceTokenizer, WordPunctTokenizer } from 'natural'
import { htmlToText } from 'html-to-text'
import { parseHTML } from 'linkedom'
import { SentenceTokenizerNew, WordPunctTokenizer } from 'natural'
// this code needs to be kept in sync with the
// frontend code in: useReadingProgressAnchor
@ -303,9 +303,8 @@ const textToUtterances = ({
text = parseHTML(text).document.documentElement.textContent ?? text
console.info('Converted HTML to text:', text)
}
const MAX_CHARS = 256
const sentenceTokenizer = new SentenceTokenizer()
const sentenceTokenizer = new SentenceTokenizerNew()
const sentences = sentenceTokenizer.tokenize(text)
let currentText = ''
// split text to max 256 chars per utterance and

View File

@ -9470,6 +9470,11 @@ addressparser@^1.0.1:
resolved "https://registry.yarnpkg.com/addressparser/-/addressparser-1.0.1.tgz#47afbe1a2a9262191db6838e4fd1d39b40821746"
integrity sha512-aQX7AISOMM7HFE0iZ3+YnD07oIeJqWGVnJ+ZIKaBZAk03ftmVYVqsGas/rbXKR21n4D/hKCSHypvcyOkds/xzg==
afinn-165-financialmarketnews@^3.0.0:
version "3.0.0"
resolved "https://registry.yarnpkg.com/afinn-165-financialmarketnews/-/afinn-165-financialmarketnews-3.0.0.tgz#cf422577775bf94f9bc156f3f001a1f29338c3d8"
integrity sha512-0g9A1S3ZomFIGDTzZ0t6xmv4AuokBvBmpes8htiyHpH7N4xDmvSQL6UxL/Zcs2ypRb3VwgCscaD8Q3zEawKYhw==
afinn-165@^1.0.2:
version "1.0.4"
resolved "https://registry.yarnpkg.com/afinn-165/-/afinn-165-1.0.4.tgz#3abf6b8922dd5db84d84e0abd155924381dd73a4"
@ -20594,14 +20599,16 @@ natural-compare@^1.4.0:
resolved "https://registry.yarnpkg.com/natural-compare/-/natural-compare-1.4.0.tgz#4abebfeed7541f2c27acfb29bdbbd15c8d5ba4f7"
integrity sha1-Sr6/7tdUHywnrPspvbvRXI1bpPc=
natural@^5.2.3:
version "5.2.3"
resolved "https://registry.yarnpkg.com/natural/-/natural-5.2.3.tgz#bfd9b9710139313edf6ec3172435e48998884535"
integrity sha512-fsGGpbU15YBc2oQCEsi0t7ZeF3VmKyxDhgWucQTPk4zaDFzeZtquRbZt4xlznN2ZUlH88215HcThMYaDHFM48Q==
natural@^6.2.0:
version "6.2.0"
resolved "https://registry.yarnpkg.com/natural/-/natural-6.2.0.tgz#74a45c66336e5c35057aba0fafb0f5351c81a201"
integrity sha512-/+ceiLjldGcMgCtryGJV6jS2IslPeLE+bwjXry9jOFNl586J1rV5egDa4X8nnPCZmzCeXrE4uHajpkuB/ILH2Q==
dependencies:
afinn-165 "^1.0.2"
afinn-165-financialmarketnews "^3.0.0"
apparatus "^0.0.10"
safe-stable-stringify "^2.2.0"
stopwords-iso "^1.1.0"
sylvester "^0.0.12"
underscore "^1.9.1"
wordnet-db "^3.1.11"
@ -25446,6 +25453,11 @@ statuses@2.0.1:
resolved "https://registry.yarnpkg.com/statuses/-/statuses-1.5.0.tgz#161c7dac177659fd9811f43771fa99381478628c"
integrity sha1-Fhx9rBd2Wf2YEfQ3cfqZOBR4Yow=
stopwords-iso@^1.1.0:
version "1.1.0"
resolved "https://registry.yarnpkg.com/stopwords-iso/-/stopwords-iso-1.1.0.tgz#dc303db6b0842d4290bc1339b4eaf37b94219395"
integrity sha512-I6GPS/E0zyieHehMRPQcqkiBMJKGgLta+1hREixhoLPqEA0AlVFiC43dl8uPpmkkeRdDMzYRWFWk5/l9x7nmNg==
store2@^2.12.0:
version "2.13.2"
resolved "https://registry.yarnpkg.com/store2/-/store2-2.13.2.tgz#01ad8802ca5b445b9c316b55e72645c13a3cd7e3"