fix: word count in CJK languages

This commit is contained in:
Hongbo Wu
2024-07-04 12:28:01 +08:00
parent 56032b0765
commit e9dcdc6dd1
7 changed files with 30 additions and 51 deletions

View File

@ -48,6 +48,7 @@
"@sentry/node": "^5.26.0",
"@types/showdown": "^2.0.6",
"addressparser": "^1.0.1",
"alfaaz": "^1.1.0",
"apollo-datasource": "^3.3.1",
"apollo-server-express": "^3.6.3",
"axios": "^0.27.2",
@ -113,7 +114,6 @@
"uuid": "^8.3.1",
"voca": "^1.4.0",
"winston": "^3.3.3",
"word-counting": "^1.1.4",
"yaml": "^2.4.1",
"youtubei": "1.4.0"
},

View File

@ -7,7 +7,7 @@ import { wordsCount } from '../utils/helpers'
/**
 * Returns a shallow copy of `item` with `wordCount` filled in.
 *
 * An existing `item.wordCount` is kept whenever it is neither null nor
 * undefined (`??`), so a stored 0 is preserved. Otherwise the count is
 * computed from `item.readableContent`, with a falsy value replaced by ''.
 *
 * NOTE(review): two `wordCount` entries appear below, differing only in the
 * explicit `true` second argument (the `isHtml` flag of `wordsCount`). They
 * look like the before/after lines of this commit shown without +/- markers;
 * confirm against the real file, which should contain only one of them.
 */
const convertToLibraryItem = (item: DeepPartial<LibraryItem>) => {
return {
...item,
wordCount: item.wordCount ?? wordsCount(item.readableContent || ''),
wordCount: item.wordCount ?? wordsCount(item.readableContent || '', true),
}
}

View File

@ -432,7 +432,7 @@ export const functionResolvers = {
if (article.wordCount) return article.wordCount
return article.readableContent
? wordsCount(article.readableContent)
? wordsCount(article.readableContent, true)
: undefined
},
async labels(article: LibraryItem, _: unknown, ctx: ResolverContext) {
@ -499,7 +499,9 @@ export const functionResolvers = {
},
/**
 * Resolves the word count of a library item.
 *
 * Returns the stored `item.wordCount` when it is truthy; a stored 0 or a
 * missing value falls through to counting `item.readableContent`. Yields
 * undefined when the item has no readable content to count.
 *
 * NOTE(review): two return statements follow the guard, differing only in
 * the explicit `true` second argument passed to `wordsCount`. They look like
 * the before/after lines of this commit shown without +/- markers; confirm
 * against the real file, which should contain only one of them.
 */
wordsCount(item: LibraryItem) {
// Truthiness check: only a non-zero stored count short-circuits.
if (item.wordCount) return item.wordCount
return item.readableContent ? wordsCount(item.readableContent) : undefined
return item.readableContent
? wordsCount(item.readableContent, true)
: undefined
},
siteIcon(item: LibraryItem) {
if (item.siteIcon && !isBase64Image(item.siteIcon)) {

View File

@ -106,7 +106,9 @@ export const saveEmail = async (
state: LibraryItemState.Succeeded,
siteIcon,
siteName: parseResult.parsedContent?.siteName ?? undefined,
wordCount: wordsCount(content),
wordCount: parseResult.parsedContent?.textContent
? wordsCount(parseResult.parsedContent.textContent)
: wordsCount(content, true),
subscription: input.author,
folder: input.folder,
labelNames: labels.map((label) => label.name),

View File

@ -273,7 +273,9 @@ export const parsedContentToLibraryItem = ({
siteName: parsedContent?.siteName,
itemLanguage: parsedContent?.language,
siteIcon: parsedContent?.siteIcon,
wordCount: wordsCount(parsedContent?.textContent || ''),
wordCount: parsedContent?.textContent
? wordsCount(parsedContent.textContent)
: wordsCount(parsedContent?.content || '', true),
contentReader: contentReaderForLibraryItem(itemType, uploadFileId),
subscription: rssFeedUrl,
folder: folder || 'inbox',

View File

@ -1,12 +1,13 @@
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
import languages from '@cospired/i18n-iso-languages'
import { countWords } from 'alfaaz'
import crypto from 'crypto'
import Redis from 'ioredis'
import { parseHTML } from 'linkedom'
import normalizeUrl from 'normalize-url'
import path from 'path'
import _ from 'underscore'
import slugify from 'voca/slugify'
import wordsCounter from 'word-counting'
import { LibraryItem, LibraryItemState } from '../entity/library_item'
import { CreateArticleError } from '../generated/graphql'
import { createPubSubClient } from '../pubsub'
@ -175,9 +176,14 @@ export const wait = (ms: number): Promise<void> => {
})
}
export const wordsCount = (text: string, isHtml = true): number => {
export const wordsCount = (text: string, isHtml?: boolean): number => {
try {
return wordsCounter(text, { isHtml }).wordsCount
if (isHtml) {
const dom = parseHTML(text).window.document
text = dom.body.textContent || ''
}
return countWords(text)
} catch {
return 0
}

View File

@ -9674,6 +9674,11 @@ ajv@^8.11.0:
require-from-string "^2.0.2"
uri-js "^4.2.2"
alfaaz@^1.1.0:
version "1.1.0"
resolved "https://registry.yarnpkg.com/alfaaz/-/alfaaz-1.1.0.tgz#535d2388eab3f654f27a82bca6bd6025e3b5e907"
integrity sha512-J/P07R41APslK7NmD5303bwStN8jpRA4DdvtLeAr1Jhfj6XWGrASUWI0G6jbWjJAZyw3Lu1Pb4J8rsM/cb+xDQ==
allotment@^1.20.2:
version "1.20.2"
resolved "https://registry.yarnpkg.com/allotment/-/allotment-1.20.2.tgz#5ea3a630b3265479debb69156658244711f83843"
@ -17831,7 +17836,7 @@ html-tags@^3.1.0:
resolved "https://registry.yarnpkg.com/html-tags/-/html-tags-3.2.0.tgz#dbb3518d20b726524e4dd43de397eb0a95726961"
integrity sha512-vy7ClnArOZwCnqZgvv+ddgHgJiAFXe3Ge9ML5/mBctVJoUoYPCdxVucOywjDARn6CVoh3dRSFdPHy2sX80L0Wg==
html-to-text@^8.1.0, html-to-text@^8.2.1:
html-to-text@^8.2.1:
version "8.2.1"
resolved "https://registry.yarnpkg.com/html-to-text/-/html-to-text-8.2.1.tgz#4a75b8a1b646149bd71c50527adb568990bf459b"
integrity sha512-aN/3JvAk8qFsWVeE9InWAWueLXrbkoVZy0TkzaGhoRBC2gCFEeRLDDJN3/ijIGHohy6H+SZzUQWN/hcYtaPK8w==
@ -29148,7 +29153,7 @@ string-template@~0.2.1:
resolved "https://registry.yarnpkg.com/string-template/-/string-template-0.2.1.tgz#42932e598a352d01fc22ec3367d9d84eec6c9add"
integrity sha1-QpMuWYo1LQH8IuwzZ9nYTuxsmt0=
"string-width-cjs@npm:string-width@^4.2.0":
"string-width-cjs@npm:string-width@^4.2.0", "string-width@^1.0.2 || 2 || 3 || 4", string-width@^4.2.2, string-width@^4.2.3:
version "4.2.3"
resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010"
integrity sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==
@ -29174,15 +29179,6 @@ string-width@^1.0.1:
is-fullwidth-code-point "^2.0.0"
strip-ansi "^4.0.0"
"string-width@^1.0.2 || 2 || 3 || 4", string-width@^4.2.2, string-width@^4.2.3:
version "4.2.3"
resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010"
integrity sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==
dependencies:
emoji-regex "^8.0.0"
is-fullwidth-code-point "^3.0.0"
strip-ansi "^6.0.1"
string-width@^3.0.0:
version "3.1.0"
resolved "https://registry.yarnpkg.com/string-width/-/string-width-3.1.0.tgz#22767be21b62af1081574306f69ac51b62203961"
@ -29337,7 +29333,7 @@ string_decoder@~1.1.1:
dependencies:
safe-buffer "~5.1.0"
"strip-ansi-cjs@npm:strip-ansi@^6.0.1":
"strip-ansi-cjs@npm:strip-ansi@^6.0.1", strip-ansi@^6.0.1:
version "6.0.1"
resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.1.tgz#9e26c63d30f53443e9489495b2105d37b67a85d9"
integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==
@ -29372,13 +29368,6 @@ strip-ansi@^6.0.0:
dependencies:
ansi-regex "^5.0.0"
strip-ansi@^6.0.1:
version "6.0.1"
resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.1.tgz#9e26c63d30f53443e9489495b2105d37b67a85d9"
integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==
dependencies:
ansi-regex "^5.0.1"
strip-ansi@^7.0.0:
version "7.0.1"
resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-7.0.1.tgz#61740a08ce36b61e50e65653f07060d000975fb2"
@ -32000,19 +31989,6 @@ winston@^3.3.3:
triple-beam "^1.3.0"
winston-transport "^4.5.0"
word-counting@^1.1.4:
version "1.1.4"
resolved "https://registry.yarnpkg.com/word-counting/-/word-counting-1.1.4.tgz#4d772df20bd86e2e8b00596c8e1ab2f355578ddd"
integrity sha512-SsAKEoa6FzQTV7fR27vDOHO9m2f7cnGhppP+e0c6JCn+pDg88kCMVfrt0Qr8XcbbW2o6HIum4yFX8WEnxZ5xLA==
dependencies:
html-to-text "^8.1.0"
word-regex "^0.1.2"
word-regex@^0.1.2:
version "0.1.2"
resolved "https://registry.yarnpkg.com/word-regex/-/word-regex-0.1.2.tgz#a3bc7f2d222ce4a93c246c3ef69458f61f511639"
integrity sha512-4jK/OibPeindR9o/sryObhVWNgD2LJCMJFWEME69p48sEYpE9axfyjHK+RqYcOeoEoqcqJEPE9iMdiiFpXHo0Q==
word-wrap@^1.2.3, word-wrap@~1.2.3:
version "1.2.4"
resolved "https://registry.yarnpkg.com/word-wrap/-/word-wrap-1.2.4.tgz#cb4b50ec9aca570abd1f52f33cd45b6c61739a9f"
@ -32057,7 +32033,7 @@ workerpool@6.2.1:
resolved "https://registry.yarnpkg.com/workerpool/-/workerpool-6.2.1.tgz#46fc150c17d826b86a008e5a4508656777e9c343"
integrity sha512-ILEIE97kDZvF9Wb9f6h5aXK4swSlKGUcOEGiIYb2OOu/IrDU9iwj0fD//SsA6E5ibwJxpEvhullJY4Sl4GcpAw==
"wrap-ansi-cjs@npm:wrap-ansi@^7.0.0":
"wrap-ansi-cjs@npm:wrap-ansi@^7.0.0", wrap-ansi@^7.0.0:
version "7.0.0"
resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43"
integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==
@ -32083,15 +32059,6 @@ wrap-ansi@^6.0.1, wrap-ansi@^6.2.0:
string-width "^4.1.0"
strip-ansi "^6.0.0"
wrap-ansi@^7.0.0:
version "7.0.0"
resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43"
integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==
dependencies:
ansi-styles "^4.0.0"
string-width "^4.1.0"
strip-ansi "^6.0.0"
wrap-ansi@^8.1.0:
version "8.1.0"
resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-8.1.0.tgz#56dc22368ee570face1b49819975d9b9a5ead214"