diff --git a/packages/api/package.json b/packages/api/package.json index d5fc669a8..505362610 100644 --- a/packages/api/package.json +++ b/packages/api/package.json @@ -48,6 +48,7 @@ "@sentry/node": "^5.26.0", "@types/showdown": "^2.0.6", "addressparser": "^1.0.1", + "alfaaz": "^1.1.0", "apollo-datasource": "^3.3.1", "apollo-server-express": "^3.6.3", "axios": "^0.27.2", @@ -113,7 +114,6 @@ "uuid": "^8.3.1", "voca": "^1.4.0", "winston": "^3.3.3", - "word-counting": "^1.1.4", "yaml": "^2.4.1", "youtubei": "1.4.0" }, diff --git a/packages/api/src/repository/library_item.ts b/packages/api/src/repository/library_item.ts index fd933faf2..7b1470922 100644 --- a/packages/api/src/repository/library_item.ts +++ b/packages/api/src/repository/library_item.ts @@ -7,7 +7,7 @@ import { wordsCount } from '../utils/helpers' const convertToLibraryItem = (item: DeepPartial) => { return { ...item, - wordCount: item.wordCount ?? wordsCount(item.readableContent || ''), + wordCount: item.wordCount ?? wordsCount(item.readableContent || '', true), } } diff --git a/packages/api/src/resolvers/function_resolvers.ts b/packages/api/src/resolvers/function_resolvers.ts index aa226adc3..91a511036 100644 --- a/packages/api/src/resolvers/function_resolvers.ts +++ b/packages/api/src/resolvers/function_resolvers.ts @@ -432,7 +432,7 @@ export const functionResolvers = { if (article.wordCount) return article.wordCount return article.readableContent - ? wordsCount(article.readableContent) + ? wordsCount(article.readableContent, true) : undefined }, async labels(article: LibraryItem, _: unknown, ctx: ResolverContext) { @@ -499,7 +499,9 @@ export const functionResolvers = { }, wordsCount(item: LibraryItem) { if (item.wordCount) return item.wordCount - return item.readableContent ? wordsCount(item.readableContent) : undefined + return item.readableContent + ? wordsCount(item.readableContent, true) + : undefined }, siteIcon(item: LibraryItem) { if (item.siteIcon && !isBase64Image(item.siteIcon)) { diff --git a/packages/api/src/services/save_email.ts b/packages/api/src/services/save_email.ts index eb75616ac..c7c096a33 100644 --- a/packages/api/src/services/save_email.ts +++ b/packages/api/src/services/save_email.ts @@ -106,7 +106,9 @@ export const saveEmail = async ( state: LibraryItemState.Succeeded, siteIcon, siteName: parseResult.parsedContent?.siteName ?? undefined, - wordCount: wordsCount(content), + wordCount: parseResult.parsedContent?.textContent + ? wordsCount(parseResult.parsedContent.textContent) + : wordsCount(content, true), subscription: input.author, folder: input.folder, labelNames: labels.map((label) => label.name), diff --git a/packages/api/src/services/save_page.ts b/packages/api/src/services/save_page.ts index 6d7a16be0..c21a5db31 100644 --- a/packages/api/src/services/save_page.ts +++ b/packages/api/src/services/save_page.ts @@ -273,7 +273,9 @@ export const parsedContentToLibraryItem = ({ siteName: parsedContent?.siteName, itemLanguage: parsedContent?.language, siteIcon: parsedContent?.siteIcon, - wordCount: wordsCount(parsedContent?.textContent || ''), + wordCount: parsedContent?.textContent + ? wordsCount(parsedContent.textContent) + : wordsCount(parsedContent?.content || '', true), contentReader: contentReaderForLibraryItem(itemType, uploadFileId), subscription: rssFeedUrl, folder: folder || 'inbox', diff --git a/packages/api/src/utils/helpers.ts b/packages/api/src/utils/helpers.ts index 97f495859..d35261f13 100644 --- a/packages/api/src/utils/helpers.ts +++ b/packages/api/src/utils/helpers.ts @@ -1,12 +1,13 @@ /* eslint-disable @typescript-eslint/no-unsafe-assignment */ import languages from '@cospired/i18n-iso-languages' +import { countWords } from 'alfaaz' import crypto from 'crypto' import Redis from 'ioredis' +import { parseHTML } from 'linkedom' import normalizeUrl from 'normalize-url' import path from 'path' import _ from 'underscore' import slugify from 'voca/slugify' -import wordsCounter from 'word-counting' import { LibraryItem, LibraryItemState } from '../entity/library_item' import { CreateArticleError } from '../generated/graphql' import { createPubSubClient } from '../pubsub' @@ -175,9 +176,14 @@ export const wait = (ms: number): Promise => { }) } -export const wordsCount = (text: string, isHtml = true): number => { +export const wordsCount = (text: string, isHtml?: boolean): number => { try { - return wordsCounter(text, { isHtml }).wordsCount + if (isHtml) { + const dom = parseHTML(text).window.document + text = dom.body.textContent || '' + } + + return countWords(text) } catch { return 0 } diff --git a/yarn.lock b/yarn.lock index c36ba5702..071cdbc7a 100644 --- a/yarn.lock +++ b/yarn.lock @@ -9674,6 +9674,11 @@ ajv@^8.11.0: require-from-string "^2.0.2" uri-js "^4.2.2" +alfaaz@^1.1.0: + version "1.1.0" + resolved "https://registry.yarnpkg.com/alfaaz/-/alfaaz-1.1.0.tgz#535d2388eab3f654f27a82bca6bd6025e3b5e907" + integrity sha512-J/P07R41APslK7NmD5303bwStN8jpRA4DdvtLeAr1Jhfj6XWGrASUWI0G6jbWjJAZyw3Lu1Pb4J8rsM/cb+xDQ== + allotment@^1.20.2: version "1.20.2" resolved "https://registry.yarnpkg.com/allotment/-/allotment-1.20.2.tgz#5ea3a630b3265479debb69156658244711f83843" @@ -17831,7 +17836,7 @@ html-tags@^3.1.0: resolved "https://registry.yarnpkg.com/html-tags/-/html-tags-3.2.0.tgz#dbb3518d20b726524e4dd43de397eb0a95726961" integrity sha512-vy7ClnArOZwCnqZgvv+ddgHgJiAFXe3Ge9ML5/mBctVJoUoYPCdxVucOywjDARn6CVoh3dRSFdPHy2sX80L0Wg== -html-to-text@^8.1.0, html-to-text@^8.2.1: +html-to-text@^8.2.1: version "8.2.1" resolved "https://registry.yarnpkg.com/html-to-text/-/html-to-text-8.2.1.tgz#4a75b8a1b646149bd71c50527adb568990bf459b" integrity sha512-aN/3JvAk8qFsWVeE9InWAWueLXrbkoVZy0TkzaGhoRBC2gCFEeRLDDJN3/ijIGHohy6H+SZzUQWN/hcYtaPK8w== @@ -29148,7 +29153,7 @@ string-template@~0.2.1: resolved "https://registry.yarnpkg.com/string-template/-/string-template-0.2.1.tgz#42932e598a352d01fc22ec3367d9d84eec6c9add" integrity sha1-QpMuWYo1LQH8IuwzZ9nYTuxsmt0= -"string-width-cjs@npm:string-width@^4.2.0": +"string-width-cjs@npm:string-width@^4.2.0", "string-width@^1.0.2 || 2 || 3 || 4", string-width@^4.2.2, string-width@^4.2.3: version "4.2.3" resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010" integrity sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g== @@ -29174,15 +29179,6 @@ string-width@^1.0.1: is-fullwidth-code-point "^2.0.0" strip-ansi "^4.0.0" -"string-width@^1.0.2 || 2 || 3 || 4", string-width@^4.2.2, string-width@^4.2.3: - version "4.2.3" - resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010" - integrity sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g== - dependencies: - emoji-regex "^8.0.0" - is-fullwidth-code-point "^3.0.0" - strip-ansi "^6.0.1" - string-width@^3.0.0: version "3.1.0" resolved "https://registry.yarnpkg.com/string-width/-/string-width-3.1.0.tgz#22767be21b62af1081574306f69ac51b62203961" @@ -29337,7 +29333,7 @@ string_decoder@~1.1.1: dependencies: safe-buffer "~5.1.0" -"strip-ansi-cjs@npm:strip-ansi@^6.0.1": +"strip-ansi-cjs@npm:strip-ansi@^6.0.1", strip-ansi@^6.0.1: version "6.0.1" resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.1.tgz#9e26c63d30f53443e9489495b2105d37b67a85d9" integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A== @@ -29372,13 +29368,6 @@ strip-ansi@^6.0.0: dependencies: ansi-regex "^5.0.0" -strip-ansi@^6.0.1: - version "6.0.1" - resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-6.0.1.tgz#9e26c63d30f53443e9489495b2105d37b67a85d9" - integrity sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A== - dependencies: - ansi-regex "^5.0.1" - strip-ansi@^7.0.0: version "7.0.1" resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-7.0.1.tgz#61740a08ce36b61e50e65653f07060d000975fb2" @@ -32000,19 +31989,6 @@ winston@^3.3.3: triple-beam "^1.3.0" winston-transport "^4.5.0" -word-counting@^1.1.4: - version "1.1.4" - resolved "https://registry.yarnpkg.com/word-counting/-/word-counting-1.1.4.tgz#4d772df20bd86e2e8b00596c8e1ab2f355578ddd" - integrity sha512-SsAKEoa6FzQTV7fR27vDOHO9m2f7cnGhppP+e0c6JCn+pDg88kCMVfrt0Qr8XcbbW2o6HIum4yFX8WEnxZ5xLA== - dependencies: - html-to-text "^8.1.0" - word-regex "^0.1.2" - -word-regex@^0.1.2: - version "0.1.2" - resolved "https://registry.yarnpkg.com/word-regex/-/word-regex-0.1.2.tgz#a3bc7f2d222ce4a93c246c3ef69458f61f511639" - integrity sha512-4jK/OibPeindR9o/sryObhVWNgD2LJCMJFWEME69p48sEYpE9axfyjHK+RqYcOeoEoqcqJEPE9iMdiiFpXHo0Q== - word-wrap@^1.2.3, word-wrap@~1.2.3: version "1.2.4" resolved "https://registry.yarnpkg.com/word-wrap/-/word-wrap-1.2.4.tgz#cb4b50ec9aca570abd1f52f33cd45b6c61739a9f" @@ -32057,7 +32033,7 @@ workerpool@6.2.1: resolved "https://registry.yarnpkg.com/workerpool/-/workerpool-6.2.1.tgz#46fc150c17d826b86a008e5a4508656777e9c343" integrity sha512-ILEIE97kDZvF9Wb9f6h5aXK4swSlKGUcOEGiIYb2OOu/IrDU9iwj0fD//SsA6E5ibwJxpEvhullJY4Sl4GcpAw== -"wrap-ansi-cjs@npm:wrap-ansi@^7.0.0": +"wrap-ansi-cjs@npm:wrap-ansi@^7.0.0", wrap-ansi@^7.0.0: version "7.0.0" resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43" integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q== @@ -32083,15 +32059,6 @@ wrap-ansi@^6.0.1, wrap-ansi@^6.2.0: string-width "^4.1.0" strip-ansi "^6.0.0" -wrap-ansi@^7.0.0: - version "7.0.0" - resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-7.0.0.tgz#67e145cff510a6a6984bdf1152911d69d2eb9e43" - integrity sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q== - dependencies: - ansi-styles "^4.0.0" - string-width "^4.1.0" - strip-ansi "^6.0.0" - wrap-ansi@^8.1.0: version "8.1.0" resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-8.1.0.tgz#56dc22368ee570face1b49819975d9b9a5ead214"