From 10a21adc335122a5c654a2b816ff38daf04b183e Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 27 Nov 2023 22:54:39 +0800 Subject: [PATCH] detect language from content if locale not found in metadata --- packages/readabilityjs/Readability.js | 35 +++++++++++++------ packages/readabilityjs/package.json | 1 + yarn.lock | 48 +++++++++++++++++---------- 3 files changed, 56 insertions(+), 28 deletions(-) diff --git a/packages/readabilityjs/Readability.js b/packages/readabilityjs/Readability.js index 2934841bd..6c9403e03 100644 --- a/packages/readabilityjs/Readability.js +++ b/packages/readabilityjs/Readability.js @@ -23,6 +23,7 @@ var parseSrcset = require('parse-srcset'); var htmlEntities = require('html-entities') const axios = require("axios"); +const cld = require('cld'); /** Checks whether an element is a wrapper for tweet */ const hasTweetInChildren = element => { @@ -3029,17 +3030,26 @@ Readability.prototype = { return false; }, - _getLanguage: function(code) { - if (!code) { - // Default to English - return 'English'; - } - + _getLanguage: async function(locale, content, languageCode) { try { + if (!locale) { + // detect language from the html content + if (content) { + const languages = (await cld.detect(content, { isHTML: true })).languages; + console.log('Detected languages: ', languages); + if (languages.length > 0) { + return languages[0].name; + } + } + } + + console.log('Trying to get language name from locale and language code: ', locale, languageCode); let lang = new Intl.DisplayNames(['en'], {type: 'language'}); - return lang.of(code.split('_')[0]); - } catch { - return 'English' + const code = locale || languageCode.replace('_', '-') || 'en'; + return lang.of(code); + } catch (error) { + console.error('Failed to get language', error); + return 'English'; } }, @@ -3112,12 +3122,15 @@ Readability.prototype = { } var textContent = articleContent.textContent; + const content = this._serializer(articleContent); + console.log('language code', metadata.locale, this._languageCode); + const language = await this._getLanguage(metadata.locale, content, this._languageCode); return { title: this._articleTitle, // remove \n and extra spaces and trim the string byline: author ? author.replace(/\n/g, ' ').replace(/\s+/g, ' ').trim() : null, dir: this._articleDir, - content: this._serializer(articleContent), + content, textContent: textContent, length: textContent.length, excerpt: metadata.excerpt, @@ -3125,7 +3138,7 @@ Readability.prototype = { siteIcon: metadata.siteIcon, previewImage: metadata.previewImage, publishedDate, - language: this._getLanguage(metadata.locale || this._languageCode), + language, }; } }; diff --git a/packages/readabilityjs/package.json b/packages/readabilityjs/package.json index 41c440cdc..0cccc78f5 100644 --- a/packages/readabilityjs/package.json +++ b/packages/readabilityjs/package.json @@ -39,6 +39,7 @@ "sinon": "^7.3.2" }, "dependencies": { + "cld": "^2.9.1", "html-entities": "^2.3.2", "parse-srcset": "^1.0.2" }, diff --git a/yarn.lock b/yarn.lock index 1091ebb97..7347c7690 100644 --- a/yarn.lock +++ b/yarn.lock @@ -10811,6 +10811,15 @@ classnames@^2.2.6: resolved "https://registry.yarnpkg.com/classnames/-/classnames-2.3.1.tgz#dfcfa3891e306ec1dad105d0e88f4417b8535e8e" integrity sha512-OlQdbZ7gLfGarSqxesMesDa5uz7KFbID8Kpq/SxIoNGDqY8lSYs0D+hhtBXhcdB3rcbXArFr7vlHheLk1voeNA== +cld@^2.9.1: + version "2.9.1" + resolved "https://registry.yarnpkg.com/cld/-/cld-2.9.1.tgz#0c6685672d9f4612dfeb75eabfdd17bf282a87a6" + integrity sha512-GfNd5TM2He+pCsEhJWBgnLAoB0pnn6jCoYom7pzv04wpF2d54C0nasHZ8BW59KijwnVRmEBcI+imRnVbj5fVjw== + dependencies: + glob "7" + node-addon-api "*" + underscore "^1.12.1" + clean-css@^4.2.3: version "4.2.4" resolved "https://registry.yarnpkg.com/clean-css/-/clean-css-4.2.4.tgz#733bf46eba4e607c6891ea57c24a989356831178" @@ -15130,6 +15139,18 @@ glob-to-regexp@^0.4.1: resolved "https://registry.yarnpkg.com/glob-to-regexp/-/glob-to-regexp-0.4.1.tgz#c75297087c851b9a578bd217dd59a92f59fe546e" integrity sha512-lkX1HJXwyMcprw/5YUZc2s7DrpAiHB21/V+E1rHUrVNokkvB6bqMzT0VfV6/86ZNabt1k14YOIaT7nDvOX3Iiw== +glob@7, glob@^7.0.5: + version "7.2.3" + resolved "https://registry.yarnpkg.com/glob/-/glob-7.2.3.tgz#b8df0fb802bbfa8e89bd1d938b4e16578ed44f2b" + integrity sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q== + dependencies: + fs.realpath "^1.0.0" + inflight "^1.0.4" + inherits "2" + minimatch "^3.1.1" + once "^1.3.0" + path-is-absolute "^1.0.0" + glob@7.1.4: version "7.1.4" resolved "https://registry.yarnpkg.com/glob/-/glob-7.1.4.tgz#aa608a2f6c577ad357e1ae5a5c26d9a8d1969255" @@ -15189,18 +15210,6 @@ glob@^10.2.2: minipass "^5.0.0 || ^6.0.2 || ^7.0.0" path-scurry "^1.10.1" -glob@^7.0.5: - version "7.2.3" - resolved "https://registry.yarnpkg.com/glob/-/glob-7.2.3.tgz#b8df0fb802bbfa8e89bd1d938b4e16578ed44f2b" - integrity sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q== - dependencies: - fs.realpath "^1.0.0" - inflight "^1.0.4" - inherits "2" - minimatch "^3.1.1" - once "^1.3.0" - path-is-absolute "^1.0.0" - glob@^8.0.0: version "8.0.3" resolved "https://registry.yarnpkg.com/glob/-/glob-8.0.3.tgz#415c6eb2deed9e502c68fa44a272e6da6eeca42e" @@ -20814,6 +20823,11 @@ node-abort-controller@^3.0.1: resolved "https://registry.yarnpkg.com/node-abort-controller/-/node-abort-controller-3.0.1.tgz#f91fa50b1dee3f909afabb7e261b1e1d6b0cb74e" integrity sha512-/ujIVxthRs+7q6hsdjHMaj8hRG9NuWmwrz+JdRwZ14jdFoKSkm+vDsCbF9PLpnSqjaWQJuTmVtcWHNLr+vrOFw== +node-addon-api@*: + version "7.0.0" + resolved "https://registry.yarnpkg.com/node-addon-api/-/node-addon-api-7.0.0.tgz#8136add2f510997b3b94814f4af1cce0b0e3962e" + integrity sha512-vgbBJTS4m5/KkE16t5Ly0WW9hz46swAstv0hYYwMtbG7AznRhNyfLRe8HZAiWIpcHzoO7HxhLuBQj9rJ/Ho0ZA== + node-addon-api@^1.2.0: version "1.7.2" resolved "https://registry.yarnpkg.com/node-addon-api/-/node-addon-api-1.7.2.tgz#3df30b95720b53c24e59948b49532b662444f54d" @@ -27285,16 +27299,16 @@ undefsafe@^2.0.5: resolved "https://registry.yarnpkg.com/undefsafe/-/undefsafe-2.0.5.tgz#38733b9327bdcd226db889fb723a6efd162e6e2c" integrity sha512-WxONCrssBM8TSPRqN5EmsjVrsv4A8X12J4ArBiiayv3DyyG3ZlIg6yysuuSYdZsVz3TKcTg2fd//Ujd4CHV1iA== +underscore@^1.12.1, underscore@^1.13.6, underscore@~1.13.2: + version "1.13.6" + resolved "https://registry.yarnpkg.com/underscore/-/underscore-1.13.6.tgz#04786a1f589dc6c09f761fc5f45b89e935136441" + integrity sha512-+A5Sja4HP1M08MaXya7p5LvjuM7K6q/2EaC0+iovj/wOcMsTzMvDFbasi/oSapiwOlt252IqsKqPjCl7huKS0A== + underscore@^1.13.4, underscore@^1.9.1: version "1.13.4" resolved "https://registry.yarnpkg.com/underscore/-/underscore-1.13.4.tgz#7886b46bbdf07f768e0052f1828e1dcab40c0dee" integrity sha512-BQFnUDuAQ4Yf/cYY5LNrK9NCJFKriaRbD9uR1fTeXnBeoa97W0i41qkZfGO9pSo8I5KzjAcSY2XYtdf0oKd7KQ== -underscore@^1.13.6, underscore@~1.13.2: - version "1.13.6" - resolved "https://registry.yarnpkg.com/underscore/-/underscore-1.13.6.tgz#04786a1f589dc6c09f761fc5f45b89e935136441" - integrity sha512-+A5Sja4HP1M08MaXya7p5LvjuM7K6q/2EaC0+iovj/wOcMsTzMvDFbasi/oSapiwOlt252IqsKqPjCl7huKS0A== - undici@^4.9.3: version "4.14.1" resolved "https://registry.yarnpkg.com/undici/-/undici-4.14.1.tgz#7633b143a8a10d6d63335e00511d071e8d52a1d9"