detect language from content if locale not found in metadata

This commit is contained in:
Hongbo Wu
2023-11-27 22:54:39 +08:00
parent ee3b14e46d
commit 10a21adc33
3 changed files with 56 additions and 28 deletions

View File

@ -23,6 +23,7 @@
var parseSrcset = require('parse-srcset');
var htmlEntities = require('html-entities')
const axios = require("axios");
const cld = require('cld');
/** Checks whether an element is a wrapper for tweet */
const hasTweetInChildren = element => {
@ -3029,17 +3030,26 @@ Readability.prototype = {
return false;
},
_getLanguage: function(code) {
if (!code) {
// Default to English
return 'English';
}
_getLanguage: async function(locale, content, languageCode) {
try {
if (!locale) {
// detect language from the html content
if (content) {
const languages = (await cld.detect(content, { isHTML: true })).languages;
console.log('Detected languages: ', languages);
if (languages.length > 0) {
return languages[0].name;
}
}
}
console.log('Trying to get language name from locale and language code: ', locale, languageCode);
let lang = new Intl.DisplayNames(['en'], {type: 'language'});
return lang.of(code.split('_')[0]);
} catch {
return 'English'
const code = locale || languageCode.replace('_', '-') || 'en';
return lang.of(code);
} catch (error) {
console.error('Failed to get language', error);
return 'English';
}
},
@ -3112,12 +3122,15 @@ Readability.prototype = {
}
var textContent = articleContent.textContent;
const content = this._serializer(articleContent);
console.log('language code', metadata.locale, this._languageCode);
const language = await this._getLanguage(metadata.locale, content, this._languageCode);
return {
title: this._articleTitle,
// remove \n and extra spaces and trim the string
byline: author ? author.replace(/\n/g, ' ').replace(/\s+/g, ' ').trim() : null,
dir: this._articleDir,
content: this._serializer(articleContent),
content,
textContent: textContent,
length: textContent.length,
excerpt: metadata.excerpt,
@ -3125,7 +3138,7 @@ Readability.prototype = {
siteIcon: metadata.siteIcon,
previewImage: metadata.previewImage,
publishedDate,
language: this._getLanguage(metadata.locale || this._languageCode),
language,
};
}
};

View File

@ -39,6 +39,7 @@
"sinon": "^7.3.2"
},
"dependencies": {
"cld": "^2.9.1",
"html-entities": "^2.3.2",
"parse-srcset": "^1.0.2"
},

View File

@ -10811,6 +10811,15 @@ classnames@^2.2.6:
resolved "https://registry.yarnpkg.com/classnames/-/classnames-2.3.1.tgz#dfcfa3891e306ec1dad105d0e88f4417b8535e8e"
integrity sha512-OlQdbZ7gLfGarSqxesMesDa5uz7KFbID8Kpq/SxIoNGDqY8lSYs0D+hhtBXhcdB3rcbXArFr7vlHheLk1voeNA==
cld@^2.9.1:
version "2.9.1"
resolved "https://registry.yarnpkg.com/cld/-/cld-2.9.1.tgz#0c6685672d9f4612dfeb75eabfdd17bf282a87a6"
integrity sha512-GfNd5TM2He+pCsEhJWBgnLAoB0pnn6jCoYom7pzv04wpF2d54C0nasHZ8BW59KijwnVRmEBcI+imRnVbj5fVjw==
dependencies:
glob "7"
node-addon-api "*"
underscore "^1.12.1"
clean-css@^4.2.3:
version "4.2.4"
resolved "https://registry.yarnpkg.com/clean-css/-/clean-css-4.2.4.tgz#733bf46eba4e607c6891ea57c24a989356831178"
@ -15130,6 +15139,18 @@ glob-to-regexp@^0.4.1:
resolved "https://registry.yarnpkg.com/glob-to-regexp/-/glob-to-regexp-0.4.1.tgz#c75297087c851b9a578bd217dd59a92f59fe546e"
integrity sha512-lkX1HJXwyMcprw/5YUZc2s7DrpAiHB21/V+E1rHUrVNokkvB6bqMzT0VfV6/86ZNabt1k14YOIaT7nDvOX3Iiw==
glob@7, glob@^7.0.5:
version "7.2.3"
resolved "https://registry.yarnpkg.com/glob/-/glob-7.2.3.tgz#b8df0fb802bbfa8e89bd1d938b4e16578ed44f2b"
integrity sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==
dependencies:
fs.realpath "^1.0.0"
inflight "^1.0.4"
inherits "2"
minimatch "^3.1.1"
once "^1.3.0"
path-is-absolute "^1.0.0"
glob@7.1.4:
version "7.1.4"
resolved "https://registry.yarnpkg.com/glob/-/glob-7.1.4.tgz#aa608a2f6c577ad357e1ae5a5c26d9a8d1969255"
@ -15189,18 +15210,6 @@ glob@^10.2.2:
minipass "^5.0.0 || ^6.0.2 || ^7.0.0"
path-scurry "^1.10.1"
glob@^7.0.5:
version "7.2.3"
resolved "https://registry.yarnpkg.com/glob/-/glob-7.2.3.tgz#b8df0fb802bbfa8e89bd1d938b4e16578ed44f2b"
integrity sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==
dependencies:
fs.realpath "^1.0.0"
inflight "^1.0.4"
inherits "2"
minimatch "^3.1.1"
once "^1.3.0"
path-is-absolute "^1.0.0"
glob@^8.0.0:
version "8.0.3"
resolved "https://registry.yarnpkg.com/glob/-/glob-8.0.3.tgz#415c6eb2deed9e502c68fa44a272e6da6eeca42e"
@ -20814,6 +20823,11 @@ node-abort-controller@^3.0.1:
resolved "https://registry.yarnpkg.com/node-abort-controller/-/node-abort-controller-3.0.1.tgz#f91fa50b1dee3f909afabb7e261b1e1d6b0cb74e"
integrity sha512-/ujIVxthRs+7q6hsdjHMaj8hRG9NuWmwrz+JdRwZ14jdFoKSkm+vDsCbF9PLpnSqjaWQJuTmVtcWHNLr+vrOFw==
node-addon-api@*:
version "7.0.0"
resolved "https://registry.yarnpkg.com/node-addon-api/-/node-addon-api-7.0.0.tgz#8136add2f510997b3b94814f4af1cce0b0e3962e"
integrity sha512-vgbBJTS4m5/KkE16t5Ly0WW9hz46swAstv0hYYwMtbG7AznRhNyfLRe8HZAiWIpcHzoO7HxhLuBQj9rJ/Ho0ZA==
node-addon-api@^1.2.0:
version "1.7.2"
resolved "https://registry.yarnpkg.com/node-addon-api/-/node-addon-api-1.7.2.tgz#3df30b95720b53c24e59948b49532b662444f54d"
@ -27285,16 +27299,16 @@ undefsafe@^2.0.5:
resolved "https://registry.yarnpkg.com/undefsafe/-/undefsafe-2.0.5.tgz#38733b9327bdcd226db889fb723a6efd162e6e2c"
integrity sha512-WxONCrssBM8TSPRqN5EmsjVrsv4A8X12J4ArBiiayv3DyyG3ZlIg6yysuuSYdZsVz3TKcTg2fd//Ujd4CHV1iA==
underscore@^1.12.1, underscore@^1.13.6, underscore@~1.13.2:
version "1.13.6"
resolved "https://registry.yarnpkg.com/underscore/-/underscore-1.13.6.tgz#04786a1f589dc6c09f761fc5f45b89e935136441"
integrity sha512-+A5Sja4HP1M08MaXya7p5LvjuM7K6q/2EaC0+iovj/wOcMsTzMvDFbasi/oSapiwOlt252IqsKqPjCl7huKS0A==
underscore@^1.13.4, underscore@^1.9.1:
version "1.13.4"
resolved "https://registry.yarnpkg.com/underscore/-/underscore-1.13.4.tgz#7886b46bbdf07f768e0052f1828e1dcab40c0dee"
integrity sha512-BQFnUDuAQ4Yf/cYY5LNrK9NCJFKriaRbD9uR1fTeXnBeoa97W0i41qkZfGO9pSo8I5KzjAcSY2XYtdf0oKd7KQ==
underscore@^1.13.6, underscore@~1.13.2:
version "1.13.6"
resolved "https://registry.yarnpkg.com/underscore/-/underscore-1.13.6.tgz#04786a1f589dc6c09f761fc5f45b89e935136441"
integrity sha512-+A5Sja4HP1M08MaXya7p5LvjuM7K6q/2EaC0+iovj/wOcMsTzMvDFbasi/oSapiwOlt252IqsKqPjCl7huKS0A==
undici@^4.9.3:
version "4.14.1"
resolved "https://registry.yarnpkg.com/undici/-/undici-4.14.1.tgz#7633b143a8a10d6d63335e00511d071e8d52a1d9"