detect language from content if locale not found in metadata
This commit is contained in:
@ -23,6 +23,7 @@
|
||||
var parseSrcset = require('parse-srcset');
|
||||
var htmlEntities = require('html-entities')
|
||||
const axios = require("axios");
|
||||
const cld = require('cld');
|
||||
|
||||
/** Checks whether an element is a wrapper for tweet */
|
||||
const hasTweetInChildren = element => {
|
||||
@ -3029,17 +3030,26 @@ Readability.prototype = {
|
||||
return false;
|
||||
},
|
||||
|
||||
_getLanguage: function(code) {
|
||||
if (!code) {
|
||||
// Default to English
|
||||
return 'English';
|
||||
}
|
||||
|
||||
_getLanguage: async function(locale, content, languageCode) {
|
||||
try {
|
||||
if (!locale) {
|
||||
// detect language from the html content
|
||||
if (content) {
|
||||
const languages = (await cld.detect(content, { isHTML: true })).languages;
|
||||
console.log('Detected languages: ', languages);
|
||||
if (languages.length > 0) {
|
||||
return languages[0].name;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
console.log('Trying to get language name from locale and language code: ', locale, languageCode);
|
||||
let lang = new Intl.DisplayNames(['en'], {type: 'language'});
|
||||
return lang.of(code.split('_')[0]);
|
||||
} catch {
|
||||
return 'English'
|
||||
const code = locale || languageCode.replace('_', '-') || 'en';
|
||||
return lang.of(code);
|
||||
} catch (error) {
|
||||
console.error('Failed to get language', error);
|
||||
return 'English';
|
||||
}
|
||||
},
|
||||
|
||||
@ -3112,12 +3122,15 @@ Readability.prototype = {
|
||||
}
|
||||
|
||||
var textContent = articleContent.textContent;
|
||||
const content = this._serializer(articleContent);
|
||||
console.log('language code', metadata.locale, this._languageCode);
|
||||
const language = await this._getLanguage(metadata.locale, content, this._languageCode);
|
||||
return {
|
||||
title: this._articleTitle,
|
||||
// remove \n and extra spaces and trim the string
|
||||
byline: author ? author.replace(/\n/g, ' ').replace(/\s+/g, ' ').trim() : null,
|
||||
dir: this._articleDir,
|
||||
content: this._serializer(articleContent),
|
||||
content,
|
||||
textContent: textContent,
|
||||
length: textContent.length,
|
||||
excerpt: metadata.excerpt,
|
||||
@ -3125,7 +3138,7 @@ Readability.prototype = {
|
||||
siteIcon: metadata.siteIcon,
|
||||
previewImage: metadata.previewImage,
|
||||
publishedDate,
|
||||
language: this._getLanguage(metadata.locale || this._languageCode),
|
||||
language,
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
@ -39,6 +39,7 @@
|
||||
"sinon": "^7.3.2"
|
||||
},
|
||||
"dependencies": {
|
||||
"cld": "^2.9.1",
|
||||
"html-entities": "^2.3.2",
|
||||
"parse-srcset": "^1.0.2"
|
||||
},
|
||||
|
||||
48
yarn.lock
48
yarn.lock
@ -10811,6 +10811,15 @@ classnames@^2.2.6:
|
||||
resolved "https://registry.yarnpkg.com/classnames/-/classnames-2.3.1.tgz#dfcfa3891e306ec1dad105d0e88f4417b8535e8e"
|
||||
integrity sha512-OlQdbZ7gLfGarSqxesMesDa5uz7KFbID8Kpq/SxIoNGDqY8lSYs0D+hhtBXhcdB3rcbXArFr7vlHheLk1voeNA==
|
||||
|
||||
cld@^2.9.1:
|
||||
version "2.9.1"
|
||||
resolved "https://registry.yarnpkg.com/cld/-/cld-2.9.1.tgz#0c6685672d9f4612dfeb75eabfdd17bf282a87a6"
|
||||
integrity sha512-GfNd5TM2He+pCsEhJWBgnLAoB0pnn6jCoYom7pzv04wpF2d54C0nasHZ8BW59KijwnVRmEBcI+imRnVbj5fVjw==
|
||||
dependencies:
|
||||
glob "7"
|
||||
node-addon-api "*"
|
||||
underscore "^1.12.1"
|
||||
|
||||
clean-css@^4.2.3:
|
||||
version "4.2.4"
|
||||
resolved "https://registry.yarnpkg.com/clean-css/-/clean-css-4.2.4.tgz#733bf46eba4e607c6891ea57c24a989356831178"
|
||||
@ -15130,6 +15139,18 @@ glob-to-regexp@^0.4.1:
|
||||
resolved "https://registry.yarnpkg.com/glob-to-regexp/-/glob-to-regexp-0.4.1.tgz#c75297087c851b9a578bd217dd59a92f59fe546e"
|
||||
integrity sha512-lkX1HJXwyMcprw/5YUZc2s7DrpAiHB21/V+E1rHUrVNokkvB6bqMzT0VfV6/86ZNabt1k14YOIaT7nDvOX3Iiw==
|
||||
|
||||
glob@7, glob@^7.0.5:
|
||||
version "7.2.3"
|
||||
resolved "https://registry.yarnpkg.com/glob/-/glob-7.2.3.tgz#b8df0fb802bbfa8e89bd1d938b4e16578ed44f2b"
|
||||
integrity sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==
|
||||
dependencies:
|
||||
fs.realpath "^1.0.0"
|
||||
inflight "^1.0.4"
|
||||
inherits "2"
|
||||
minimatch "^3.1.1"
|
||||
once "^1.3.0"
|
||||
path-is-absolute "^1.0.0"
|
||||
|
||||
glob@7.1.4:
|
||||
version "7.1.4"
|
||||
resolved "https://registry.yarnpkg.com/glob/-/glob-7.1.4.tgz#aa608a2f6c577ad357e1ae5a5c26d9a8d1969255"
|
||||
@ -15189,18 +15210,6 @@ glob@^10.2.2:
|
||||
minipass "^5.0.0 || ^6.0.2 || ^7.0.0"
|
||||
path-scurry "^1.10.1"
|
||||
|
||||
glob@^7.0.5:
|
||||
version "7.2.3"
|
||||
resolved "https://registry.yarnpkg.com/glob/-/glob-7.2.3.tgz#b8df0fb802bbfa8e89bd1d938b4e16578ed44f2b"
|
||||
integrity sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==
|
||||
dependencies:
|
||||
fs.realpath "^1.0.0"
|
||||
inflight "^1.0.4"
|
||||
inherits "2"
|
||||
minimatch "^3.1.1"
|
||||
once "^1.3.0"
|
||||
path-is-absolute "^1.0.0"
|
||||
|
||||
glob@^8.0.0:
|
||||
version "8.0.3"
|
||||
resolved "https://registry.yarnpkg.com/glob/-/glob-8.0.3.tgz#415c6eb2deed9e502c68fa44a272e6da6eeca42e"
|
||||
@ -20814,6 +20823,11 @@ node-abort-controller@^3.0.1:
|
||||
resolved "https://registry.yarnpkg.com/node-abort-controller/-/node-abort-controller-3.0.1.tgz#f91fa50b1dee3f909afabb7e261b1e1d6b0cb74e"
|
||||
integrity sha512-/ujIVxthRs+7q6hsdjHMaj8hRG9NuWmwrz+JdRwZ14jdFoKSkm+vDsCbF9PLpnSqjaWQJuTmVtcWHNLr+vrOFw==
|
||||
|
||||
node-addon-api@*:
|
||||
version "7.0.0"
|
||||
resolved "https://registry.yarnpkg.com/node-addon-api/-/node-addon-api-7.0.0.tgz#8136add2f510997b3b94814f4af1cce0b0e3962e"
|
||||
integrity sha512-vgbBJTS4m5/KkE16t5Ly0WW9hz46swAstv0hYYwMtbG7AznRhNyfLRe8HZAiWIpcHzoO7HxhLuBQj9rJ/Ho0ZA==
|
||||
|
||||
node-addon-api@^1.2.0:
|
||||
version "1.7.2"
|
||||
resolved "https://registry.yarnpkg.com/node-addon-api/-/node-addon-api-1.7.2.tgz#3df30b95720b53c24e59948b49532b662444f54d"
|
||||
@ -27285,16 +27299,16 @@ undefsafe@^2.0.5:
|
||||
resolved "https://registry.yarnpkg.com/undefsafe/-/undefsafe-2.0.5.tgz#38733b9327bdcd226db889fb723a6efd162e6e2c"
|
||||
integrity sha512-WxONCrssBM8TSPRqN5EmsjVrsv4A8X12J4ArBiiayv3DyyG3ZlIg6yysuuSYdZsVz3TKcTg2fd//Ujd4CHV1iA==
|
||||
|
||||
underscore@^1.12.1, underscore@^1.13.6, underscore@~1.13.2:
|
||||
version "1.13.6"
|
||||
resolved "https://registry.yarnpkg.com/underscore/-/underscore-1.13.6.tgz#04786a1f589dc6c09f761fc5f45b89e935136441"
|
||||
integrity sha512-+A5Sja4HP1M08MaXya7p5LvjuM7K6q/2EaC0+iovj/wOcMsTzMvDFbasi/oSapiwOlt252IqsKqPjCl7huKS0A==
|
||||
|
||||
underscore@^1.13.4, underscore@^1.9.1:
|
||||
version "1.13.4"
|
||||
resolved "https://registry.yarnpkg.com/underscore/-/underscore-1.13.4.tgz#7886b46bbdf07f768e0052f1828e1dcab40c0dee"
|
||||
integrity sha512-BQFnUDuAQ4Yf/cYY5LNrK9NCJFKriaRbD9uR1fTeXnBeoa97W0i41qkZfGO9pSo8I5KzjAcSY2XYtdf0oKd7KQ==
|
||||
|
||||
underscore@^1.13.6, underscore@~1.13.2:
|
||||
version "1.13.6"
|
||||
resolved "https://registry.yarnpkg.com/underscore/-/underscore-1.13.6.tgz#04786a1f589dc6c09f761fc5f45b89e935136441"
|
||||
integrity sha512-+A5Sja4HP1M08MaXya7p5LvjuM7K6q/2EaC0+iovj/wOcMsTzMvDFbasi/oSapiwOlt252IqsKqPjCl7huKS0A==
|
||||
|
||||
undici@^4.9.3:
|
||||
version "4.14.1"
|
||||
resolved "https://registry.yarnpkg.com/undici/-/undici-4.14.1.tgz#7633b143a8a10d6d63335e00511d071e8d52a1d9"
|
||||
|
||||
Reference in New Issue
Block a user