diff --git a/packages/api/src/readability.d.ts b/packages/api/src/readability.d.ts
index 33fbe0578..ba8278b14 100644
--- a/packages/api/src/readability.d.ts
+++ b/packages/api/src/readability.d.ts
@@ -165,6 +165,7 @@ declare module '@omnivore/readability' {
 
     /** Article published date */
     publishedDate?: Date
     dom?: Element
+    language?: string
   }
 }
diff --git a/packages/api/src/utils/parser.ts b/packages/api/src/utils/parser.ts
index 33c238ff0..b397365f9 100644
--- a/packages/api/src/utils/parser.ts
+++ b/packages/api/src/utils/parser.ts
@@ -287,6 +287,7 @@ export const parsePreparedContent = async (
       siteName: article?.siteName || (await jsonLdLinkMetadata).siteName,
       siteIcon: article?.siteIcon,
       byline: article?.byline || (await jsonLdLinkMetadata).byline,
+      language: article?.language,
     })
     logRecord.parseSuccess = true
   } catch (error) {
diff --git a/packages/readabilityjs/Readability.js b/packages/readabilityjs/Readability.js
index 47c1d9386..3b77ef871 100644
--- a/packages/readabilityjs/Readability.js
+++ b/packages/readabilityjs/Readability.js
@@ -85,7 +85,7 @@ function Readability(doc, options) {
   this._articleByline = null;
   this._articlePublishedDate = null;
   this._articleDir = null;
-  this._articleSiteName = null;
+  this._languageCode = null;
   this._attempts = [];
 
   // Configurable options
@@ -1921,6 +1921,8 @@ Readability.prototype = {
       values["weibo:article:image"] ||
       values["weibo:webpage:image"];
 
+    metadata.locale = values["og:locale"];
+
     // TODO: Add canonical ULR search here as well
 
     // in many sites the meta value is escaped with HTML entities,
@@ -2833,6 +2835,34 @@ Readability.prototype = {
     return false;
   },
 
+  /**
+   * Maps a locale identifier to the English display name of its language.
+   * Accepts both the BCP 47 form ("en-US", as found in <html lang>) and the
+   * underscore form used by Open Graph's og:locale ("en_US"); only the
+   * primary language subtag is looked up.
+   *
+   * @param {string|null|undefined} code locale identifier
+   * @return {string|undefined} language name such as "English", or undefined
+   *     when code is missing, empty, or not a valid language tag
+   */
+  _getLanguage: function(code) {
+    if (typeof code !== "string") {
+      return undefined;
+    }
+    // og:locale separates subtags with "_", <html lang> with "-".
+    var primary = code.trim().split(/[-_]/)[0];
+    if (!primary) {
+      return undefined;
+    }
+    try {
+      return new Intl.DisplayNames(["en"], {type: "language"}).of(primary);
+    } catch (e) {
+      // of() throws RangeError on malformed tags; a bad lang attribute must
+      // not abort the whole parse.
+      return undefined;
+    }
+  },
+
   /**
    * Runs readability.
    *
@@ -2860,6 +2890,8 @@ Readability.prototype = {
     // Extract JSON-LD metadata before removing scripts
     var jsonLd = this._disableJSONLD ? {} : this._getJSONLD(this._doc);
 
+    this._languageCode = this._doc.documentElement.lang;
+
     // Remove script tags from the document.
     this._removeScripts(this._doc);
 
@@ -2898,11 +2930,12 @@ Readability.prototype = {
       textContent: textContent,
       length: textContent.length,
       excerpt: metadata.excerpt,
-      siteName: metadata.siteName || this._articleSiteName,
+      siteName: metadata.siteName,
       siteIcon: metadata.siteIcon,
       previewImage: metadata.previewImage,
       publishedDate: metadata.publishedDate || publishedAt || this._articlePublishedDate,
       dom: articleContent,
+      language: this._getLanguage(metadata.locale || this._languageCode),
     };
   }
 };