Parse language in readability

This commit is contained in:
Hongbo Wu
2022-05-23 21:51:49 +08:00
parent 0ef9bb3944
commit 7d4d1d7b67
3 changed files with 14 additions and 2 deletions

View File

@ -165,6 +165,7 @@ declare module '@omnivore/readability' {
/** Article published date */
publishedDate?: Date
dom?: Element
language?: string
}
}

View File

@ -287,6 +287,7 @@ export const parsePreparedContent = async (
siteName: article?.siteName || (await jsonLdLinkMetadata).siteName,
siteIcon: article?.siteIcon,
byline: article?.byline || (await jsonLdLinkMetadata).byline,
language: article?.language,
})
logRecord.parseSuccess = true
} catch (error) {

View File

@ -85,7 +85,7 @@ function Readability(doc, options) {
this._articleByline = null;
this._articlePublishedDate = null;
this._articleDir = null;
this._articleSiteName = null;
this._languageCode = null;
this._attempts = [];
// Configurable options
@ -1921,6 +1921,8 @@ Readability.prototype = {
values["weibo:article:image"] ||
values["weibo:webpage:image"];
metadata.locale = values["og:locale"];
// TODO: Add canonical URL search here as well
// in many sites the meta value is escaped with HTML entities,
@ -2833,6 +2835,11 @@ Readability.prototype = {
return false;
},
_getLanguage: function(code) {
let lang = new Intl.DisplayNames(['en'], {type: 'language'});
return lang.of(code.split('-')[0]);
},
/**
* Runs readability.
*
@ -2860,6 +2867,8 @@ Readability.prototype = {
// Extract JSON-LD metadata before removing scripts
var jsonLd = this._disableJSONLD ? {} : this._getJSONLD(this._doc);
this._languageCode = this._doc.documentElement.lang
// Remove script tags from the document.
this._removeScripts(this._doc);
@ -2898,11 +2907,12 @@ Readability.prototype = {
textContent: textContent,
length: textContent.length,
excerpt: metadata.excerpt,
siteName: metadata.siteName || this._articleSiteName,
siteName: metadata.siteName,
siteIcon: metadata.siteIcon,
previewImage: metadata.previewImage,
publishedDate: metadata.publishedDate || publishedAt || this._articlePublishedDate,
dom: articleContent,
language: this._getLanguage(metadata.locale || this._languageCode),
};
}
};