Ignore link density checks in newsletters

This commit is contained in:
Jackson Harper
2023-08-31 18:16:55 +08:00
parent 11f267b0bf
commit 293becf596
3 changed files with 7 additions and 4 deletions

View File

@ -139,6 +139,7 @@ declare module '@omnivore/readability' {
* By setting this to `true`, these tables will be retained. * By setting this to `true`, these tables will be retained.
*/ */
keepTables?: boolean keepTables?: boolean
ignoreLinkDensity?: boolean
} }
interface ParseResult { interface ParseResult {

View File

@ -176,6 +176,7 @@ const getReadabilityResult = async (
debug: DEBUG_MODE, debug: DEBUG_MODE,
createImageProxyUrl, createImageProxyUrl,
keepTables: isNewsletter, keepTables: isNewsletter,
ignoreLinkDensity: isNewsletter,
url, url,
}).parse() }).parse()

View File

@ -88,6 +88,7 @@ function Readability(doc, options) {
this._articleDir = null; this._articleDir = null;
this._languageCode = null; this._languageCode = null;
this._attempts = []; this._attempts = [];
// Link-density tests run by default; callers opt out with `ignoreLinkDensity`
// (the option name declared in the typings and passed by the caller).
// Note: `x || true` is always true, so it cannot be used to read an opt-out flag.
this._testLinkDensity = !options.ignoreLinkDensity;
// Configurable options // Configurable options
this._debug = !!options.debug; this._debug = !!options.debug;
@ -2828,12 +2829,12 @@ Readability.prototype = {
(img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) || (img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) ||
(!isList && li > p) || (!isList && li > p) ||
(input > Math.floor(p/3)) || (input > Math.floor(p/3)) ||
(!isList && headingDensity < 0.9 && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) || (this._testLinkDensity && !isList && headingDensity < 0.9 && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) ||
// ignores link density for the links inside the .post-body div (the main content) // ignores link density for the links inside the .post-body div (the main content)
(!isList && weight < 25 && linkDensity > 0.2 && !(this.CLASSES_TO_SKIP.some((c) => parentClasses.contains(c))) )|| (this._testLinkDensity && !isList && weight < 25 && linkDensity > 0.2 && !(this.CLASSES_TO_SKIP.some((c) => parentClasses.contains(c))) )||
// some website like https://substack.com might have their custom styling of tweets // some website like https://substack.com might have their custom styling of tweets
// we should omit ignoring their particular case by checking against "tweet" classname // we should omit ignoring their particular case by checking against "tweet" classname
(weight >= 25 && linkDensity > 0.5 && !(node.className === "tweet" && linkDensity === 1)) || (weight >= 25 && (this._testLinkDensity && linkDensity > 0.5) && !(node.className === "tweet" && linkDensity === 1)) ||
((embedCount === 1 && contentLength < 75) || embedCount > 1)) ((embedCount === 1 && contentLength < 75) || embedCount > 1))
// Allow simple lists of images to remain in pages // Allow simple lists of images to remain in pages
@ -2853,7 +2854,7 @@ Readability.prototype = {
} }
if (haveToRemove) { if (haveToRemove) {
this.log("Cleaning Conditionally", { className: node.className, children: Array.from(node.children).map(ch => ch.tagName) }); console.log("Cleaning Conditionally", { className: node.textContent, children: Array.from(node.children).map(ch => ch.tagName) });
} }
return haveToRemove; return haveToRemove;