From 293becf5960f249f552f78dd34e72ee0c8402cf9 Mon Sep 17 00:00:00 2001
From: Jackson Harper
Date: Thu, 31 Aug 2023 18:16:55 +0800
Subject: [PATCH] Ignore link density checks in newsletters

---
Review notes (ignored by `git am`, not part of the commit message):
- Readability.js previously set `_testLinkDensity` from
  `options.testLinkDensity || true`, which is always true and never read
  the `ignoreLinkDensity` option that parser.ts passes. The flag is now
  derived from `ignoreLinkDensity`.
- Dropped the hunk that replaced the existing `this.log(...)` call with an
  unconditional `console.log(...)` printing `node.textContent`.
- NOTE(review): context-line indentation was reconstructed from a
  whitespace-collapsed copy of this patch; if a hunk is rejected, retry
  with `git apply --ignore-whitespace` and confirm against the tree.

 packages/api/src/readability.d.ts     | 5 +++++
 packages/api/src/utils/parser.ts      | 1 +
 packages/readabilityjs/Readability.js | 8 +++++---
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/packages/api/src/readability.d.ts b/packages/api/src/readability.d.ts
index 6b7bf0077..4722588cd 100644
--- a/packages/api/src/readability.d.ts
+++ b/packages/api/src/readability.d.ts
@@ -139,6 +139,11 @@ declare module '@omnivore/readability' {
      * By setting this to `true`, these tables will be retained.
      */
     keepTables?: boolean
+    /**
+     * By setting this to `true`, the link-density and short-content checks
+     * applied while conditionally cleaning nodes are skipped.
+     */
+    ignoreLinkDensity?: boolean
   }
 
   interface ParseResult {
diff --git a/packages/api/src/utils/parser.ts b/packages/api/src/utils/parser.ts
index 803a60bdc..329bd977a 100644
--- a/packages/api/src/utils/parser.ts
+++ b/packages/api/src/utils/parser.ts
@@ -176,6 +176,7 @@ const getReadabilityResult = async (
       debug: DEBUG_MODE,
       createImageProxyUrl,
       keepTables: isNewsletter,
+      ignoreLinkDensity: isNewsletter,
       url,
     }).parse()
 
diff --git a/packages/readabilityjs/Readability.js b/packages/readabilityjs/Readability.js
index 2727d3ea1..84255ad80 100644
--- a/packages/readabilityjs/Readability.js
+++ b/packages/readabilityjs/Readability.js
@@ -88,6 +88,8 @@ function Readability(doc, options) {
   this._articleDir = null;
   this._languageCode = null;
   this._attempts = [];
+  // False when options.ignoreLinkDensity is set; gates the link-density pruning checks.
+  this._testLinkDensity = !options.ignoreLinkDensity;
 
   // Configurable options
   this._debug = !!options.debug;
@@ -2828,12 +2830,12 @@ Readability.prototype = {
           (img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) ||
           (!isList && li > p) ||
           (input > Math.floor(p/3)) ||
-          (!isList && headingDensity < 0.9 && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) ||
+          (this._testLinkDensity && !isList && headingDensity < 0.9 && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) ||
           // ignores link density for the links inside the .post-body div (the main content)
-          (!isList && weight < 25 && linkDensity > 0.2 && !(this.CLASSES_TO_SKIP.some((c) => parentClasses.contains(c))) )||
+          (this._testLinkDensity && !isList && weight < 25 && linkDensity > 0.2 && !(this.CLASSES_TO_SKIP.some((c) => parentClasses.contains(c))) )||
           // some website like https://substack.com might have their custom styling of tweets
           // we should omit ignoring their particular case by checking against "tweet" classname
-          (weight >= 25 && linkDensity > 0.5 && !(node.className === "tweet" && linkDensity === 1)) ||
+          (weight >= 25 && (this._testLinkDensity && linkDensity > 0.5) && !(node.className === "tweet" && linkDensity === 1)) ||
           ((embedCount === 1 && contentLength < 75) || embedCount > 1))
 
         // Allow simple lists of images to remain in pages