Ignore link density checks in newsletters
This commit is contained in:
1
packages/api/src/readability.d.ts
vendored
1
packages/api/src/readability.d.ts
vendored
@@ -139,6 +139,7 @@ declare module '@omnivore/readability' {
|
|||||||
* By setting this to `true`, these tables will be retained.
|
* By setting this to `true`, these tables will be retained.
|
||||||
*/
|
*/
|
||||||
keepTables?: boolean
|
keepTables?: boolean
|
||||||
|
ignoreLinkDensity?: boolean
|
||||||
}
|
}
|
||||||
|
|
||||||
interface ParseResult {
|
interface ParseResult {
|
||||||
|
|||||||
@@ -176,6 +176,7 @@ const getReadabilityResult = async (
|
|||||||
debug: DEBUG_MODE,
|
debug: DEBUG_MODE,
|
||||||
createImageProxyUrl,
|
createImageProxyUrl,
|
||||||
keepTables: isNewsletter,
|
keepTables: isNewsletter,
|
||||||
|
ignoreLinkDensity: isNewsletter,
|
||||||
url,
|
url,
|
||||||
}).parse()
|
}).parse()
|
||||||
|
|
||||||
|
|||||||
@@ -88,6 +88,7 @@ function Readability(doc, options) {
|
|||||||
this._articleDir = null;
|
this._articleDir = null;
|
||||||
this._languageCode = null;
|
this._languageCode = null;
|
||||||
this._attempts = [];
|
this._attempts = [];
|
||||||
|
this._testLinkDensity = options.testLinkDensity || true
|
||||||
|
|
||||||
// Configurable options
|
// Configurable options
|
||||||
this._debug = !!options.debug;
|
this._debug = !!options.debug;
|
||||||
@@ -2828,12 +2829,12 @@ Readability.prototype = {
|
|||||||
(img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) ||
|
(img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) ||
|
||||||
(!isList && li > p) ||
|
(!isList && li > p) ||
|
||||||
(input > Math.floor(p/3)) ||
|
(input > Math.floor(p/3)) ||
|
||||||
(!isList && headingDensity < 0.9 && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) ||
|
(this._testLinkDensity && !isList && headingDensity < 0.9 && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) ||
|
||||||
// ignores link density for the links inside the .post-body div (the main content)
|
// ignores link density for the links inside the .post-body div (the main content)
|
||||||
(!isList && weight < 25 && linkDensity > 0.2 && !(this.CLASSES_TO_SKIP.some((c) => parentClasses.contains(c))) )||
|
(this._testLinkDensity && !isList && weight < 25 && linkDensity > 0.2 && !(this.CLASSES_TO_SKIP.some((c) => parentClasses.contains(c))) )||
|
||||||
// some website like https://substack.com might have their custom styling of tweets
|
// some website like https://substack.com might have their custom styling of tweets
|
||||||
// we should omit ignoring their particular case by checking against "tweet" classname
|
// we should omit ignoring their particular case by checking against "tweet" classname
|
||||||
(weight >= 25 && linkDensity > 0.5 && !(node.className === "tweet" && linkDensity === 1)) ||
|
(weight >= 25 && (this._testLinkDensity && linkDensity > 0.5) && !(node.className === "tweet" && linkDensity === 1)) ||
|
||||||
((embedCount === 1 && contentLength < 75) || embedCount > 1))
|
((embedCount === 1 && contentLength < 75) || embedCount > 1))
|
||||||
|
|
||||||
// Allow simple lists of images to remain in pages
|
// Allow simple lists of images to remain in pages
|
||||||
@@ -2853,7 +2854,7 @@ Readability.prototype = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (haveToRemove) {
|
if (haveToRemove) {
|
||||||
this.log("Cleaning Conditionally", { className: node.className, children: Array.from(node.children).map(ch => ch.tagName) });
|
console.log("Cleaning Conditionally", { className: node.textContent, children: Array.from(node.children).map(ch => ch.tagName) });
|
||||||
}
|
}
|
||||||
|
|
||||||
return haveToRemove;
|
return haveToRemove;
|
||||||
|
|||||||
Reference in New Issue
Block a user