diff --git a/packages/readabilityjs/Readability.js b/packages/readabilityjs/Readability.js index 3fa82285c..9fc6e6e88 100644 --- a/packages/readabilityjs/Readability.js +++ b/packages/readabilityjs/Readability.js @@ -2783,7 +2783,7 @@ Readability.prototype = { var contentScore = 0; if (weight + contentScore < 0) { - this.log("Cleaning Conditionally by weight", { node, className: node.className, children: Array.from(node.children).map(ch => ch.tagName)}); + this.log("Cleaning Conditionally by weight", { text: node.innerText, className: node.className, children: Array.from(node.children).map(ch => ch.tagName)}); return true; } @@ -2816,8 +2816,12 @@ Readability.prototype = { embedCount++; } + var innerText = this._getInnerText(node) var linkDensity = this._getLinkDensity(node); - var contentLength = this._getInnerText(node).length; + var contentLength = innerText.length; + + const emojiRegex = /[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F700}-\u{1F77F}\u{1F780}-\u{1F7FF}\u{1F800}-\u{1F8FF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}]/gu; + const textHasEmoji = Array.from(innerText.matchAll(emojiRegex)).length > 0 if (hasTweetInChildren(node)) { return false; @@ -2833,7 +2837,7 @@ Readability.prototype = { (img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) || (!isList && li > p) || (input > Math.floor(p/3)) || - (!isList && headingDensity < 0.9 && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) || + (!isList && headingDensity < 0.9 && contentLength < 25 && !textHasEmoji && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) || // ignores link density for the links inside the .post-body div (the main content) (!isList && weight < 25 && linkDensity > 0.2 && !(this.CLASSES_TO_SKIP.some((c) => parentClasses.contains(c))) )|| // some website like https://substack.com might have their custom styling of tweets diff --git 
a/packages/readabilityjs/test/generate-testcase.js b/packages/readabilityjs/test/generate-testcase.js index f9eab147b..f34031c1f 100644 --- a/packages/readabilityjs/test/generate-testcase.js +++ b/packages/readabilityjs/test/generate-testcase.js @@ -1,102 +1,109 @@ -var debug = false; +var debug = false -var path = require("path"); -var fs = require("fs"); -var prettyPrint = require("./utils").prettyPrint; -var htmltidy = require("htmltidy2").tidy; +var path = require('path') +var fs = require('fs') +var prettyPrint = require('./utils').prettyPrint +var htmltidy = require('htmltidy2').tidy -var { Readability, isProbablyReaderable } = require("../index"); -const { parseHTML } = require("linkedom"); +var { Readability, isProbablyReaderable } = require('../index') +const { parseHTML } = require('linkedom') -const puppeteer = require('puppeteer-extra'); +const puppeteer = require('puppeteer-extra') // Add stealth plugin to hide puppeteer usage -const StealthPlugin = require('puppeteer-extra-plugin-stealth'); -puppeteer.use(StealthPlugin()); +const StealthPlugin = require('puppeteer-extra-plugin-stealth') +puppeteer.use(StealthPlugin()) // Add adblocker plugin to block all ads and trackers (saves bandwidth) -const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker'); -puppeteer.use(AdblockerPlugin({ blockTrackers: true })); +const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker') +puppeteer.use(AdblockerPlugin({ blockTrackers: true })) -var testcaseRoot = path.join(__dirname, "test-pages"); +var testcaseRoot = path.join(__dirname, 'test-pages') -var argURL = process.argv[3]; // Could be undefined, we'll warn if it is if that is an issue. +var argURL = process.argv[3] // Could be undefined, we'll warn if it is if that is an issue. 
-const DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36' -const NON_BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36' +const DESKTOP_USER_AGENT = + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36' +const NON_BOT_DESKTOP_USER_AGENT = + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36' const NON_BOT_HOSTS = ['bloomberg.com', 'forbes.com'] -const NON_SCRIPT_HOSTS= ['medium.com', 'fastcompany.com']; +const NON_SCRIPT_HOSTS = ['medium.com', 'fastcompany.com'] const userAgentForUrl = (url) => { try { - const u = new URL(url); + const u = new URL(url) for (const host of NON_BOT_HOSTS) { if (u.hostname.endsWith(host)) { - return NON_BOT_DESKTOP_USER_AGENT; + return NON_BOT_DESKTOP_USER_AGENT } } } catch (e) { console.log('error getting user agent for url', url, e) } return DESKTOP_USER_AGENT -}; +} const enableJavascriptForUrl = (url) => { try { - const u = new URL(url); + const u = new URL(url) for (const host of NON_SCRIPT_HOSTS) { if (u.hostname.endsWith(host)) { - return false; + return false } } } catch (e) { console.log('error getting hostname for url', url, e) } return true -}; +} function generateTestcase(slug) { const options = { debug, - }; - if (slug.startsWith("newsletters/")) { - // keep the newsletter content in tables - options.keepTables = true; } - var destRoot = path.join(testcaseRoot, slug); + if (slug.startsWith('newsletters/')) { + // keep the newsletter content in tables + options.keepTables = true + options.ignoreLinkDensity = true + } + var destRoot = path.join(testcaseRoot, slug) fs.mkdir(destRoot, function (err) { if (err) { - var sourceFile = path.join(destRoot, "source.html"); + var sourceFile = path.join(destRoot, 
'source.html') fs.exists(sourceFile, function (exists) { if (exists) { - fs.readFile(sourceFile, { encoding: "utf-8" }, function (readFileErr, data) { - if (readFileErr) { - console.error("Source existed but couldn't be read?"); - process.exit(1); + fs.readFile( + sourceFile, + { encoding: 'utf-8' }, + function (readFileErr, data) { + if (readFileErr) { + console.error("Source existed but couldn't be read?") + process.exit(1) + } + onResponseReceived(null, data, destRoot, options) } - onResponseReceived(null, data, destRoot, options); - }); + ) } else { - fs.writeFile(path.join(destRoot, 'url.txt'), argURL, () => null); + fs.writeFile(path.join(destRoot, 'url.txt'), argURL, () => null) fetchSource(argURL, function (fetchErr, data) { - onResponseReceived(fetchErr, data, destRoot, options); - }); + onResponseReceived(fetchErr, data, destRoot, options) + }) } - }); - return; + }) + return } - fs.writeFile(path.join(destRoot, 'url.txt'), argURL, () => null); + fs.writeFile(path.join(destRoot, 'url.txt'), argURL, () => null) fetchSource(argURL, function (fetchErr, data) { - onResponseReceived(fetchErr, data, destRoot, options); - }); - }); + onResponseReceived(fetchErr, data, destRoot, options) + }) + }) } async function fetchSource(url, callbackFn) { if (!url) { - console.error("You should pass a URL if the source doesn't exist yet!"); - process.exit(1); + console.error("You should pass a URL if the source doesn't exist yet!") + process.exit(1) } const browser = await puppeteer.launch({ @@ -129,42 +136,42 @@ async function fetchSource(url, callbackFn) { height: 1080, isLandscape: true, isMobile: false, - width: 1920 + width: 1920, }, headless: true, executablePath: process.env.CHROMIUM_PATH || '/opt/homebrew/bin/chromium', - }); + }) - const page = await browser.newPage(); + const page = await browser.newPage() if (!enableJavascriptForUrl(url)) { - await page.setJavaScriptEnabled(false); + await page.setJavaScriptEnabled(false) } - await 
page.setUserAgent(userAgentForUrl(url)); + await page.setUserAgent(userAgentForUrl(url)) try { /* - * Disallow MathJax from running in Puppeteer and modifying the document, - * we shall instead run it in our frontend application to transform any - * mathjax content when present. - */ - await page.setRequestInterception(true); - page.on('request', request => { + * Disallow MathJax from running in Puppeteer and modifying the document, + * we shall instead run it in our frontend application to transform any + * mathjax content when present. + */ + await page.setRequestInterception(true) + page.on('request', (request) => { if ( request.resourceType() === 'script' && request.url().toLowerCase().indexOf('mathjax') > -1 ) { - request.abort(); + request.abort() } else { - request.continue(); + request.continue() } - }); + }) - await page.goto(url, { waitUntil: ['networkidle2'] }); + await page.goto(url, { waitUntil: ['networkidle2'] }) /* scroll with a 5 second timeout */ await Promise.race([ - new Promise(resolve => { - (async function () { + new Promise((resolve) => { + ;(async function () { try { await page.evaluate(`(async () => { /* credit: https://github.com/puppeteer/puppeteer/issues/305 */ @@ -181,30 +188,33 @@ async function fetchSource(url, callbackFn) { } }, 10); }); - })()`); + })()`) } catch (e) { - console.error('error in scrolling url', { e, url }); + console.error('error in scrolling url', { e, url }) } finally { - resolve(true); + resolve(true) } - })(); + })() }), page.waitForTimeout(5000), //5 second timeout - ]); + ]) // get document body with all hidden elements removed const domContent = await page.evaluate(() => { - const BI_SRC_REGEXP = /url\("(.+?)"\)/gi; + const BI_SRC_REGEXP = /url\("(.+?)"\)/gi - Array.from(document.body.getElementsByTagName('*')).forEach(el => { - const style = window.getComputedStyle(el); + Array.from(document.body.getElementsByTagName('*')).forEach((el) => { + const style = window.getComputedStyle(el) try { // Removing 
blurred images since they are mostly the copies of lazy loaded ones - if (el.tagName && ['img', 'image'].includes(el.tagName.toLowerCase())) { - const filter = style.getPropertyValue('filter'); + if ( + el.tagName && + ['img', 'image'].includes(el.tagName.toLowerCase()) + ) { + const filter = style.getPropertyValue('filter') if (filter && filter.startsWith('blur')) { - el.parentNode && el.parentNode.removeChild(el); + el.parentNode && el.parentNode.removeChild(el) } } } catch (err) { @@ -212,140 +222,164 @@ async function fetchSource(url, callbackFn) { } // convert all nodes with background image to img nodes - if (!['', 'none'].includes(style.getPropertyValue('background-image'))) { - const filter = style.getPropertyValue('filter'); + if ( + !['', 'none'].includes(style.getPropertyValue('background-image')) + ) { + const filter = style.getPropertyValue('filter') // avoiding image nodes with a blur effect creation if (filter && filter.startsWith('blur')) { - el && el.parentNode && el.parentNode.removeChild(el); + el && el.parentNode && el.parentNode.removeChild(el) } else { - const matchedSRC = BI_SRC_REGEXP.exec(style.getPropertyValue('background-image')); + const matchedSRC = BI_SRC_REGEXP.exec( + style.getPropertyValue('background-image') + ) // Using "g" flag with a regex we have to manually break down lastIndex to zero after every usage // More details here: https://stackoverflow.com/questions/1520800/why-does-a-regexp-with-global-flag-give-wrong-results - BI_SRC_REGEXP.lastIndex = 0; + BI_SRC_REGEXP.lastIndex = 0 if (matchedSRC && matchedSRC[1] && !el.src) { // Replacing element only of there are no content inside, b/c might remove important div with content. // Article example: http://www.josiahzayner.com/2017/01/genetic-designer-part-i.html // DIV with class "content-inner" has `url("https://resources.blogblog.com/blogblog/data/1kt/travel/bg_container.png")` background image. 
if (!el.textContent) { - const img = document.createElement('img'); - img.src = matchedSRC[1]; - el && el.parentNode && el.parentNode.replaceChild(img, el); + const img = document.createElement('img') + img.src = matchedSRC[1] + el && el.parentNode && el.parentNode.replaceChild(img, el) } } } } + }) + return document.documentElement.innerHTML + }) - }); - return document.documentElement.innerHTML; - }); - - sanitizeSource(domContent, callbackFn); + sanitizeSource(domContent, callbackFn) } catch (error) { console.error('Error occured while fetching content') console.error(error) } finally { - await page.close(); - await browser.close(); + await page.close() + await browser.close() } } function sanitizeSource(html, callbackFn) { - htmltidy(html, { - "indent": true, - "indent-spaces": 4, - "numeric-entities": true, - "output-xhtml": true, - "wrap": 0 - }, callbackFn); + htmltidy( + html, + { + indent: true, + 'indent-spaces': 4, + 'numeric-entities': true, + 'output-xhtml': true, + wrap: 0, + }, + callbackFn + ) } function onResponseReceived(error, source, destRoot, options) { if (error) { - console.error("Couldn't tidy source html!"); - console.error(error); - return; + console.error("Couldn't tidy source html!") + console.error(error) + return } if (debug) { - console.log("writing"); + console.log('writing') } - var sourcePath = path.join(destRoot, "source.html"); - fs.writeFile(sourcePath, source, async function(err) { + var sourcePath = path.join(destRoot, 'source.html') + fs.writeFile(sourcePath, source, async function (err) { if (err) { - console.error("Couldn't write data to source.html!"); - console.error(err); - return; + console.error("Couldn't write data to source.html!") + console.error(err) + return } if (debug) { - console.log("Running readability stuff"); + console.log('Running readability stuff') } - await runReadability(source, path.join(destRoot, "expected.html"), path.join(destRoot, "expected-metadata.json"), options); - }); + await runReadability( 
+ source, + path.join(destRoot, 'expected.html'), + path.join(destRoot, 'expected-metadata.json'), + options + ) + }) } async function runReadability(source, destPath, metadataDestPath, options) { console.log('running readability') - var uri = "http://fakehost/test/page.html"; - var myReader, result, readerable; + var uri = 'http://fakehost/test/page.html' + var myReader, result, readerable try { // Use linkedom for isProbablyReaderable because it supports querySelectorAll - var dom = parseHTML(source).document; - readerable = isProbablyReaderable(dom); + var dom = parseHTML(source).document + readerable = isProbablyReaderable(dom) // We pass `caption` as a class to check that passing in extra classes works, // given that it appears in some of the test documents. - myReader = new Readability(dom, { classesToPreserve: ["caption"], url: uri, ...options }); - result = await myReader.parse(); + myReader = new Readability(dom, { + classesToPreserve: ['caption'], + url: uri, + ...options, + }) + result = await myReader.parse() } catch (ex) { - console.error(ex); - ex.stack.forEach(console.log.bind(console)); + console.error(ex) + ex.stack.forEach(console.log.bind(console)) } console.log('result', result) if (!result) { - console.error("No content generated by readability, not going to write expected.html!"); - return; + console.error( + 'No content generated by readability, not going to write expected.html!' + ) + return } - fs.writeFile(destPath, prettyPrint(result.content), function(fileWriteErr) { + fs.writeFile(destPath, prettyPrint(result.content), function (fileWriteErr) { if (fileWriteErr) { - console.error("Couldn't write data to expected.html!"); - console.error(fileWriteErr); + console.error("Couldn't write data to expected.html!") + console.error(fileWriteErr) } // Delete the result data we don't care about checking. 
- delete result.content; - delete result.textContent; - delete result.length; - delete result.dom; + delete result.content + delete result.textContent + delete result.length + delete result.dom // Add isProbablyReaderable result - result.readerable = readerable; + result.readerable = readerable - fs.writeFile(metadataDestPath, JSON.stringify(result, null, 2) + "\n", function(metadataWriteErr) { - if (metadataWriteErr) { - console.error("Couldn't write data to expected-metadata.json!"); - console.error(metadataWriteErr); + fs.writeFile( + metadataDestPath, + JSON.stringify(result, null, 2) + '\n', + function (metadataWriteErr) { + if (metadataWriteErr) { + console.error("Couldn't write data to expected-metadata.json!") + console.error(metadataWriteErr) + } } - }); - }); + ) + }) } if (process.argv.length < 3) { - console.error("Need at least a destination slug and potentially a URL (if the slug doesn't have source)."); - process.exit(0); + console.error( + "Need at least a destination slug and potentially a URL (if the slug doesn't have source)." 
+ ) + process.exit(0) } -if (process.argv[2] === "all") { +if (process.argv[2] === 'all') { fs.readdir(testcaseRoot, function (err, files) { if (err) { - console.error("error reading testcases"); - return; + console.error('error reading testcases') + return } files.forEach(function (file) { - generateTestcase(file); - }); - }); + generateTestcase(file) + }) + }) } else { - generateTestcase(process.argv[2]); + generateTestcase(process.argv[2]) } diff --git a/packages/readabilityjs/test/test-pages/newsletters/tldr/expected-metadata.json b/packages/readabilityjs/test/test-pages/newsletters/tldr/expected-metadata.json new file mode 100644 index 000000000..2b3bc9481 --- /dev/null +++ b/packages/readabilityjs/test/test-pages/newsletters/tldr/expected-metadata.json @@ -0,0 +1,11 @@ +{ + "title": "Tesla robotaxi π, Bun's faster NodeJS replacement π¨βπ», Flexport rescinds job offers π°", + "byline": null, + "dir": null, + "excerpt": "The life expectancy of someone in the US infected with HIV is currently about the same as the rest of the population. This is largely due to...", + "siteName": "fakehost", + "siteIcon": "http://fakehost/favicon.ico", + "publishedDate": "2023-09-10T16:00:00.000Z", + "language": "English", + "readerable": false +} diff --git a/packages/readabilityjs/test/test-pages/newsletters/tldr/expected.html b/packages/readabilityjs/test/test-pages/newsletters/tldr/expected.html new file mode 100644 index 000000000..52ef6a991 --- /dev/null +++ b/packages/readabilityjs/test/test-pages/newsletters/tldr/expected.html @@ -0,0 +1,129 @@ +
📱
+Apple will likely unveil this year's newest iPhones, Apple Watches, and AirPods at its fall event on Tuesday. This page contains a list of the products expected to be announced at the event along with industry reports, analyses, and the latest rumors. It also discusses what likely won't be unveiled at the event but may appear in the near future.
+
+ Inside Tesla's decision to build a $25,000 "global car" (5 minute read)
+
Elon Musk apparently vetoed plans for Tesla to build a $25,000 electric car in favor of developing robotaxis. He finally relented when aides revealed a plan to build both a robotaxi and an inexpensive small car side by side. A $25,000 electric car would be a game changer for the auto industry. Tesla's Master Plan always called for using money made from early models to bring down the cost of future vehicles.
+🚀
+
+ At least 5 people have been cured of HIV. Is the AIDS pandemic ending? (10 minute read)
+
The life expectancy of someone in the US infected with HIV is currently about the same as the rest of the population. This is largely due to antiretroviral therapy, a daily treatment regimen that decreases the level of HIV in a person's blood to the point it's no longer detectable or transmittable. There have been a handful of people over the last couple of decades that have been able to reach undetectable HIV levels while off treatment, essentially meaning they've been cured of the disease. New universal cures are now on the horizon.
+
+ The first experiment to produce oxygen on another planet has come to an end (4 minute read)
+
The Mars Oxygen In-Situ Resource Utilization Experiment (MOXIE) has come to an end. It exceeded NASA's initial goals, demonstrating capabilities that could help future astronauts explore Mars. It will also make it possible to generate oxygen for powering spacecraft. MOXIE has generated 122 grams of oxygen since the experiment kicked off two years ago. It works by converting Mars' plentiful carbon dioxide into oxygen.
+💻
+Bun is an all-in-one toolkit for running, building, and debugging JavaScript and TypeScript. Version 1.0 is now available. Bun is a drop-in replacement for Node.js and a JavaScript bundler with an esbuild-compatible plugin API. It also functions as an npm-compatible package manager and a Jest-compatible test runner. Bun can run virtually any Node.js application in the wild.
+NGINX Unit is a universal web app server that serves static media assets and runs application code in seven languages. It is intended as a universal building block for any web architecture. NGINX Unit's native RESTful JSON API enables dynamic updates with zero interruptions and flexible configuration. It has a complex asynchronous multithreading architecture that ensures security and robustness.
+🎁
+Flexport founder Ryan Petersen has rescinded dozens of employment offers two days after Dave Clark's departure as CEO. Petersen says that the company is also looking to lease out office space. The moves are intended to get company costs under control. It appears that Flexport is now shifting from a growth at any cost strategy to curbing spending and becoming profitable.
+
+ How Elon Musk set Tesla on a new course for self-driving (9 minute read)
+
This article contains an adapted excerpt from Walter Isaacson's biography of Elon Musk. The book is set to be published on September 12. This chapter describes Musk's involvement with the company's Full Self-Driving (FSD) project. The newest version of FSD imitates what humans do. This presents some issues, as even the best drivers fudge traffic rules.
+⚡
+
+ Why prices are still going up when companies are spending less (8 minute read)
+
A look into why software prices are surging despite reduced spending - software costs are rising as budgets tighten, and companies are still waiting for long term investments to pay off.
+
+ How to pass the interview for software engineering roles in Big Tech (32 minute read)
+
 This article contains advice on how to pass job interviews from someone who has completed more than a hundred interviews as an interviewer at Amazon, Meta, and Datadog.
+
+ Japan joins Moon race with successful rocket launch (2 minute read)
+
Japan has successfully launched a rocket with a lunar lander - the lander will attempt a Moon landing in February if all goes well.
+
+ The Decomposition of Rotten Tomatoes (19 minute read)
+
Studios can game Rotten Tomatoes and boost movie ratings.
+Elysia is a fast and friendly Bun web framework that automatically infers types from code.
+
+ Report on Frontier Model Training (25 minute read)
+
This document is a collection of resources on the different aspects of frontier model training created to help people learn about the field and use the information to help inform their own models.
+TLDR is a daily newsletter with links and TLDRs of the most interesting stories in startups 🚀, tech 📱, and programming 💻! +
+ +