Files
omnivore/packages/readabilityjs/test/generate-testcase.js
2023-08-02 15:58:32 +08:00

352 lines
11 KiB
JavaScript

var debug = false;
var path = require("path");
var fs = require("fs");
var prettyPrint = require("./utils").prettyPrint;
var htmltidy = require("htmltidy2").tidy;
var { Readability, isProbablyReaderable } = require("../index");
const { parseHTML } = require("linkedom");
const puppeteer = require('puppeteer-extra');
// Add stealth plugin to hide puppeteer usage
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
puppeteer.use(StealthPlugin());
// Add adblocker plugin to block all ads and trackers (saves bandwidth)
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker');
puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
// Directory (next to this script) under which every testcase gets its own sub-directory.
var testcaseRoot = path.join(__dirname, "test-pages");
var argURL = process.argv[3]; // Could be undefined, we'll warn if it is if that is an issue.
// User agent returned by userAgentForUrl() for every host that is not listed in NON_BOT_HOSTS.
const DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
// User agent returned by userAgentForUrl() for the hosts listed in NON_BOT_HOSTS.
const NON_BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
// Hosts that are fetched with NON_BOT_DESKTOP_USER_AGENT instead of the default user agent.
const NON_BOT_HOSTS = ['bloomberg.com', 'forbes.com']
// Hosts for which enableJavascriptForUrl() returns false, i.e. pages fetched with JavaScript disabled.
const NON_SCRIPT_HOSTS= ['medium.com', 'fastcompany.com'];
/**
 * Chooses the User-Agent string to present when fetching `url`.
 *
 * Hosts listed in NON_BOT_HOSTS (and their subdomains) get
 * NON_BOT_DESKTOP_USER_AGENT; every other URL, including one that cannot be
 * parsed, gets DESKTOP_USER_AGENT.
 *
 * @param {string} url - Absolute URL of the page about to be fetched.
 * @returns {string} The User-Agent header value to use.
 */
const userAgentForUrl = (url) => {
  try {
    const { hostname } = new URL(url);
    // Match the listed domain itself or a real subdomain of it. A bare
    // suffix test would also match unrelated hosts such as "notforbes.com".
    const isNonBotHost = NON_BOT_HOSTS.some(
      (host) => hostname === host || hostname.endsWith(`.${host}`)
    );
    if (isNonBotHost) {
      return NON_BOT_DESKTOP_USER_AGENT;
    }
  } catch (e) {
    // An unparsable URL is not fatal here: report it and use the default.
    console.log('error getting user agent for url', url, e)
  }
  return DESKTOP_USER_AGENT;
};
/**
 * Decides whether the page at `url` should be loaded with JavaScript enabled.
 *
 * Hosts listed in NON_SCRIPT_HOSTS (and their subdomains) are loaded with
 * JavaScript disabled; every other URL, including one that cannot be parsed,
 * keeps JavaScript enabled.
 *
 * @param {string} url - Absolute URL of the page about to be fetched.
 * @returns {boolean} `false` when JavaScript must be disabled, `true` otherwise.
 */
const enableJavascriptForUrl = (url) => {
  try {
    const { hostname } = new URL(url);
    // Match the listed domain itself or a real subdomain of it. A bare
    // suffix test would also match unrelated hosts such as "notmedium.com".
    const isNonScriptHost = NON_SCRIPT_HOSTS.some(
      (host) => hostname === host || hostname.endsWith(`.${host}`)
    );
    if (isNonScriptHost) {
      return false;
    }
  } catch (e) {
    // An unparsable URL is not fatal here: report it and keep scripts on.
    console.log('error getting hostname for url', url, e)
  }
  return true;
};
/**
 * Generates (or regenerates) the testcase stored under `test-pages/<slug>`.
 *
 * If `source.html` already exists in the testcase directory it is reused;
 * otherwise the page at `argURL` is fetched, its URL is recorded in
 * `url.txt`, and the fetched markup becomes the source. Either way the result
 * is handed to `onResponseReceived`, which writes the expected output files.
 *
 * @param {string} slug - Testcase directory name relative to `testcaseRoot`.
 *   Slugs starting with "newsletters/" are processed with `keepTables: true`.
 */
function generateTestcase(slug) {
  const options = {};
  if (slug.startsWith("newsletters/")) {
    // keep the newsletter content in tables
    options.keepTables = true;
  }
  const destRoot = path.join(testcaseRoot, slug);
  const sourceFile = path.join(destRoot, "source.html");

  // Fetches the page from the network and feeds it into the pipeline.
  const fetchAndProcess = function () {
    // Only record the URL when one was supplied: fs.writeFile throws on
    // undefined data, which would hide fetchSource's own "pass a URL" message.
    if (argURL) {
      fs.writeFile(path.join(destRoot, 'url.txt'), argURL, function (urlWriteErr) {
        if (urlWriteErr) {
          console.error("Couldn't write data to url.txt!");
          console.error(urlWriteErr);
        }
      });
    }
    fetchSource(argURL, function (fetchErr, data) {
      onResponseReceived(fetchErr, data, destRoot, options);
    });
  };

  // `recursive` creates missing parents (needed for nested slugs such as
  // "newsletters/foo") and does not fail when the directory already exists,
  // so an error here is a genuine failure rather than "already there".
  fs.mkdir(destRoot, { recursive: true }, function (mkdirErr) {
    if (mkdirErr) {
      console.error("Couldn't create testcase directory " + destRoot);
      console.error(mkdirErr);
      return;
    }
    // Read directly instead of probing with the deprecated fs.exists; a
    // missing file shows up as ENOENT.
    fs.readFile(sourceFile, { encoding: "utf-8" }, function (readFileErr, data) {
      if (!readFileErr) {
        onResponseReceived(null, data, destRoot, options);
        return;
      }
      if (readFileErr.code === "ENOENT") {
        fetchAndProcess();
        return;
      }
      console.error("Source existed but couldn't be read?");
      process.exit(1);
    });
  });
}
/**
 * Loads `url` in headless Chromium, scrolls the page, strips blurred images,
 * converts empty background-image elements into <img> nodes, and passes the
 * resulting markup through `sanitizeSource`.
 *
 * @param {string} url - Page to fetch. The process exits with status 1 when
 *   it is missing.
 * @param {function(Error|null, string=): void} callbackFn - Node-style
 *   callback. It receives whatever `sanitizeSource` reports on success, or
 *   the error when launching, loading, or reading the page fails.
 * @returns {Promise<void>} Resolves once the page and browser are closed.
 */
async function fetchSource(url, callbackFn) {
  if (!url) {
    console.error("You should pass a URL if the source doesn't exist yet!");
    process.exit(1);
  }

  // Ensure the caller is notified at most once, even if a failure is raised
  // after the markup was already handed to the sanitizer.
  let callbackInvoked = false;
  const done = (...args) => {
    if (callbackInvoked) {
      return;
    }
    callbackInvoked = true;
    callbackFn(...args);
  };

  let browser;
  let page;
  try {
    // Launching and page setup live inside the try so that a failure in any
    // of these steps still reaches the cleanup in `finally`.
    browser = await puppeteer.launch({
      args: [
        '--allow-running-insecure-content',
        '--autoplay-policy=user-gesture-required',
        '--disable-component-update',
        '--disable-domain-reliability',
        '--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process',
        '--disable-print-preview',
        '--disable-setuid-sandbox',
        '--disable-site-isolation-trials',
        '--disable-speech-api',
        '--disable-web-security',
        '--disk-cache-size=33554432',
        '--enable-features=SharedArrayBuffer',
        '--hide-scrollbars',
        '--ignore-gpu-blocklist',
        '--in-process-gpu',
        '--mute-audio',
        '--no-default-browser-check',
        '--no-pings',
        '--no-sandbox',
        '--no-zygote',
        '--use-gl=swiftshader',
        '--window-size=1920,1080',
      ],
      defaultViewport: {
        deviceScaleFactor: 1,
        hasTouch: false,
        height: 1080,
        isLandscape: true,
        isMobile: false,
        width: 1920
      },
      headless: true,
      executablePath: process.env.CHROMIUM_PATH || '/opt/homebrew/bin/chromium',
    });
    page = await browser.newPage();
    if (!enableJavascriptForUrl(url)) {
      await page.setJavaScriptEnabled(false);
    }
    await page.setUserAgent(userAgentForUrl(url));

    /*
     * Disallow MathJax from running in Puppeteer and modifying the document,
     * we shall instead run it in our frontend application to transform any
     * mathjax content when present.
     */
    await page.setRequestInterception(true);
    page.on('request', request => {
      if (
        request.resourceType() === 'script' &&
        request.url().toLowerCase().indexOf('mathjax') > -1
      ) {
        request.abort();
      } else {
        request.continue();
      }
    });
    await page.goto(url, { waitUntil: ['networkidle2'] });

    // Scroll to the bottom of the page. Scrolling is best effort, so a
    // failure is logged and fetching continues.
    const scrollToBottom = async () => {
      try {
        await page.evaluate(`(async () => {
/* credit: https://github.com/puppeteer/puppeteer/issues/305 */
return new Promise((resolve, reject) => {
let scrollHeight = document.body.scrollHeight;
let totalHeight = 0;
let distance = 500;
let timer = setInterval(() => {
window.scrollBy(0, distance);
totalHeight += distance;
if(totalHeight >= scrollHeight){
clearInterval(timer);
resolve(true);
}
}, 10);
});
})()`);
      } catch (e) {
        console.error('error in scrolling url', { e, url });
      }
    };

    /* scroll with a 5 second timeout */
    // A plain timer is used instead of page.waitForTimeout, which is
    // deprecated and no longer available in newer Puppeteer releases.
    let scrollTimeout;
    await Promise.race([
      scrollToBottom(),
      new Promise(resolve => {
        scrollTimeout = setTimeout(resolve, 5000); //5 second timeout
      }),
    ]);
    clearTimeout(scrollTimeout);

    // get document body with all hidden elements removed
    const domContent = await page.evaluate(() => {
      const BI_SRC_REGEXP = /url\("(.+?)"\)/gi;
      Array.from(document.body.getElementsByTagName('*')).forEach(el => {
        const style = window.getComputedStyle(el);
        try {
          // Removing blurred images since they are mostly the copies of lazy loaded ones
          if (el.tagName && ['img', 'image'].includes(el.tagName.toLowerCase())) {
            const filter = style.getPropertyValue('filter');
            if (filter && filter.startsWith('blur')) {
              el.parentNode && el.parentNode.removeChild(el);
            }
          }
        } catch (err) {
          // Best effort: an element that cannot be inspected is left as is.
        }
        // convert all nodes with background image to img nodes
        if (!['', 'none'].includes(style.getPropertyValue('background-image'))) {
          const filter = style.getPropertyValue('filter');
          // avoiding image nodes with a blur effect creation
          if (filter && filter.startsWith('blur')) {
            el && el.parentNode && el.parentNode.removeChild(el);
          } else {
            const matchedSRC = BI_SRC_REGEXP.exec(style.getPropertyValue('background-image'));
            // Using "g" flag with a regex we have to manually break down lastIndex to zero after every usage
            // More details here: https://stackoverflow.com/questions/1520800/why-does-a-regexp-with-global-flag-give-wrong-results
            BI_SRC_REGEXP.lastIndex = 0;
            if (matchedSRC && matchedSRC[1] && !el.src) {
              // Replacing element only if there is no content inside, b/c might remove important div with content.
              // Article example: http://www.josiahzayner.com/2017/01/genetic-designer-part-i.html
              // DIV with class "content-inner" has `url("https://resources.blogblog.com/blogblog/data/1kt/travel/bg_container.png")` background image.
              if (!el.textContent) {
                const img = document.createElement('img');
                img.src = matchedSRC[1];
                el && el.parentNode && el.parentNode.replaceChild(img, el);
              }
            }
          }
        }
      });
      return document.documentElement.innerHTML;
    });
    sanitizeSource(domContent, done);
  } catch (error) {
    console.error('Error occured while fetching content')
    console.error(error)
    // Report the failure to the caller instead of leaving it waiting forever.
    done(error);
  } finally {
    // Close the browser even when closing the page itself fails.
    try {
      if (page) {
        await page.close();
      }
    } finally {
      if (browser) {
        await browser.close();
      }
    }
  }
}
/**
 * Runs `html` through htmltidy and forwards the outcome to `callbackFn`.
 *
 * @param {string} html - Raw markup to tidy.
 * @param {function} callbackFn - Passed straight to htmltidy as its
 *   completion callback.
 */
function sanitizeSource(html, callbackFn) {
  // Tidy settings: indented (4 spaces), unwrapped XHTML with numeric entities.
  const tidyOptions = {
    "indent": true,
    "indent-spaces": 4,
    "numeric-entities": true,
    "output-xhtml": true,
    "wrap": 0,
  };
  htmltidy(html, tidyOptions, callbackFn);
}
/**
 * Persists the page source for a testcase and then generates its expected
 * output files.
 *
 * @param {Error|null} error - Failure reported by the previous step; when
 *   set, it is logged and nothing is written.
 * @param {string} source - Markup to store as `source.html`.
 * @param {string} destRoot - Testcase directory that receives the files.
 * @param {object} options - Extra options forwarded to `runReadability`.
 */
function onResponseReceived(error, source, destRoot, options) {
  // Progress messages are only printed when the file-level debug flag is on.
  const trace = (message) => {
    if (debug) {
      console.log(message);
    }
  };

  if (error) {
    console.error("Couldn't tidy source html!");
    console.error(error);
    return;
  }

  trace("writing");

  // Invoked once source.html has been written (or has failed to be).
  const afterSourceWritten = async (err) => {
    if (err) {
      console.error("Couldn't write data to source.html!");
      console.error(err);
      return;
    }
    trace("Running readability stuff");
    const expectedPath = path.join(destRoot, "expected.html");
    const metadataPath = path.join(destRoot, "expected-metadata.json");
    await runReadability(source, expectedPath, metadataPath, options);
  };

  fs.writeFile(path.join(destRoot, "source.html"), source, afterSourceWritten);
}
/**
 * Runs Readability over `source` and writes the testcase expectations.
 *
 * The extracted article markup goes (pretty-printed) to `destPath`; the
 * remaining result fields plus the `isProbablyReaderable` verdict go to
 * `metadataDestPath` as JSON. When parsing fails or yields nothing, the
 * problem is logged and no file is written.
 *
 * @param {string} source - Page markup to parse.
 * @param {string} destPath - Output path for the expected article markup.
 * @param {string} metadataDestPath - Output path for the expected metadata JSON.
 * @param {object} options - Extra Readability options, merged over the defaults.
 * @returns {Promise<void>} Resolves after both writes have been attempted.
 */
async function runReadability(source, destPath, metadataDestPath, options) {
  console.log('running readability')
  const uri = "http://fakehost/test/page.html";
  let result;
  let readerable;
  try {
    // Use linkedom for isProbablyReaderable because it supports querySelectorAll
    const dom = parseHTML(source).document;
    if (debug) {
      // Log the document that was just parsed rather than parsing the source
      // a second time only to print it.
      console.log('dom', dom.toString());
    }
    readerable = isProbablyReaderable(dom);
    // We pass `caption` as a class to check that passing in extra classes works,
    // given that it appears in some of the test documents.
    const reader = new Readability(dom, { classesToPreserve: ["caption"], url: uri, ...options });
    result = await reader.parse();
  } catch (ex) {
    // console.error already prints the stack. `ex.stack` is a string, so
    // iterating it with forEach would itself throw and mask the real error.
    console.error(ex);
  }
  console.log('result', result)
  if (!result) {
    console.error("No content generated by readability, not going to write expected.html!");
    return;
  }

  const expectedHtml = prettyPrint(result.content);
  try {
    await fs.promises.writeFile(destPath, expectedHtml);
  } catch (fileWriteErr) {
    console.error("Couldn't write data to expected.html!");
    console.error(fileWriteErr);
  }

  // Delete the result data we don't care about checking.
  delete result.content;
  delete result.textContent;
  delete result.length;
  delete result.dom;
  // Add isProbablyReaderable result
  result.readerable = readerable;

  try {
    await fs.promises.writeFile(metadataDestPath, JSON.stringify(result, null, 2) + "\n");
  } catch (metadataWriteErr) {
    console.error("Couldn't write data to expected-metadata.json!");
    console.error(metadataWriteErr);
  }
}
// Command-line entry point.
//   argv[2]: testcase slug, or "all" to regenerate every entry in testcaseRoot.
//   argv[3]: URL to fetch when the testcase has no source.html yet (read at
//            load time into argURL).
if (process.argv.length < 3) {
  console.error("Need at least a destination slug and potentially a URL (if the slug doesn't have source).");
  // Missing arguments is a usage error, so report failure to the calling shell.
  process.exit(1);
}
if (process.argv[2] === "all") {
  fs.readdir(testcaseRoot, function (err, files) {
    if (err) {
      console.error("error reading testcases");
      console.error(err);
      // Let pending output flush, but still end with a failing status.
      process.exitCode = 1;
      return;
    }
    files.forEach(function (file) {
      generateTestcase(file);
    });
  });
} else {
  generateTestcase(process.argv[2]);
}