312 lines
10 KiB
JavaScript
312 lines
10 KiB
JavaScript
var debug = false;
|
|
|
|
var path = require("path");
|
|
var fs = require("fs");
|
|
var prettyPrint = require("./utils").prettyPrint;
|
|
var htmltidy = require("htmltidy2").tidy;
|
|
|
|
var { Readability, isProbablyReaderable } = require("../index");
|
|
const puppeteer = require('puppeteer');
|
|
const { parseHTML } = require("linkedom");
|
|
|
|
var testcaseRoot = path.join(__dirname, "test-pages");
|
|
|
|
var argURL = process.argv[3]; // May be undefined; fetchSource reports an error and exits if a URL turns out to be needed.
|
|
|
|
// Default user agent returned by userAgentForUrl.
const DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36';

// User agent returned by userAgentForUrl for the hosts listed in NON_BOT_HOSTS.
const NON_BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36';

// Hosts that get NON_BOT_DESKTOP_USER_AGENT instead of the default user agent.
const NON_BOT_HOSTS = ['bloomberg.com', 'forbes.com'];

// Hosts for which enableJavascriptForUrl returns false.
const NON_SCRIPT_HOSTS = ['medium.com', 'fastcompany.com'];
|
|
|
|
/**
 * Picks the user agent string to use when fetching `url`.
 *
 * @param {string} url - Absolute URL of the page about to be fetched.
 * @returns {string} NON_BOT_DESKTOP_USER_AGENT when the URL's hostname is one
 *   of NON_BOT_HOSTS or a subdomain of one; otherwise DESKTOP_USER_AGENT, which
 *   is also the fallback when `url` cannot be parsed (the error is logged).
 */
const userAgentForUrl = (url) => {
  try {
    const { hostname } = new URL(url);
    // Match the host itself or a true subdomain. A bare `endsWith(host)` would
    // also match unrelated domains such as "notbloomberg.com".
    const isNonBotHost = NON_BOT_HOSTS.some(
      (host) => hostname === host || hostname.endsWith(`.${host}`)
    );
    if (isNonBotHost) {
      return NON_BOT_DESKTOP_USER_AGENT;
    }
  } catch (e) {
    console.log('error getting user agent for url', url, e);
  }
  return DESKTOP_USER_AGENT;
};
|
|
|
|
/**
 * Decides whether JavaScript should stay enabled when fetching `url`.
 *
 * @param {string} url - Absolute URL of the page about to be fetched.
 * @returns {boolean} `false` when the URL's hostname is one of
 *   NON_SCRIPT_HOSTS or a subdomain of one; `true` otherwise, including when
 *   `url` cannot be parsed (the error is logged).
 */
const enableJavascriptForUrl = (url) => {
  try {
    const { hostname } = new URL(url);
    // Match the host itself or a true subdomain. A bare `endsWith(host)` would
    // also match unrelated domains such as "notmedium.com".
    const isNonScriptHost = NON_SCRIPT_HOSTS.some(
      (host) => hostname === host || hostname.endsWith(`.${host}`)
    );
    if (isNonScriptHost) {
      return false;
    }
  } catch (e) {
    console.log('error getting hostname for url', url, e);
  }
  return true;
};
|
|
|
|
/**
 * Generates (or regenerates) the expected output for one test case.
 *
 * The test case lives in `<testcaseRoot>/<slug>`. When that directory already
 * contains a `source.html`, it is reused; otherwise the page at `argURL` is
 * fetched, and the URL is recorded in `url.txt`. Either way the source is
 * handed to onResponseReceived.
 *
 * @param {string} slug - Directory name of the test case, relative to
 *   testcaseRoot. Slugs starting with "newsletters/" are processed with
 *   `keepTables: true`.
 */
function generateTestcase(slug) {
  const options = {};
  if (slug.startsWith("newsletters/")) {
    // keep the newsletter content in tables
    options.keepTables = true;
  }

  const destRoot = path.join(testcaseRoot, slug);
  const sourceFile = path.join(destRoot, "source.html");

  // Fetches the page at argURL and forwards the result for processing.
  const fetchAndProcess = () => {
    // Only record the URL when there is one: fs.writeFile throws on undefined
    // data, which would hide fetchSource's "pass a URL" message.
    if (argURL) {
      fs.writeFile(path.join(destRoot, 'url.txt'), argURL, (writeErr) => {
        if (writeErr) {
          console.error("Couldn't write url.txt!");
          console.error(writeErr);
        }
      });
    }

    fetchSource(argURL, (fetchErr, data) => {
      onResponseReceived(fetchErr, data, destRoot, options);
    }).catch((unexpectedErr) => {
      console.error("Unexpected error while fetching source!");
      console.error(unexpectedErr);
    });
  };

  // `recursive` creates missing parents (nested slugs) and does not report an
  // error when the directory already exists, so any error here is a real one.
  fs.mkdir(destRoot, { recursive: true }, (mkdirErr) => {
    if (mkdirErr) {
      console.error("Couldn't create testcase directory!");
      console.error(mkdirErr);
      return;
    }

    fs.readFile(sourceFile, { encoding: "utf-8" }, (readFileErr, data) => {
      if (!readFileErr) {
        onResponseReceived(null, data, destRoot, options);
        return;
      }
      if (readFileErr.code === "ENOENT") {
        // No stored source yet: download it.
        fetchAndProcess();
        return;
      }
      console.error("Source existed but couldn't be read?");
      process.exit(1);
    });
  });
}
|
|
|
|
/**
 * Loads `url` in headless Chrome via Puppeteer, cleans up the rendered DOM and
 * passes the resulting HTML through sanitizeSource.
 *
 * Steps: launch the browser, optionally disable JavaScript and pick a user
 * agent for the host, block MathJax scripts, load the page, scroll it for at
 * most 5 seconds, then strip blurred images and turn near-empty
 * background-image elements into <img> nodes.
 *
 * @param {string} url - Page to fetch. When falsy, an error is printed and the
 *   process exits with status 1.
 * @param {Function} callbackFn - Forwarded to sanitizeSource on success. On
 *   failure it is called with the error as its only argument.
 * @returns {Promise<void>}
 */
async function fetchSource(url, callbackFn) {
  if (!url) {
    console.error("You should pass a URL if the source doesn't exist yet!");
    process.exit(1);
  }

  let browser;
  let domContent;
  try {
    // Launch inside the try block so the browser is closed even when a later
    // setup step fails.
    browser = await puppeteer.launch({
      headless: true,
      defaultViewport: { height: 1080, width: 1920 },
      args: ['--no-sandbox'],
    });

    const page = await browser.newPage();
    if (!enableJavascriptForUrl(url)) {
      await page.setJavaScriptEnabled(false);
    }
    await page.setUserAgent(userAgentForUrl(url));

    /*
     * Disallow MathJax from running in Puppeteer and modifying the document,
     * we shall instead run it in our frontend application to transform any
     * mathjax content when present.
     */
    await page.setRequestInterception(true);
    page.on('request', (request) => {
      const isMathJaxScript =
        request.resourceType() === 'script' &&
        request.url().toLowerCase().includes('mathjax');
      if (isMathJaxScript) {
        request.abort();
      } else {
        request.continue();
      }
    });

    await page.goto(url, { waitUntil: ['networkidle2'] });

    /* scroll with a 5 second timeout */
    const scrolling = page
      .evaluate(`(async () => {
        /* credit: https://github.com/puppeteer/puppeteer/issues/305 */
        return new Promise((resolve, reject) => {
          let scrollHeight = document.body.scrollHeight;
          let totalHeight = 0;
          let distance = 500;
          let timer = setInterval(() => {
            window.scrollBy(0, distance);
            totalHeight += distance;
            if(totalHeight >= scrollHeight){
              clearInterval(timer);
              resolve(true);
            }
          }, 10);
        });
      })()`)
      .catch((e) => {
        // Scrolling is best-effort; a failure must not abort the fetch.
        console.error('error in scrolling url', { e, url });
      });
    // A plain timer instead of page.waitForTimeout, which newer Puppeteer
    // releases no longer provide.
    let scrollTimer;
    const scrollTimeout = new Promise((resolve) => {
      scrollTimer = setTimeout(resolve, 5000);
    });
    await Promise.race([scrolling, scrollTimeout]);
    clearTimeout(scrollTimer);

    // get document body with all hidden elements removed
    // NOTE: this function is executed inside the page, so it must not refer to
    // anything from the enclosing Node.js scope.
    domContent = await page.evaluate(() => {
      // No "g" flag: a global regex keeps state in lastIndex between calls.
      const BI_SRC_REGEXP = /url\("(.+?)"\)/i;

      const hasBlurFilter = (style) => {
        const filter = style.getPropertyValue('filter');
        return Boolean(filter) && filter.startsWith('blur');
      };

      const detach = (node) => {
        if (node.parentNode) {
          node.parentNode.removeChild(node);
        }
      };

      Array.from(document.body.getElementsByTagName('*')).forEach((el) => {
        const style = window.getComputedStyle(el);

        try {
          // Removing blurred images since they are mostly the copies of lazy loaded ones
          const tagName = el.tagName ? el.tagName.toLowerCase() : '';
          if (['img', 'image'].includes(tagName) && hasBlurFilter(style)) {
            detach(el);
            return;
          }
        } catch (err) {
          // Best-effort cleanup: a failure on one element must not stop the pass.
        }

        // convert all nodes with background image to img nodes
        const backgroundImage = style.getPropertyValue('background-image');
        if (['', 'none'].includes(backgroundImage)) {
          return;
        }

        // avoiding image nodes with a blur effect creation
        if (hasBlurFilter(style)) {
          detach(el);
          return;
        }

        const matchedSRC = BI_SRC_REGEXP.exec(backgroundImage);
        // Replacing element only if there is no content inside, because it might remove an important div with content.
        // Article example: http://www.josiahzayner.com/2017/01/genetic-designer-part-i.html
        // DIV with class "content-inner" has `url("https://resources.blogblog.com/blogblog/data/1kt/travel/bg_container.png")` background image.
        if (
          matchedSRC &&
          matchedSRC[1] &&
          !el.src &&
          el.innerHTML.length < 25 &&
          el.parentNode
        ) {
          const img = document.createElement('img');
          img.src = matchedSRC[1];
          // Swap the element for the image; merely removing it would drop the
          // image from the document altogether.
          el.parentNode.replaceChild(img, el);
        }
      });

      return document.documentElement.innerHTML;
    });
  } catch (error) {
    console.error('Error occured while fetching content');
    console.error(error);
    // Report the failure so the caller is not left waiting forever.
    callbackFn(error);
    return;
  } finally {
    if (browser) {
      try {
        // Closing the browser also closes the pages it opened.
        await browser.close();
      } catch (closeError) {
        console.error(closeError);
      }
    }
  }

  sanitizeSource(domContent, callbackFn);
}
|
|
|
|
/**
 * Runs `html` through htmltidy with this script's fixed formatting options.
 *
 * @param {string} html - Markup to tidy.
 * @param {Function} callbackFn - Passed to htmltidy unchanged as its
 *   completion callback.
 */
function sanitizeSource(html, callbackFn) {
  const tidyOptions = {
    indent: true,
    "indent-spaces": 4,
    "numeric-entities": true,
    "output-xhtml": true,
    wrap: 0,
  };

  htmltidy(html, tidyOptions, callbackFn);
}
|
|
|
|
/**
 * Stores the page source as `source.html` in the test case directory and then
 * runs readability over it to produce the expected output files.
 *
 * @param {*} error - Error reported by the previous step; when truthy it is
 *   logged and nothing is written.
 * @param {string} source - HTML source of the page.
 * @param {string} destRoot - Directory of the test case being generated.
 * @param {Object} options - Extra options forwarded to runReadability.
 */
function onResponseReceived(error, source, destRoot, options) {
  if (error) {
    console.error("Couldn't tidy source html!");
    console.error(error);
    return;
  }

  if (debug) {
    console.log("writing");
  }

  const sourcePath = path.join(destRoot, "source.html");
  fs.writeFile(sourcePath, source, function (err) {
    if (err) {
      console.error("Couldn't write data to source.html!");
      console.error(err);
      return;
    }

    if (debug) {
      console.log("Running readability stuff");
    }

    // fs.writeFile ignores the callback's return value, so a rejection from
    // runReadability has to be handled here or it goes unhandled.
    runReadability(
      source,
      path.join(destRoot, "expected.html"),
      path.join(destRoot, "expected-metadata.json"),
      options
    ).catch(function (readabilityErr) {
      console.error("Couldn't run readability!");
      console.error(readabilityErr);
    });
  });
}
|
|
|
|
/**
 * Parses `source` with Readability and writes the expected article HTML and
 * its metadata (including the isProbablyReaderable verdict) to disk.
 *
 * Nothing is written when Readability throws or produces no result. A failure
 * to write expected.html is logged and the metadata file is still attempted.
 *
 * @param {string} source - HTML source of the page.
 * @param {string} destPath - Destination of the pretty-printed article HTML.
 * @param {string} metadataDestPath - Destination of the metadata JSON.
 * @param {Object} options - Extra Readability options; spread after the
 *   defaults, so they take precedence.
 * @returns {Promise<void>} Settles once both write attempts have finished.
 */
async function runReadability(source, destPath, metadataDestPath, options) {
  console.log('running readability');

  const uri = "http://fakehost/test/page.html";
  let result;
  let readerable;
  try {
    // Use linkedom for isProbablyReaderable because it supports querySelectorAll
    // Parse once and reuse the result for both the log line and the document.
    const parsed = parseHTML(source);
    const dom = parsed.document;
    console.log('dom', parsed.innerHTML);
    readerable = isProbablyReaderable(dom);
    // We pass `caption` as a class to check that passing in extra classes works,
    // given that it appears in some of the test documents.
    const myReader = new Readability(dom, { classesToPreserve: ["caption"], url: uri, ...options });
    result = await myReader.parse();
  } catch (ex) {
    // console.error prints the stack of an Error. `ex.stack` is a string, so
    // iterating over it with forEach would itself throw.
    console.error(ex);
  }

  console.log('result', result);
  if (!result) {
    console.error("No content generated by readability, not going to write expected.html!");
    return;
  }

  // Computed outside the try block so a prettyPrint failure is not reported
  // as a file write error.
  const expectedHtml = prettyPrint(result.content);
  try {
    await fs.promises.writeFile(destPath, expectedHtml);
  } catch (fileWriteErr) {
    console.error("Couldn't write data to expected.html!");
    console.error(fileWriteErr);
  }

  // Delete the result data we don't care about checking.
  delete result.content;
  delete result.textContent;
  delete result.length;
  delete result.dom;

  // Add isProbablyReaderable result
  result.readerable = readerable;

  try {
    await fs.promises.writeFile(metadataDestPath, JSON.stringify(result, null, 2) + "\n");
  } catch (metadataWriteErr) {
    console.error("Couldn't write data to expected-metadata.json!");
    console.error(metadataWriteErr);
  }
}
|
|
|
|
// Entry point. Usage: node <this script> <slug | "all"> [url]
//   <slug>  generate the test case stored under that directory name
//   "all"   regenerate every entry found in the test case root
if (process.argv.length < 3) {
  console.error("Need at least a destination slug and potentially a URL (if the slug doesn't have source).");
  // A usage error is a failure, so exit with a non-zero status.
  process.exit(1);
}

if (process.argv[2] === "all") {
  fs.readdir(testcaseRoot, function (err, files) {
    if (err) {
      console.error("error reading testcases");
      console.error(err);
      // Let pending output flush, but make the process report failure.
      process.exitCode = 1;
      return;
    }

    files.forEach(function (file) {
      generateTestcase(file);
    });
  });
} else {
  generateTestcase(process.argv[2]);
}
|