omnivore/packages/readabilityjs/test/generate-testcase.js

var debug = false;

var path = require("path");
var fs = require("fs");
var prettyPrint = require("./utils").prettyPrint;
var htmltidy = require("htmltidy2").tidy;

var { Readability, isProbablyReaderable } = require("../index");
const puppeteer = require('puppeteer');
const { parseHTML } = require("linkedom");

var testcaseRoot = path.join(__dirname, "test-pages");

var argURL = process.argv[3]; // Could be undefined, we'll warn if it is if that is an issue.

// User agent sent by default when fetching a page (see userAgentForUrl).
const DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
// Alternative user agent sent for the hosts listed in NON_BOT_HOSTS.
// NOTE(review): presumably these sites reject the default agent as a bot — confirm.
const NON_BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
// Hosts (matched against the URL's hostname) that get NON_BOT_DESKTOP_USER_AGENT.
const NON_BOT_HOSTS = ['bloomberg.com', 'forbes.com']
// Hosts (matched against the URL's hostname) that are loaded with page JavaScript disabled.
const NON_SCRIPT_HOSTS= ['medium.com', 'fastcompany.com'];

/**
 * Picks the user agent string to send when fetching `url`.
 *
 * Returns NON_BOT_DESKTOP_USER_AGENT when the URL's hostname is one of
 * NON_BOT_HOSTS or a subdomain of one of them, and DESKTOP_USER_AGENT
 * otherwise (including when `url` cannot be parsed, which is logged).
 *
 * @param {string} url - Absolute URL of the page about to be fetched.
 * @returns {string} The user agent string to use.
 */
const userAgentForUrl = (url) => {
  try {
    const { hostname } = new URL(url);
    // Match the host itself or a real subdomain; a bare suffix test would
    // also match unrelated domains such as "notbloomberg.com".
    const isNonBotHost = NON_BOT_HOSTS.some(
      (host) => hostname === host || hostname.endsWith(`.${host}`)
    );
    if (isNonBotHost) {
      return NON_BOT_DESKTOP_USER_AGENT;
    }
  } catch (e) {
    console.log('error getting user agent for url', url, e)
  }
  return DESKTOP_USER_AGENT
};

/**
 * Decides whether page JavaScript should stay enabled when fetching `url`.
 *
 * Returns false when the URL's hostname is one of NON_SCRIPT_HOSTS or a
 * subdomain of one of them, and true otherwise (including when `url` cannot
 * be parsed, which is logged).
 *
 * @param {string} url - Absolute URL of the page about to be fetched.
 * @returns {boolean} True if JavaScript should be enabled for the page.
 */
const enableJavascriptForUrl = (url) => {
  try {
    const { hostname } = new URL(url);
    // Match the host itself or a real subdomain; a bare suffix test would
    // also match unrelated domains such as "notmedium.com".
    const isNonScriptHost = NON_SCRIPT_HOSTS.some(
      (host) => hostname === host || hostname.endsWith(`.${host}`)
    );
    if (isNonScriptHost) {
      return false;
    }
  } catch (e) {
    console.log('error getting hostname for url', url, e)
  }
  return true
};

/**
 * Generates (or regenerates) the testcase stored under `test-pages/<slug>`.
 *
 * If the testcase directory already contains a readable source.html, that
 * file is reused; otherwise the page at the module-level `argURL` is fetched
 * and the URL is recorded in url.txt. Either way the source is then handed to
 * onResponseReceived, which writes the expected output files.
 *
 * @param {string} slug - Directory name of the testcase, relative to the
 *   testcase root. Slugs starting with "newsletters/" enable `keepTables`.
 */
function generateTestcase(slug) {
  const options = {};
  if (slug.startsWith("newsletters/")) {
    // keep the newsletter content in tables
    options.keepTables = true;
  }
  const destRoot = path.join(testcaseRoot, slug);
  const sourceFile = path.join(destRoot, "source.html");

  // Records the URL next to the testcase and downloads a fresh source.
  function fetchAndProcess() {
    // Only write url.txt when a URL was given: fs.writeFile throws on
    // undefined data, which would hide fetchSource's "pass a URL" message.
    if (argURL) {
      fs.writeFile(path.join(destRoot, 'url.txt'), argURL, function (writeErr) {
        if (writeErr) {
          console.error("Couldn't write url.txt!");
          console.error(writeErr);
        }
      });
    }
    // fetchSource is async; observe its promise so a failure is reported
    // instead of surfacing as an unhandled rejection.
    Promise.resolve(fetchSource(argURL, function (fetchErr, data) {
      onResponseReceived(fetchErr, data, destRoot, options);
    })).catch(function (fetchSourceErr) {
      console.error("Couldn't fetch source!");
      console.error(fetchSourceErr);
      process.exitCode = 1;
    });
  }

  fs.mkdir(destRoot, function (mkdirErr) {
    if (!mkdirErr) {
      // Freshly created directory: there cannot be a source yet.
      fetchAndProcess();
      return;
    }
    // mkdir failed (typically because the directory already exists), so
    // check whether a previously saved source can be reused.
    fs.access(sourceFile, fs.constants.F_OK, function (accessErr) {
      if (accessErr) {
        fetchAndProcess();
        return;
      }
      fs.readFile(sourceFile, { encoding: "utf-8" }, function (readFileErr, data) {
        if (readFileErr) {
          console.error("Source existed but couldn't be read?");
          process.exit(1);
        }
        onResponseReceived(null, data, destRoot, options);
      });
    });
  });
}

/**
 * Loads `url` in headless Chromium via Puppeteer, scrolls the page, cleans up
 * the DOM and passes the resulting HTML to sanitizeSource together with
 * `callbackFn`.
 *
 * Exits the process with status 1 when no URL is given. Errors raised while
 * launching the browser or processing the page are logged; in that case
 * `callbackFn` is not invoked.
 *
 * @param {string} url - Page to fetch.
 * @param {Function} callbackFn - Callback forwarded to sanitizeSource.
 * @returns {Promise<void>}
 */
async function fetchSource(url, callbackFn) {
  if (!url) {
    console.error("You should pass a URL if the source doesn't exist yet!");
    process.exit(1);
  }

  // Declared outside the try block so the finally clause can release
  // whatever was successfully created, even if a setup step throws.
  let browser;
  let page;

  try {
    browser = await puppeteer.launch({
      headless: true,
      defaultViewport: { height: 1080, width: 1920 },
      args: ['--no-sandbox'],
    });

    page = await browser.newPage();
    if (!enableJavascriptForUrl(url)) {
      await page.setJavaScriptEnabled(false);
    }
    await page.setUserAgent(userAgentForUrl(url));

    /*
    * Disallow MathJax from running in Puppeteer and modifying the document,
    * we shall instead run it in our frontend application to transform any
    * mathjax content when present.
    */
    await page.setRequestInterception(true);
    page.on('request', request => {
      if (
        request.resourceType() === 'script' &&
        request.url().toLowerCase().indexOf('mathjax') > -1
      ) {
        request.abort();
      } else {
        request.continue();
      }
    });

    await page.goto(url, { waitUntil: ['networkidle2'] });

    // Scrolls to the bottom of the page; failures are logged, never thrown,
    // because scrolling is best-effort.
    const scrollToBottom = async () => {
      try {
        await page.evaluate(`(async () => {
              /* credit: https://github.com/puppeteer/puppeteer/issues/305 */
              return new Promise((resolve, reject) => {
                let scrollHeight = document.body.scrollHeight;
                let totalHeight = 0;
                let distance = 500;
                let timer = setInterval(() => {
                  window.scrollBy(0, distance);
                  totalHeight += distance;
                  if(totalHeight >= scrollHeight){
                    clearInterval(timer);
                    resolve(true);
                  }
                }, 10);
              });
            })()`);
      } catch (e) {
        console.error('error in scrolling url', { e, url });
      }
    };

    /* scroll with a 5 second timeout */
    // A plain timer is used instead of page.waitForTimeout, which newer
    // Puppeteer releases no longer provide.
    let scrollTimeout;
    const timeLimit = new Promise(resolve => {
      scrollTimeout = setTimeout(resolve, 5000);
    });
    try {
      await Promise.race([scrollToBottom(), timeLimit]);
    } finally {
      // Do not keep the timer pending once scrolling has finished.
      clearTimeout(scrollTimeout);
    }

    // get document body with all hidden elements removed
    // NOTE: this callback runs inside the page, so it must not reference
    // anything from the surrounding Node scope.
    const domContent = await page.evaluate(() => {
      const BI_SRC_REGEXP = /url\("(.+?)"\)/gi;

      Array.from(document.body.getElementsByTagName('*')).forEach(el => {
        const style = window.getComputedStyle(el);

        try {
          // Removing blurred images since they are mostly the copies of lazy loaded ones
          if (el.tagName && ['img', 'image'].includes(el.tagName.toLowerCase())) {
            const filter = style.getPropertyValue('filter');
            if (filter && filter.startsWith('blur')) {
              el.parentNode && el.parentNode.removeChild(el);
            }
          }
        } catch (err) {
          // Best-effort cleanup: a failure on one element must not abort the whole pass.
        }

        // convert all nodes with background image to img nodes
        if (!['', 'none'].includes(style.getPropertyValue('background-image'))) {
          const filter = style.getPropertyValue('filter');
          // avoiding image nodes with a blur effect creation
          if (filter && filter.startsWith('blur')) {
            el && el.parentNode && el.parentNode.removeChild(el);
          } else {
            const matchedSRC = BI_SRC_REGEXP.exec(style.getPropertyValue('background-image'));
            // Using "g" flag with a regex we have to manually break down lastIndex to zero after every usage
            // More details here: https://stackoverflow.com/questions/1520800/why-does-a-regexp-with-global-flag-give-wrong-results
            BI_SRC_REGEXP.lastIndex = 0;

            if (matchedSRC && matchedSRC[1] && !el.src) {
              // Replacing element only if there is no content inside, b/c might remove important div with content.
              // Article example: http://www.josiahzayner.com/2017/01/genetic-designer-part-i.html
              // DIV with class "content-inner" has `url("https://resources.blogblog.com/blogblog/data/1kt/travel/bg_container.png")` background image.
              if (el.innerHTML.length < 25) {
                const img = document.createElement('img');
                img.src = matchedSRC[1];
                // Swap the node for the new <img>; merely removing the node
                // would drop the image from the document altogether.
                el && el.parentNode && el.parentNode.replaceChild(img, el);
              }
            }
          }
        }

      });
      return document.documentElement.innerHTML;
    });

    sanitizeSource(domContent, callbackFn);
  } catch (error) {
    console.error('Error occured while fetching content')
    console.error(error)
  } finally {
    // Close the browser even if closing the page fails.
    try {
      if (page) {
        await page.close();
      }
    } finally {
      if (browser) {
        await browser.close();
      }
    }
  }
}

/**
 * Runs `html` through htmltidy with this script's fixed formatting options
 * and hands htmltidy's outcome to `callbackFn`.
 *
 * @param {string} html - Raw HTML to tidy.
 * @param {Function} callbackFn - Callback passed straight through to htmltidy.
 */
function sanitizeSource(html, callbackFn) {
  const tidyOptions = {
    "indent": true,
    "indent-spaces": 4,
    "numeric-entities": true,
    "output-xhtml": true,
    "wrap": 0
  };

  htmltidy(html, tidyOptions, callbackFn);
}

/**
 * Callback run once the page source is available: saves it as source.html in
 * `destRoot` and then runs Readability to produce expected.html and
 * expected-metadata.json in the same directory.
 *
 * @param {?Error} error - Error reported by the previous step; when set it is
 *   logged and nothing is written.
 * @param {string} source - HTML source of the page.
 * @param {string} destRoot - Directory of the testcase being generated.
 * @param {Object} options - Extra options forwarded to runReadability.
 */
function onResponseReceived(error, source, destRoot, options) {
  if (error) {
    console.error("Couldn't tidy source html!");
    console.error(error);
    return;
  }
  if (debug) {
    console.log("writing");
  }
  const sourcePath = path.join(destRoot, "source.html");
  fs.writeFile(sourcePath, source, async function(err) {
    if (err) {
      console.error("Couldn't write data to source.html!");
      console.error(err);
      return;
    }
    if (debug) {
      console.log("Running readability stuff");
    }
    // fs.writeFile ignores the promise returned by this async callback, so
    // failures must be handled here or they become unhandled rejections.
    try {
      await runReadability(source, path.join(destRoot, "expected.html"), path.join(destRoot, "expected-metadata.json"), options);
    } catch (readabilityErr) {
      console.error("Couldn't run readability on source.html!");
      console.error(readabilityErr);
    }
  });
}

/**
 * Parses `source` with linkedom, runs Readability on it and writes the
 * extracted article to `destPath` and its metadata to `metadataDestPath`.
 *
 * Parsing failures are logged; when no result is produced nothing is written.
 *
 * @param {string} source - HTML source of the page.
 * @param {string} destPath - Where the pretty-printed article HTML is written.
 * @param {string} metadataDestPath - Where the metadata JSON is written.
 * @param {Object} options - Extra Readability options, merged over the defaults.
 * @returns {Promise<void>}
 */
async function runReadability(source, destPath, metadataDestPath, options) {
  console.log('running readability')

  const uri = "http://fakehost/test/page.html";
  let result;
  let readerable;
  try {
    // Use linkedom for isProbablyReaderable because it supports querySelectorAll
    const dom = parseHTML(source).document;
    if (debug) {
      console.log('dom', dom.toString());
    }
    readerable = isProbablyReaderable(dom);
    // We pass `caption` as a class to check that passing in extra classes works,
    // given that it appears in some of the test documents.
    const myReader = new Readability(dom, { classesToPreserve: ["caption"], url: uri, ...options });
    result = await myReader.parse();
  } catch (ex) {
    // console.error prints the stack of an Error; `ex.stack` is a string, so
    // iterating over it would itself throw and hide the real problem.
    console.error(ex);
  }
  console.log('result', result)
  if (!result) {
    console.error("No content generated by readability, not going to write expected.html!");
    return;
  }

  fs.writeFile(destPath, prettyPrint(result.content), function(fileWriteErr) {
    if (fileWriteErr) {
      console.error("Couldn't write data to expected.html!");
      console.error(fileWriteErr);
    }

    // Delete the result data we don't care about checking.
    delete result.content;
    delete result.textContent;
    delete result.length;
    delete result.dom;

    // Add isProbablyReaderable result
    result.readerable = readerable;

    fs.writeFile(metadataDestPath, JSON.stringify(result, null, 2) + "\n", function(metadataWriteErr) {
      if (metadataWriteErr) {
        console.error("Couldn't write data to expected-metadata.json!");
        console.error(metadataWriteErr);
      }
    });
  });
}

// CLI entry point.
//   node generate-testcase.js <slug> [url]  -> (re)generate a single testcase
//   node generate-testcase.js all           -> regenerate every testcase directory
if (process.argv.length < 3) {
  console.error("Need at least a destination slug and potentially a URL (if the slug doesn't have source).");
  // A usage error is a failure, so report it with a non-zero exit status.
  process.exit(1);
}

if (process.argv[2] === "all") {
  fs.readdir(testcaseRoot, function (err, files) {
    if (err) {
      console.error("error reading testcases");
      console.error(err);
      process.exitCode = 1;
      return;
    }

    files.forEach(function (file) {
      generateTestcase(file);
    });
  });
} else {
  generateTestcase(process.argv[2]);
}