omnivore/packages/api/src/readability.d.ts

// Type definitions for non-npm package mozilla-readability 0.2
// Project: https://github.com/mozilla/readability
// Definitions by: Charles Vandevoorde <https://github.com/charlesvdv>, Alex Wendland <https://github.com/awendland>
// Definitions: https://github.com/DefinitelyTyped/DefinitelyTyped
// TypeScript Version: 2.2

declare module '@omnivore/readability' {
  /**
   * A standalone version of the readability library used for Firefox Reader View.
   *
   * Note that isProbablyReaderable() was moved into a separate file in https://github.com/mozilla/readability/commit/2620542dd1e8380220d82afa97a2c283ae636e40
   * and therefore is no longer part of the Readability class.
   *
   * In these typings `parse()` returns a Promise, so every caller has to
   * await (or otherwise chain on) the result.
   */
  class Readability {
    /**
     * Creates a parser bound to the given DOM document.
     *
     * ## Usage on the web
     *
     * To parse a document, you must create a new Readability object from a
     * DOM document object, and then call parse(). Here's an example:
     *
     * ```js
     * var article = await new Readability(document).parse();
     * ```
     *
     * If you're using Readability on the web, you will likely be able to
     * use a document reference from elsewhere (e.g. fetched via XMLHttpRequest,
     * in a same-origin <iframe> you have access to, etc.).
     *
     * ## Usage from node.js
     *
     * In node.js, you won't generally have a DOM document object. To obtain one, you can use external
     * libraries like [jsdom](https://github.com/tmpvar/jsdom). While this repository contains a parser of
     * its own (`JSDOMParser`), that is restricted to reading XML-compatible markup and therefore we do
     * not recommend it for general use.
     *
     * If you're using `jsdom` to create a DOM object, you should ensure that the page doesn't run (page)
     * scripts (avoid fetching remote resources etc.) as well as passing it the page's URI as the `url`
     * property of the `options` object you pass the `JSDOM` constructor.
     *
     * ```js
     * var JSDOM = require('jsdom').JSDOM;
     * var doc = new JSDOM("<body>Here's a bunch of text</body>", {
     *   url: "https://www.example.com/the-page-i-got-the-source-from",
     * });
     * let reader = new Readability(doc.window.document);
     * let article = await reader.parse();
     * ```
     *
     * @param doc DOM document to extract the article from.
     * @param options Optional tuning flags; see `Readability.Options`.
     */
    constructor(doc: Document, options?: Readability.Options)

    /**
     * Runs readability.
     *
     * ## Workflow:
     *
     *  1. Prep the document by removing script tags, css, etc.
     *  2. Build readability's DOM tree.
     *  3. Grab the article content from the current dom tree.
     *  4. Replace the current DOM tree with the new one.
     *  5. Read peacefully.
     *
     * ## Additional notes:
     *
     * Readability's parse() works by modifying the DOM. This removes some
     * elements in the web page. You could avoid this by passing the clone
     * of the document object while creating a Readability object.
     *
     * ```js
     * var documentClone = document.cloneNode(true);
     * var article = await new Readability(documentClone).parse();
     * ```
     *
     * No `async` keyword appears on this signature: that modifier is not
     * permitted in ambient declarations, and the `Promise` return type already
     * expresses the asynchronous contract.
     *
     * @returns A promise for the extracted article. The resolved value will be
     * null if the processing failed (https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L2038)
     */
    parse(): Promise<Readability.ParseResult | null>
  }

  namespace Readability {
    /**
     * Optional settings accepted by the `Readability` constructor.
     * Every member may be omitted.
     */
    interface Options {
      // ---- Diagnostics and document context --------------------------------

      /**
       * When set, log messages are sent to the console.
       */
      debug?: boolean

      /**
       * NOTE(review): undocumented in the original typings. Presumably the URL
       * of the page being parsed — confirm against the implementation.
       */
      url?: string

      // ---- Extraction limits and thresholds --------------------------------

      /**
       * Upper bound on the size of documents that will be processed. The check
       * happens before any parsing operations occur; if the number of elements
       * in the document exceeds this threshold an Error will be thrown.
       *
       * See implementation details at https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L2019
       */
      maxElemsToParse?: number

      /**
       * NOTE(review): undocumented in the original typings. Presumably the
       * number of top-scoring candidate nodes considered when picking the
       * article root, as in upstream Readability — confirm.
       */
      nbTopCandidates?: number

      /**
       * Minimum number of characters the extracted textContent must contain
       * for the article to count as correctly identified. When the threshold
       * is not met, the extraction process automatically runs again with
       * different flags.
       *
       * See implementation details at https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L1208
       *
       * Changed from wordThreshold in https://github.com/mozilla/readability/commit/3ff9a166fb27928f222c4c0722e730eda412658a
       */
      charThreshold?: number

      // ---- Markup retention ------------------------------------------------

      /**
       * parse() removes the class="" attribute from every element in the given
       * subtree, except classes that match CLASSES_TO_PRESERVE or appear in
       * this array.
       */
      classesToPreserve?: string[]

      /**
       * By default Readability strips all classes from the HTML elements in
       * the processed article. Setting this to `true` retains the classes.
       *
       * This is a blanket alternative to `classesToPreserve`.
       *
       * Added in https://github.com/mozilla/readability/commit/2982216913af2c66b0690e88606b03116553ad92
       */
      keepClasses?: boolean

      /**
       * By default, Readability cleans all tables from the HTML elements in
       * the processed article. Newsletters in emails, however, use tables to
       * display their content; setting this to `true` retains those tables.
       */
      keepTables?: boolean

      /**
       * NOTE(review): undocumented in the original typings. Presumably turns
       * off link-density based filtering during extraction — confirm against
       * the implementation.
       */
      ignoreLinkDensity?: boolean

      // ---- Image handling --------------------------------------------------

      /**
       * Function that converts a regular image url into an imageproxy url.
       *
       * @param url The regular image url to convert.
       * @param width Optional; presumably the desired width of the proxied
       * image — confirm against the implementation.
       * @param height Optional; presumably the desired height of the proxied
       * image — confirm against the implementation.
       * @returns The imageproxy url.
       */
      createImageProxyUrl?: (
        url: string,
        width?: number,
        height?: number
      ) => string
    }

    /**
     * Shape of the article object produced by a successful `parse()` call.
     */
    interface ParseResult {
      // ---- Headline metadata -----------------------------------------------

      /** Title of the article. */
      title: string
      /** Author metadata. */
      byline?: string | null
      /** Description of the article, or a short excerpt taken from its content. */
      excerpt: string

      // ---- Extracted body --------------------------------------------------

      /** Processed article content as an HTML string. */
      content: string
      /** The non-HTML version of `content`. */
      textContent: string
      /** Article length, counted in characters. */
      length: number
      /** Direction of the content. */
      dir?: string | null
      /**
       * NOTE(review): undocumented in the original typings. Presumably the
       * language of the article — confirm the exact format against the
       * implementation.
       */
      language?: string | null

      // ---- Site and publication metadata -----------------------------------

      /** Name of the site the article belongs to. */
      siteName?: string | null
      /** Icon of the site the article belongs to. */
      siteIcon?: string | null
      /** Preview image for the article. */
      previewImage?: string | null
      /** Date the article was published. */
      publishedDate?: Date | null
    }
  }

  export { Readability }
}