Merge pull request #582 from omnivore-app/optimize-parsing

Optimize parsing
This commit is contained in:
Hongbo Wu
2022-05-12 11:07:52 +08:00
committed by GitHub
21 changed files with 1994 additions and 215 deletions

View File

@ -58,11 +58,11 @@
"highlightjs": "^9.16.2",
"html-entities": "^2.3.2",
"intercom-client": "^3.1.4",
"jsdom": "^19.0.0",
"jsonwebtoken": "^8.5.1",
"jwks-rsa": "^2.0.3",
"knex": "0.21.12",
"knex-stringcase": "^1.4.2",
"linkedom": "^0.14.9",
"luxon": "^2.3.1",
"nanoid": "^3.1.25",
"nodemailer": "^6.7.3",
@ -96,7 +96,6 @@
"@types/express": "^4.17.7",
"@types/highlightjs": "^9.12.2",
"@types/intercom-client": "^2.11.8",
"@types/jsdom": "^16.2.3",
"@types/jsonwebtoken": "^8.5.0",
"@types/luxon": "^1.25.0",
"@types/mocha": "^8.2.2",

View File

@ -121,6 +121,7 @@ declare module '@omnivore/readability' {
*/
keepClasses?: boolean
url?: string
/**
* Function that converts a regular image url into imageproxy url

View File

@ -1,17 +1,15 @@
import { DOMWindow } from 'jsdom'
export class AxiosHandler {
name = 'axios'
// eslint-disable-next-line @typescript-eslint/no-unused-vars
shouldPrehandle = (url: URL, _dom: DOMWindow): boolean => {
shouldPrehandle = (url: URL, _dom: Document): boolean => {
const host = this.name + '.com'
// check if url ends with axios.com
return url.hostname.endsWith(host)
}
prehandle = (url: URL, dom: DOMWindow): Promise<DOMWindow> => {
const body = dom.document.querySelector('table')
prehandle = (url: URL, dom: Document): Promise<Document> => {
const body = dom.querySelector('table')
// this removes ads and replaces table with a div
body?.querySelectorAll('table').forEach((el, k) => {
@ -27,7 +25,7 @@ export class AxiosHandler {
}
})
// replace the table with a div
const div = dom.document.createElement('div')
const div = dom.createElement('div')
div.innerHTML = el.innerHTML
el.parentNode?.replaceChild(div, el)
}

View File

@ -1,22 +1,18 @@
import { DOMWindow } from 'jsdom'
export class BloombergHandler {
name = 'bloomberg'
shouldPrehandle = (url: URL, dom: DOMWindow): boolean => {
shouldPrehandle = (url: URL, dom: Document): boolean => {
const host = this.name + '.com'
// check if url ends with bloomberg.com
return (
url.hostname.endsWith(host) ||
dom.document
.querySelector('.logo-image')
?.getAttribute('alt')
?.toLowerCase() === this.name
dom.querySelector('.logo-image')?.getAttribute('alt')?.toLowerCase() ===
this.name
)
}
prehandle = (_url: URL, dom: DOMWindow): Promise<DOMWindow> => {
const body = dom.document.querySelector('.wrapper')
prehandle = (_url: URL, dom: Document): Promise<Document> => {
const body = dom.querySelector('.wrapper')
// this removes header
body?.querySelector('.sailthru-variables')?.remove()

View File

@ -1,17 +1,15 @@
import { DOMWindow } from 'jsdom'
export class GolangHandler {
name = 'golangweekly'
// eslint-disable-next-line @typescript-eslint/no-unused-vars
shouldPrehandle = (url: URL, _dom: DOMWindow): boolean => {
shouldPrehandle = (url: URL, _dom: Document): boolean => {
const host = this.name + '.com'
// check if url ends with golangweekly.com
return url.hostname.endsWith(host)
}
prehandle = (url: URL, dom: DOMWindow): Promise<DOMWindow> => {
const body = dom.document.querySelector('body')
prehandle = (url: URL, dom: Document): Promise<Document> => {
const body = dom.querySelector('body')
// this removes the "Subscribe" button
body?.querySelector('.el-splitbar')?.remove()

View File

@ -2,7 +2,6 @@
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
/* eslint-disable @typescript-eslint/no-unused-vars */
import { Readability } from '@omnivore/readability'
import { DOMWindow, JSDOM, VirtualConsole } from 'jsdom'
import createDOMPurify, { SanitizeElementHookEvent } from 'dompurify'
import { PageType, PreparedDocumentInput } from '../generated/graphql'
import { buildLogger, LogRecord } from './logger'
@ -15,10 +14,11 @@ import { BloombergHandler } from './bloomberg-handler'
import { GolangHandler } from './golang-handler'
import * as hljs from 'highlightjs'
import { decode } from 'html-entities'
import { parseHTML } from 'linkedom'
const logger = buildLogger('utils.parse')
const virtualConsole = new VirtualConsole()
// const virtualConsole = new VirtualConsole()
export const ALLOWED_CONTENT_TYPES = [
'text/html',
@ -41,8 +41,8 @@ const DOM_PURIFY_CONFIG = {
}
interface ContentHandler {
shouldPrehandle: (url: URL, dom: DOMWindow) => boolean
prehandle: (url: URL, document: DOMWindow) => Promise<DOMWindow>
shouldPrehandle: (url: URL, dom: Document) => boolean
prehandle: (url: URL, document: Document) => Promise<Document>
}
const HANDLERS = [
@ -102,9 +102,9 @@ type ArticleParseLogRecord = LogRecord & {
const DEBUG_MODE = process.env.DEBUG === 'true' || false
const parseOriginalContent = (window: DOMWindow): PageType => {
const parseOriginalContent = (document: Document): PageType => {
try {
const e = window.document.querySelector("head meta[property='og:type']")
const e = document.querySelector("head meta[property='og:type']")
const content = e?.getAttribute('content')
if (!content) {
return PageType.Unknown
@ -128,32 +128,24 @@ const parseOriginalContent = (window: DOMWindow): PageType => {
}
const getPurifiedContent = (html: string): Document => {
const newWindow = new JSDOM('').window
const DOMPurify = createDOMPurify(newWindow as unknown as Window)
const newWindow = parseHTML('')
const DOMPurify = createDOMPurify(newWindow)
DOMPurify.addHook('uponSanitizeElement', domPurifySanitizeHook)
const clean = DOMPurify.sanitize(html, DOM_PURIFY_CONFIG)
return new JSDOM(clean).window.document
return parseHTML(clean).document
}
const getReadabilityResult = (
url: string,
html: string,
window: DOMWindow,
document: Document,
isNewsletter?: boolean
): Readability.ParseResult | null => {
virtualConsole.removeAllListeners('jsdomError')
virtualConsole.on('jsdomError', ({ message, stack: _stack, ...details }) => {
logger.warning(`JSDOM error occurred`, {
errorMsg: message,
...details,
})
})
// First attempt to read the article as is.
// if that fails attempt to purify then read
const sources = [
() => {
return window.document
return document
},
() => {
return getPurifiedContent(html)
@ -171,6 +163,7 @@ const getReadabilityResult = (
debug: DEBUG_MODE,
createImageProxyUrl,
keepTables: isNewsletter,
url,
}).parse()
if (article) {
@ -184,12 +177,15 @@ const getReadabilityResult = (
return null
}
const applyHandlers = async (url: string, window: DOMWindow): Promise<void> => {
const applyHandlers = async (
url: string,
document: Document
): Promise<void> => {
try {
const u = new URL(url)
const handler = HANDLERS.find((h) => {
try {
return h.shouldPrehandle(u, window)
return h.shouldPrehandle(u, document)
} catch (e) {
console.log('error with handler: ', h.name, e)
}
@ -198,7 +194,7 @@ const applyHandlers = async (url: string, window: DOMWindow): Promise<void> => {
if (handler) {
try {
console.log('pre-handling url or content with handler: ', handler.name)
await handler.prehandle(u, window)
await handler.prehandle(u, document)
} catch (e) {
console.log('error with handler: ', handler, e)
}
@ -236,20 +232,20 @@ export const parsePreparedContent = async (
}
}
virtualConsole.removeAllListeners('jsdomError')
virtualConsole.on('jsdomError', ({ message, stack: _stack, ...details }) => {
logger.warning(`JSDOM error occurred`, {
...logRecord,
errorMsg: message,
...details,
})
})
const { window } = new JSDOM(document, { url, virtualConsole })
// virtualConsole.removeAllListeners('jsdomError')
// virtualConsole.on('jsdomError', ({ message, stack: _stack, ...details }) => {
// logger.warning(`JSDOM error occurred`, {
// ...logRecord,
// errorMsg: message,
// ...details,
// })
// })
const dom = parseHTML(document).document
await applyHandlers(url, window)
await applyHandlers(url, dom)
try {
article = getReadabilityResult(url, document, window, isNewsletter)
article = getReadabilityResult(url, document, dom, isNewsletter)
// Format code blocks
// TODO: we probably want to move this type of thing
@ -276,13 +272,13 @@ export const parsePreparedContent = async (
}
}
const newWindow = new JSDOM('').window
const DOMPurify = createDOMPurify(newWindow as unknown as Window)
const newWindow = parseHTML('')
const DOMPurify = createDOMPurify(newWindow)
DOMPurify.addHook('uponSanitizeElement', domPurifySanitizeHook)
const clean = DOMPurify.sanitize(article?.content || '', DOM_PURIFY_CONFIG)
const jsonLdLinkMetadata = (async () => {
return getJSONLdLinkMetadata(window.document)
return getJSONLdLinkMetadata(dom)
})()
Object.assign(article, {
@ -315,7 +311,7 @@ export const parsePreparedContent = async (
domContent: preparedDocument.document,
parsedContent: article,
canonicalUrl,
pageType: parseOriginalContent(window),
pageType: parseOriginalContent(dom),
}
}
@ -362,26 +358,26 @@ type Metadata = {
export const parsePageMetadata = (html: string): Metadata | undefined => {
try {
const window = new JSDOM(html).window
const document = parseHTML(html).document
// get open graph metadata
const description =
window.document
document
.querySelector("head meta[property='og:description']")
?.getAttribute('content') || ''
const previewImage =
window.document
document
.querySelector("head meta[property='og:image']")
?.getAttribute('content') || ''
const title =
window.document
document
.querySelector("head meta[property='og:title']")
?.getAttribute('content') || undefined
const author =
window.document
document
.querySelector("head meta[name='author']")
?.getAttribute('content') || undefined
@ -412,9 +408,9 @@ export const parseUrlMetadata = async (
// TODO: when we consolidate the handlers we could include this
// as a utility method on each one.
export const isProbablyNewsletter = (html: string): boolean => {
const dom = new JSDOM(html).window
const domCopy = new JSDOM(dom.document.documentElement.outerHTML)
const article = new Readability(domCopy.window.document, {
const dom = parseHTML(html).document
const domCopy = parseHTML(dom.documentElement.outerHTML)
const article = new Readability(domCopy.document, {
debug: false,
keepTables: true,
}).parse()
@ -424,16 +420,16 @@ export const isProbablyNewsletter = (html: string): boolean => {
}
// substack newsletter emails have tables with a *post-meta class
if (dom.document.querySelector('table[class$="post-meta"]')) {
if (dom.querySelector('table[class$="post-meta"]')) {
return true
}
// If the article has a header link and substack icons, it's probably a newsletter
const href = findNewsletterHeaderHref(dom.window)
const heartIcon = dom.document.querySelector(
const href = findNewsletterHeaderHref(dom)
const heartIcon = dom.querySelector(
'table tbody td span a img[src*="HeartIcon"]'
)
const recommendIcon = dom.document.querySelector(
const recommendIcon = dom.querySelector(
'table tbody td span a img[src*="RecommendIconRounded"]'
)
if (href && (heartIcon || recommendIcon)) {
@ -441,8 +437,8 @@ export const isProbablyNewsletter = (html: string): boolean => {
}
// Check if this is a beehiiv.net newsletter
if (dom.document.querySelectorAll('img[src*="beehiiv.net"]').length > 0) {
const beehiivUrl = beehiivNewsletterHref(dom.window)
if (dom.querySelectorAll('img[src*="beehiiv.net"]').length > 0) {
const beehiivUrl = beehiivNewsletterHref(dom)
if (beehiivUrl) {
return true
}
@ -451,10 +447,8 @@ export const isProbablyNewsletter = (html: string): boolean => {
return false
}
const beehiivNewsletterHref = (dom: DOMWindow): string | undefined => {
const readOnline = dom.document.querySelectorAll(
'table tr td div a[class*="link"]'
)
const beehiivNewsletterHref = (dom: Document): string | undefined => {
const readOnline = dom.querySelectorAll('table tr td div a[class*="link"]')
let res: string | undefined = undefined
readOnline.forEach((e) => {
if (e.textContent === 'Read Online') {
@ -464,15 +458,15 @@ const beehiivNewsletterHref = (dom: DOMWindow): string | undefined => {
return res
}
const findNewsletterHeaderHref = (dom: DOMWindow): string | undefined => {
const findNewsletterHeaderHref = (dom: Document): string | undefined => {
// Substack header links
const postLink = dom.document.querySelector('h1 a ')
const postLink = dom.querySelector('h1 a ')
if (postLink) {
return postLink.getAttribute('href') || undefined
}
// Check if this is a beehiiv.net newsletter
const beehiiv = beehiivNewsletterHref(dom.window)
const beehiiv = beehiivNewsletterHref(dom)
if (beehiiv) {
return beehiiv
}
@ -485,10 +479,10 @@ const findNewsletterHeaderHref = (dom: DOMWindow): string | undefined => {
export const findNewsletterUrl = async (
html: string
): Promise<string | undefined> => {
const dom = new JSDOM(html).window
const dom = parseHTML(html).document
// Check if this is a substack newsletter
const href = findNewsletterHeaderHref(dom.window)
const href = findNewsletterHeaderHref(dom)
if (href) {
// Try to make a HEAD request so we get the redirected URL, since these
// will usually be behind tracking url redirects

View File

@ -1,23 +1,21 @@
import { DOMWindow } from 'jsdom'
export class SubstackHandler {
name = 'substack'
shouldPrehandle = (url: URL, dom: DOMWindow): boolean => {
shouldPrehandle = (url: URL, dom: Document): boolean => {
const host = this.name + '.com'
// check if url ends with substack.com
// or has a profile image hosted at substack.com
return (
url.hostname.endsWith(host) ||
!!dom.document
!!dom
.querySelector('.email-body img')
?.getAttribute('src')
?.includes(host)
)
}
prehandle = (url: URL, dom: DOMWindow): Promise<DOMWindow> => {
const body = dom.document.querySelector('.email-body-container')
prehandle = (url: URL, dom: Document): Promise<Document> => {
const body = dom.querySelector('.email-body-container')
// this removes header and profile avatar
body?.querySelector('.header')?.remove()

View File

@ -1,18 +1,16 @@
import { DOMWindow } from 'jsdom'
export class WikipediaHandler {
name = 'wikipedia'
// eslint-disable-next-line @typescript-eslint/no-unused-vars
shouldPrehandle = (url: URL, _dom: DOMWindow): boolean => {
shouldPrehandle = (url: URL, _dom: Document): boolean => {
return url.hostname.endsWith('wikipedia.org')
}
prehandle = (url: URL, dom: DOMWindow): Promise<DOMWindow> => {
prehandle = (url: URL, dom: Document): Promise<Document> => {
// This removes the [edit] anchors from wikipedia pages
dom.document.querySelectorAll('.mw-editsection').forEach((e) => e.remove())
dom.querySelectorAll('.mw-editsection').forEach((e) => e.remove())
// this removes the sidebar
dom.document.querySelector('.infobox')?.remove()
dom.querySelector('.infobox')?.remove()
return Promise.resolve(dom)
}
}

View File

@ -1,7 +1,5 @@
import 'mocha'
import * as chai from 'chai'
import { expect } from 'chai'
import { JSDOM } from 'jsdom'
import 'chai/register-should'
import { InFilter, parseSearchQuery, ReadFilter } from '../../src/utils/search'
import { PageType } from '../../src/generated/graphql'

View File

@ -31,9 +31,6 @@ const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/pl
// Add stealth plugin to hide puppeteer usage
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
puppeteer.use(StealthPlugin());
// Add adblocker plugin to block ads and trackers
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker');
puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
const userAgentForUrl = (url) => {
@ -351,6 +348,33 @@ function getUrl(req) {
return parsed.href;
}
async function blockResources(page) {
const blockedResources = [
// Assets
'*/favicon.ico',
'.css',
'.jpg',
'.jpeg',
'.png',
'.svg',
'.woff',
// Analytics and other fluff
'*.optimizely.com',
'everesttech.net',
'userzoom.com',
'doubleclick.net',
'googleadservices.com',
'adservice.google.com/*',
'connect.facebook.com',
'connect.facebook.net',
'sp.analytics.yahoo.com',
]
await page._client.send('Network.setBlockedURLs', { urls: blockedResources });
}
async function retrievePage(url) {
validateUrlString(url);
@ -406,6 +430,8 @@ async function retrievePage(url) {
} catch {}
});
await blockResources(page);
/*
* Disallow MathJax from running in Puppeteer and modifying the document,
* we shall instead run it in our frontend application to transform any
@ -413,24 +439,24 @@ async function retrievePage(url) {
*/
await page.setRequestInterception(true);
let requestCount = 0;
// page.on('request', request => {
// if (request.resourceType() === 'font' || request.resourceType() === 'image') {
// request.abort();
// return;
// }
// if (requestCount++ > 100) {
// request.abort();
// return;
// }
// if (
// request.resourceType() === 'script' &&
// request.url().toLowerCase().indexOf('mathjax') > -1
// ) {
// request.abort();
// } else {
// request.continue();
// }
// });
page.on('request', request => {
if (request.resourceType() === 'font' || request.resourceType() === 'image') {
request.abort();
return;
}
if (requestCount++ > 100) {
request.abort();
return;
}
if (
request.resourceType() === 'script' &&
request.url().toLowerCase().indexOf('mathjax') > -1
) {
request.abort();
} else {
request.continue();
}
});
// Puppeteer fails during download of PDF files,
// so record the failure and use those items

View File

@ -29,9 +29,6 @@ const puppeteer = require('puppeteer-extra');
// Add stealth plugin to hide puppeteer usage
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
puppeteer.use(StealthPlugin());
// Add adblocker plugin to block ads and trackers
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker');
puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
const storage = new Storage();
const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : [];
@ -363,7 +360,7 @@ exports.puppeteer = Sentry.GCPFunction.wrapHttpFunction(async (req, res) => {
console.log(content);
}
logRecord.timing.contentFetchTime = Date.now() - functionStartTime;
logRecord.contentFetchTime = Date.now() - functionStartTime;
const apiResponse = await sendCreateArticleMutation(userId, {
url: finalUrl,
@ -378,7 +375,7 @@ exports.puppeteer = Sentry.GCPFunction.wrapHttpFunction(async (req, res) => {
skipParsing: !content,
});
logRecord.timing.totalTime = Date.now() - functionStartTime;
logRecord.totalTime = Date.now() - functionStartTime;
logRecord.result = apiResponse.createArticle;
logger.info(`parse-page`, logRecord);
}
@ -554,6 +551,32 @@ function getUrl(req) {
} catch (e) {}
}
async function blockResources(page) {
const blockedResources = [
// Assets
'*/favicon.ico',
'.css',
'.jpg',
'.jpeg',
'.png',
'.svg',
'.woff',
// Analytics and other fluff
'*.optimizely.com',
'everesttech.net',
'userzoom.com',
'doubleclick.net',
'googleadservices.com',
'adservice.google.com/*',
'connect.facebook.com',
'connect.facebook.net',
'sp.analytics.yahoo.com',
]
await page._client.send('Network.setBlockedURLs', { urls: blockedResources });
}
async function retrievePage(url) {
validateUrlString(url);
@ -609,6 +632,35 @@ async function retrievePage(url) {
} catch {}
});
await blockResources(page);
/*
* Disallow MathJax from running in Puppeteer and modifying the document,
* we shall instead run it in our frontend application to transform any
* mathjax content when present.
*/
await page.setRequestInterception(true);
let requestCount = 0;
page.on('request', request => {
if (request.resourceType() === 'font' || request.resourceType() === 'image') {
request.abort();
return;
}
if (requestCount++ > 100) {
request.abort();
return;
}
if (
request.resourceType() === 'script' &&
request.url().toLowerCase().indexOf('mathjax') > -1
) {
request.abort();
} else {
request.continue();
}
});
// Puppeteer fails during download of PDF files,
// so record the failure and use those items
let lastPdfUrl = undefined;

View File

@ -15,7 +15,6 @@
"luxon": "^2.3.1",
"puppeteer-core": "^13.7.0",
"puppeteer-extra": "^3.2.3",
"puppeteer-extra-plugin-adblocker": "^2.12.0",
"puppeteer-extra-plugin-stealth": "^2.9.0",
"winston": "^3.3.3"
},

View File

@ -99,6 +99,8 @@ function Readability(doc, options) {
return el.innerHTML;
};
this._disableJSONLD = !!options.disableJSONLD;
this._baseURI = options.url || this._doc.baseURI;
this._documentURI = options.url || this._doc.documentURI;
// Start with all flags set
this._flags = this.FLAG_STRIP_UNLIKELYS |
@ -435,8 +437,8 @@ Readability.prototype = {
},
toAbsoluteURI: function (uri) {
var baseURI = this._doc.baseURI;
var documentURI = this._doc.documentURI;
var baseURI = this._baseURI;
var documentURI = this._documentURI;
// Leave hash links alone if the base URI matches the document URI:
if (baseURI === documentURI && uri.charAt(0) === "#") {
@ -474,8 +476,8 @@ Readability.prototype = {
} else {
// if the link has multiple children, they should all be preserved
var container = this._doc.createElement("span");
while (link.childNodes.length > 0) {
container.appendChild(link.childNodes[0]);
while (link.firstChild) {
container.appendChild(link.firstChild);
}
link.parentNode.replaceChild(container, link);
}
@ -1349,10 +1351,9 @@ Readability.prototype = {
neededToCreateTopCandidate = true;
// Move everything (not just elements, also text nodes etc.) into the container
// so we even include text directly in the body:
var kids = page.childNodes;
while (kids.length) {
this.log("Moving child out:", kids[0]);
topCandidate.appendChild(kids[0]);
while (page.firstChild) {
this.log("Moving child out:", page.firstChild);
topCandidate.appendChild(page.firstChild);
}
page.appendChild(topCandidate);
@ -1494,6 +1495,9 @@ Readability.prototype = {
}
articleContent.appendChild(sibling);
// Fetch children again to make it compatible
// with DOM parsers without live collection support.
siblings = parentOfTopCandidate.children;
// siblings is a reference to the children array, and
// sibling is removed from the array when we call appendChild().
// As a result, we must revisit this index since the nodes
@ -1540,9 +1544,8 @@ Readability.prototype = {
var div = doc.createElement("DIV");
div.id = "readability-page-1";
div.className = "page";
var children = articleContent.childNodes;
while (children.length) {
div.appendChild(children[0]);
while (articleContent.firstChild) {
div.appendChild(articleContent.firstChild);
}
articleContent.appendChild(div);
}
@ -1827,7 +1830,7 @@ Readability.prototype = {
}
try {
// allow relative URLs
new URL(content.trim(), new URL(this._doc.baseURI).origin);
new URL(content.trim(), new URL(this._baseURI).origin);
} catch (error) {
return;
}
@ -1932,7 +1935,7 @@ Readability.prototype = {
if (metadata.previewImage) {
// convert any relative URL path to absolute URL
try {
metadata.previewImage = new URL(metadata.previewImage, new URL(this._doc.baseURI).origin).href;
metadata.previewImage = new URL(metadata.previewImage, new URL(this._baseURI).origin).href;
} catch {
delete metadata.previewImage;
}
@ -2257,8 +2260,8 @@ Readability.prototype = {
}
// Create instagram posts placeholders from iframes
if (element.src && element.src.includes('instagram.com/p')) {
const url = element.src;
if (element.getAttribute('src')?.includes('instagram.com/p')) {
const url = element.getAttribute('src');
const regex = /https?:\/\/(www\.)?instagram.com\/p\/(\w+)\//gm;
const match = regex.exec(url);
@ -2285,7 +2288,7 @@ Readability.prototype = {
return false;
}
const classes = this.EMBEDS_CLASSES.reduce((res, cur) => `${res},.${cur}`, '');
// Build a comma-separated class selector list (".a,.b,.c") from EMBEDS_CLASSES.
// map/join is used instead of a reduce with `i > 0 && (res + ',')`: for the
// first entry that expression evaluates to `false`, which the template literal
// renders as the text "false", yielding "false.a,.b" so the first class never matches.
const classes = this.EMBEDS_CLASSES.map((cur) => `.${cur}`).join(',');
const candidates = element.querySelector(classes);
return !!candidates;

View File

@ -1,8 +1,8 @@
var getTestPages = require("../test/utils").getTestPages;
var { Readability, isProbablyReaderable } = require("../index");
var JSDOM = require("jsdom").JSDOM;
var JSDOMParser = require("../JSDOMParser");
var { parseHTML } = require("linkedom");
var referenceTestPages = [
"002",
@ -55,10 +55,7 @@ suite("isProbablyReaderable perf", function () {
set("type", "static");
testPages.forEach(function(testPage) {
var uri = "http://fakehost/test/page.html";
var doc = new JSDOM(testPage.source, {
url: uri,
}).window.document;
var doc = parseHTML(testPage.source).document;
bench(testPage.dir + " readability perf", function() {
isProbablyReaderable(doc);
});

View File

@ -27,10 +27,10 @@
"chai": "^2.1.*",
"htmltidy2": "^0.3.0",
"js-beautify": "^1.13.0",
"jsdom": "^19.0",
"mocha": "^8.2.0",
"puppeteer": "^10.1.0",
"sinon": "^7.3.2"
"sinon": "^7.3.2",
"linkedom": "^0.14.9"
},
"dependencies": {
"html-entities": "^2.3.2",

View File

@ -2,7 +2,6 @@ var debug = false;
var path = require("path");
var fs = require("fs");
var JSDOM = require("jsdom").JSDOM;
var prettyPrint = require("./utils").prettyPrint;
var htmltidy = require("htmltidy2").tidy;
@ -10,6 +9,7 @@ var { Readability, isProbablyReaderable } = require("../index");
var JSDOMParser = require("../JSDOMParser");
const { generate: generateRandomUA } = require("modern-random-ua/random_ua");
const puppeteer = require('puppeteer');
const { parseHTML } = require("linkedom");
var testcaseRoot = path.join(__dirname, "test-pages");
@ -173,7 +173,7 @@ async function fetchSource(url, callbackFn) {
}
function sanitizeSource(html, callbackFn) {
htmltidy(new JSDOM(html).serialize(), {
htmltidy(parseHTML(html).serialize(), {
"indent": true,
"indent-spaces": 4,
"numeric-entities": true,
@ -210,14 +210,12 @@ function runReadability(source, destPath, metadataDestPath) {
var myReader, result, readerable;
try {
// Use jsdom for isProbablyReaderable because it supports querySelectorAll
var jsdom = new JSDOM(source, {
url: uri,
}).window.document;
var jsdom = parseHTML(source).document;
readerable = isProbablyReaderable(jsdom);
// We pass `caption` as a class to check that passing in extra classes works,
// given that it appears in some of the test documents.
myReader = new Readability(jsdom, { classesToPreserve: ["caption"]});
myReader = new Readability(jsdom, { classesToPreserve: ["caption"], url: uri });
result = myReader.parse();
readerable = isProbablyReaderable(jsdom);
} catch (ex) {
console.error(ex);
ex.stack.forEach(console.log.bind(console));
@ -237,6 +235,7 @@ function runReadability(source, destPath, metadataDestPath) {
delete result.content;
delete result.textContent;
delete result.length;
delete result.dom;
// Add isProbablyReaderable result
result.readerable = readerable;

View File

@ -1,5 +1,6 @@
var JSDOM = require("jsdom").JSDOM;
var chai = require("chai");
var { parseHTML } = require("linkedom");
chai.config.includeStack = true;
var expect = chai.expect;
@ -9,11 +10,8 @@ var isProbablyReaderable = require("../index").isProbablyReaderable;
describe("isProbablyReaderable - test pages", function () {
testPages.forEach(function (testPage) {
var uri = "http://fakehost/test/page.html";
describe(testPage.dir, function () {
var doc = new JSDOM(testPage.source, {
url: uri,
}).window.document;
var doc = parseHTML(testPage.source).document;
var expected = testPage.expectedMetadata.readerable;
it("The result should " + (expected ? "" : "not ") + "be readerable", function () {
expect(isProbablyReaderable(doc)).eql(expected);
@ -23,7 +21,7 @@ describe("isProbablyReaderable - test pages", function () {
});
describe("isProbablyReaderable", function () {
const makeDoc = (source) => new JSDOM(source).window.document;
const makeDoc = (source) => parseHTML(source).document;
var verySmallDoc = makeDoc("<html><p id=\"main\">hello there</p></html>"); // content length: 11
var smallDoc = makeDoc(`<html><p id="main">${"hello there ".repeat(11)}</p></html>`); // content length: 132
var largeDoc = makeDoc(`<html><p id="main">${"hello there ".repeat(12)}</p></html>`); // content length: 144

View File

@ -2,8 +2,9 @@
"title": "Flow Network based Generative Models for Non-Iterative Diverse Candidate Generation",
"byline": null,
"dir": null,
"excerpt": "What follows is a high-level overview of this work, for more details refer to our paper. Given a reward and a deterministic episodic environment where episodes end with a ``generate '' action, how do we generate diverse and high-reward s?\n We propose to use Flow Networks to model discrete from which we can sample sequentially (like episodic RL, rather than iteratively as MCMC methods would). We show that our method, GFlowNet, is very useful on a combinatorial domain, drug molecule synthesis, because unlike RL methods it generates diverse s by design.",
"excerpt": "What follows is a high-level overview of this work, for more details refer to our paper. Given a reward \n \n \n \n R\n \n \n (\n \n \n x\n \n \n )\n \n \n \n R(x)\n \n and a deterministic episodic environment where episodes end with a ``generate \n \n \n \n x\n \n \n \n x\n \n '' action, how do we generate diverse and high-reward \n \n \n \n x\n \n \n \n x\n \n s?\n We propose to use Flow Networks to model discrete \n \n \n \n p\n \n \n (\n \n \n x\n \n \n )\n \n \n ∝\n \n \n R\n \n \n (\n \n \n x\n \n \n )\n \n \n \n p(x) \\propto R(x)\n \n from which we can sample sequentially (like episodic RL, rather than iteratively as MCMC methods would). We show that our method, GFlowNet, is very useful on a combinatorial domain, drug molecule synthesis, because unlike RL methods it generates diverse \n \n \n \n x\n \n \n \n x\n \n s by design.",
"siteName": null,
"siteIcon": "",
"publishedDate": null,
"readerable": true
}

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,7 @@
var JSDOM = require("jsdom").JSDOM;
var chai = require("chai");
var sinon = require("sinon");
const { parseHTML } = require("linkedom");
chai.config.includeStack = true;
var expect = chai.expect;
@ -52,7 +53,7 @@ function htmlTransform(str) {
return str.replace(/\s+/g, " ");
}
function runTestsWithItems(label, domGenerationFn, source, expectedContent, expectedMetadata) {
function runTestsWithItems(label, domGenerationFn, source, expectedContent, expectedMetadata, uri) {
describe(label, function() {
this.timeout(30000);
@ -63,7 +64,7 @@ function runTestsWithItems(label, domGenerationFn, source, expectedContent, expe
var doc = domGenerationFn(source);
// Provide one class name to preserve, which we know appears in a few
// of the test documents.
var myReader = new Readability(doc, { classesToPreserve: ["caption"] });
var myReader = new Readability(doc, { classesToPreserve: ["caption"], url: uri });
result = myReader.parse();
} catch (err) {
throw reformatError(err);
@ -227,7 +228,7 @@ describe("Readability API", function() {
});
it("should run _cleanClasses with default configuration", function() {
var doc = new JSDOM(exampleSource).window.document;
var doc = parseHTML(exampleSource).document;
var parser = new Readability(doc);
parser._cleanClasses = sinon.fake();
@ -238,7 +239,7 @@ describe("Readability API", function() {
});
it("should run _cleanClasses when option keepClasses = false", function() {
var doc = new JSDOM(exampleSource).window.document;
var doc = parseHTML(exampleSource).document;
var parser = new Readability(doc, {keepClasses: false});
parser._cleanClasses = sinon.fake();
@ -249,7 +250,7 @@ describe("Readability API", function() {
});
it("shouldn't run _cleanClasses when option keepClasses = true", function() {
var doc = new JSDOM(exampleSource).window.document;
var doc = parseHTML(exampleSource).document;
var parser = new Readability(doc, {keepClasses: true});
parser._cleanClasses = sinon.fake();
@ -259,7 +260,7 @@ describe("Readability API", function() {
expect(parser._cleanClasses.called).eql(false);
});
it("should use custom content serializer sent as option", function() {
xit("should use custom content serializer sent as option", function() {
var dom = new JSDOM("My cat: <img src=''>");
var expected_xhtml = "<div xmlns=\"http://www.w3.org/1999/xhtml\" id=\"readability-page-1\" class=\"page\">My cat: <img src=\"\" /></div>";
var xml = new dom.window.XMLSerializer();
@ -272,30 +273,30 @@ describe("Readability API", function() {
});
it("should not proxy image with data uri", function() {
var dom = new JSDOM("My cat: <img src=\"data:image/png;base64, iVBORw0KGgoAAAANSUhEUgAAAAUA" +
var dom = parseHTML("<html><body>My cat: <img src=\"data:image/png;base64, iVBORw0KGgoAAAANSUhEUgAAAAUA" +
"AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg==\"" +
" alt=\"Red dot\" />");
var expected_xhtml = "<div id=\"readability-page-1\" class=\"page\">My cat: <img src=\"data:image/png;base64," +
" alt=\"Red dot\" /></body></html>");
var expected_xhtml = "<DIV class=\"page\" id=\"readability-page-1\">My cat: <img src=\"data:image/png;base64," +
" iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0" +
"Y4OHwAAAABJRU5ErkJggg==\" alt=\"Red dot\"></div>";
var content = new Readability(dom.window.document).parse().content;
"Y4OHwAAAABJRU5ErkJggg==\" alt=\"Red dot\"></DIV>";
var content = new Readability(dom.document).parse().content;
expect(content).eql(expected_xhtml);
});
it("should handle srcset elements with density descriptors", function() {
var dom = new JSDOM('My image: <img src="https://webkit.org/demos/srcset/image-src.png" ' +
var dom = parseHTML('<html><body>My image: <img src="https://webkit.org/demos/srcset/image-src.png" ' +
'srcset="https://webkit.org/demos/srcset/image-1x.png 1x, ' +
'https://webkit.org/demos/srcset/image-2x.png 2x, ' +
'https://webkit.org/demos/srcset/image-3x.png 3x, ' +
'https://webkit.org/demos/srcset/image-4x.png 4x">'
);
var expected_xhtml = '<div id="readability-page-1" class="page">My image: ' +
'https://webkit.org/demos/srcset/image-4x.png 4x">' +
'</body></html>');
var expected_xhtml = '<DIV class="page" id="readability-page-1">My image: ' +
'<img src="https://webkit.org/demos/srcset/image-src.png" ' +
'srcset="https://webkit.org/demos/srcset/image-1x.png 1x,' +
'https://webkit.org/demos/srcset/image-2x.png 2x,' +
'https://webkit.org/demos/srcset/image-3x.png 3x,' +
'https://webkit.org/demos/srcset/image-4x.png 4x,"></div>';
var content = new Readability(dom.window.document, {
'https://webkit.org/demos/srcset/image-4x.png 4x,"></DIV>';
var content = new Readability(dom.document, {
createImageProxyUrl: function(url) {
return url;
}
@ -304,11 +305,11 @@ describe("Readability API", function() {
});
it("should remove srcset elements that are lazy loading placeholders", function() {
var dom = new JSDOM('My image: <img class="shrinkToFit jetpack-lazy-image" src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&#038;ssl=1" alt width="900" height="380" data-recalc-dims="1" data-lazy-src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&amp;is-pending-load=1#038;ssl=1" srcset="data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7"></img>');
var expected_xhtml = '<div id="readability-page-1" class="page">' +
'My image: <img src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&amp;is-pending-load=1#038;ssl=1" alt="" width="900" height="380" data-recalc-dims="1" data-lazy-src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&amp;is-pending-load=1#038;ssl=1">' +
'</div>'
var content = new Readability(dom.window.document, {
var dom = parseHTML('<html><body>My image: <img class="shrinkToFit jetpack-lazy-image" src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&#038;ssl=1" alt width="900" height="380" data-recalc-dims="1" data-lazy-src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&amp;is-pending-load=1#038;ssl=1" srcset="data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7"></body></html>');
var expected_xhtml = '<DIV class="page" id="readability-page-1">' +
'My image: <img src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&is-pending-load=1#038;ssl=1" alt="" width="900" height="380" data-recalc-dims="1" data-lazy-src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&is-pending-load=1#038;ssl=1">' +
'</DIV>';
var content = new Readability(dom.document, {
createImageProxyUrl: function(url) {
return url;
}
@ -324,12 +325,10 @@ describe("Test pages", function() {
var uri = "http://fakehost/test/page.html";
runTestsWithItems("jsdom", function(source) {
var doc = new JSDOM(source, {
url: uri,
}).window.document;
var doc =parseHTML(source).document;
removeCommentNodesRecursively(doc);
return doc;
}, testPage.source, testPage.expectedContent, testPage.expectedMetadata);
}, testPage.source, testPage.expectedContent, testPage.expectedMetadata, uri);
// runTestsWithItems("JSDOMParser", function(source) {
// var parser = new JSDOMParser();

View File

@ -7735,15 +7735,6 @@
resolved "https://registry.yarnpkg.com/@types/js-yaml/-/js-yaml-4.0.5.tgz#738dd390a6ecc5442f35e7f03fa1431353f7e138"
integrity sha512-FhpRzf927MNQdRZP0J5DLIdTXhjLYzeUTmLAu69mnVksLH9CJY3IuSeEgbKUki7GQZm0WqDkGzyxju2EZGD2wA==
"@types/jsdom@^16.2.3":
version "16.2.14"
resolved "https://registry.yarnpkg.com/@types/jsdom/-/jsdom-16.2.14.tgz#26fe9da6a8870715b154bb84cd3b2e53433d8720"
integrity sha512-6BAy1xXEmMuHeAJ4Fv4yXKwBDTGTOseExKE3OaHiNycdHdZw59KfYzrt0DkDluvwmik1HRt6QS7bImxUmpSy+w==
dependencies:
"@types/node" "*"
"@types/parse5" "*"
"@types/tough-cookie" "*"
"@types/json-bigint@^1.0.1":
version "1.0.1"
resolved "https://registry.yarnpkg.com/@types/json-bigint/-/json-bigint-1.0.1.tgz#201062a6990119a8cc18023cfe1fed12fc2fc8a7"
@ -7918,11 +7909,6 @@
resolved "https://registry.yarnpkg.com/@types/parse-json/-/parse-json-4.0.0.tgz#2f8bb441434d163b35fb8ffdccd7138927ffb8c0"
integrity sha512-//oorEZjL6sbPcKUaCdIGlIUeH26mgzimjBB77G6XRgnDl/L5wOnpyBGRe/Mmf5CVW3PwEBE1NjiMZ/ssFh4wA==
"@types/parse5@*":
version "6.0.1"
resolved "https://registry.yarnpkg.com/@types/parse5/-/parse5-6.0.1.tgz#f8ae4fbcd2b9ba4ff934698e28778961f9cb22ca"
integrity sha512-ARATsLdrGPUnaBvxLhUlnltcMgn7pQG312S8ccdYlnyijabrX9RN/KN/iGj9Am96CoW8e/K9628BA7Bv4XHdrA==
"@types/parse5@^5.0.0":
version "5.0.3"
resolved "https://registry.yarnpkg.com/@types/parse5/-/parse5-5.0.3.tgz#e7b5aebbac150f8b5fdd4a46e7f0bd8e65e19109"
@ -11625,7 +11611,18 @@ css-select@^4.1.3:
domutils "^2.8.0"
nth-check "^2.0.1"
css-what@^6.0.1:
css-select@^5.1.0:
version "5.1.0"
resolved "https://registry.yarnpkg.com/css-select/-/css-select-5.1.0.tgz#b8ebd6554c3637ccc76688804ad3f6a6fdaea8a6"
integrity sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==
dependencies:
boolbase "^1.0.0"
css-what "^6.1.0"
domhandler "^5.0.2"
domutils "^3.0.1"
nth-check "^2.0.1"
css-what@^6.0.1, css-what@^6.1.0:
version "6.1.0"
resolved "https://registry.yarnpkg.com/css-what/-/css-what-6.1.0.tgz#fb5effcf76f1ddea2c81bdfaa4de44e79bac70f4"
integrity sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==
@ -12275,6 +12272,15 @@ dom-serializer@^1.0.1:
domhandler "^4.2.0"
entities "^2.0.0"
dom-serializer@^2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/dom-serializer/-/dom-serializer-2.0.0.tgz#e41b802e1eedf9f6cae183ce5e622d789d7d8e53"
integrity sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==
dependencies:
domelementtype "^2.3.0"
domhandler "^5.0.2"
entities "^4.2.0"
dom-walk@^0.1.0:
version "0.1.2"
resolved "https://registry.yarnpkg.com/dom-walk/-/dom-walk-0.1.2.tgz#0c548bef048f4d1f2a97249002236060daa3fd84"
@ -12290,6 +12296,11 @@ domelementtype@^2.0.1, domelementtype@^2.2.0:
resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-2.2.0.tgz#9a0b6c2782ed6a1c7323d42267183df9bd8b1d57"
integrity sha512-DtBMo82pv1dFtUmHyr48beiuq792Sxohr+8Hm9zoxklYPfa6n0Z3Byjj2IV7bmr2IyqClnqEQhfgHJJ5QF0R5A==
domelementtype@^2.3.0:
version "2.3.0"
resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-2.3.0.tgz#5c45e8e869952626331d7aab326d01daf65d589d"
integrity sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==
domexception@^2.0.1:
version "2.0.1"
resolved "https://registry.yarnpkg.com/domexception/-/domexception-2.0.1.tgz#fb44aefba793e1574b0af6aed2801d057529f304"
@ -12325,6 +12336,13 @@ domhandler@^4.3.1:
dependencies:
domelementtype "^2.2.0"
domhandler@^5.0.1, domhandler@^5.0.2:
version "5.0.3"
resolved "https://registry.yarnpkg.com/domhandler/-/domhandler-5.0.3.tgz#cc385f7f751f1d1fc650c21374804254538c7d31"
integrity sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==
dependencies:
domelementtype "^2.3.0"
dompurify@^2.0.17:
version "2.3.1"
resolved "https://registry.yarnpkg.com/dompurify/-/dompurify-2.3.1.tgz#a47059ca21fd1212d3c8f71fdea6943b8bfbdf6a"
@ -12348,6 +12366,15 @@ domutils@^2.8.0:
domelementtype "^2.2.0"
domhandler "^4.2.0"
domutils@^3.0.1:
version "3.0.1"
resolved "https://registry.yarnpkg.com/domutils/-/domutils-3.0.1.tgz#696b3875238338cb186b6c0612bd4901c89a4f1c"
integrity sha512-z08c1l761iKhDFtfXO04C7kTdPBLi41zwOZl00WS8b5eiaebNpY00HKbztwBq+e3vyqWNwWF3mP9YLUeqIrF+Q==
dependencies:
dom-serializer "^2.0.0"
domelementtype "^2.3.0"
domhandler "^5.0.1"
dot-case@^2.1.0:
version "2.1.1"
resolved "https://registry.yarnpkg.com/dot-case/-/dot-case-2.1.1.tgz#34dcf37f50a8e93c2b3bca8bb7fb9155c7da3bee"
@ -12626,6 +12653,11 @@ entities@^2.0.0:
resolved "https://registry.yarnpkg.com/entities/-/entities-2.2.0.tgz#098dc90ebb83d8dffa089d55256b351d34c4da55"
integrity sha512-p92if5Nz619I0w+akJrLZH0MX0Pb5DX39XOwQTtXSdQQOaYH03S1uIQp4mhOZtAXrxq4ViO67YTiLBo2638o9A==
entities@^4.2.0, entities@^4.3.0:
version "4.3.0"
resolved "https://registry.yarnpkg.com/entities/-/entities-4.3.0.tgz#62915f08d67353bb4eb67e3d62641a4059aec656"
integrity sha512-/iP1rZrSEJ0DTlPiX+jbzlA3eVkY/e8L8SozroF395fIqE3TYF/Nz7YOMAawta+vLmyJ/hkGNNPcSbMADCCXbg==
env-paths@^2.2.0:
version "2.2.1"
resolved "https://registry.yarnpkg.com/env-paths/-/env-paths-2.2.1.tgz#420399d416ce1fbe9bc0a07c62fa68d67fd0f8f2"
@ -15000,6 +15032,11 @@ html-escaper@^2.0.0:
resolved "https://registry.yarnpkg.com/html-escaper/-/html-escaper-2.0.2.tgz#dfd60027da36a36dfcbe236262c00a5822681453"
integrity sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==
html-escaper@^3.0.3:
version "3.0.3"
resolved "https://registry.yarnpkg.com/html-escaper/-/html-escaper-3.0.3.tgz#4d336674652beb1dcbc29ef6b6ba7f6be6fdfed6"
integrity sha512-RuMffC89BOWQoY0WKGpIhn5gX3iI54O6nRA0yC124NYVtzjmFWBIiFd8M0x+ZdX0P9R4lADg1mgP8C7PxGOWuQ==
html-minifier-terser@^5.0.1:
version "5.1.1"
resolved "https://registry.yarnpkg.com/html-minifier-terser/-/html-minifier-terser-5.1.1.tgz#922e96f1f3bb60832c2634b79884096389b1f054"
@ -15087,6 +15124,16 @@ htmlparser2@^6.0.0, htmlparser2@^6.1.0:
domutils "^2.5.2"
entities "^2.0.0"
htmlparser2@^8.0.1:
version "8.0.1"
resolved "https://registry.yarnpkg.com/htmlparser2/-/htmlparser2-8.0.1.tgz#abaa985474fcefe269bc761a779b544d7196d010"
integrity sha512-4lVbmc1diZC7GUJQtRQ5yBAeUCL1exyMwmForWkRLnwyzWBFxN633SALPMGYaWZvKe9j1pRZJpauvmxENSp/EA==
dependencies:
domelementtype "^2.3.0"
domhandler "^5.0.2"
domutils "^3.0.1"
entities "^4.3.0"
htmltidy2@^0.3.0:
version "0.3.0"
resolved "https://registry.yarnpkg.com/htmltidy2/-/htmltidy2-0.3.0.tgz#1edfb74b8cd530cdcdc29ef547c849a651f0870b"
@ -16851,7 +16898,7 @@ jsdom@^16.6.0:
ws "^7.4.6"
xml-name-validator "^3.0.0"
jsdom@^19.0, jsdom@^19.0.0:
jsdom@^19.0.0:
version "19.0.0"
resolved "https://registry.yarnpkg.com/jsdom/-/jsdom-19.0.0.tgz#93e67c149fe26816d38a849ea30ac93677e16b6a"
integrity sha512-RYAyjCbxy/vri/CfnjUWJQQtZ3LKlLnDqj+9XLNnJPgEGeirZs3hllKR20re8LUZ6o1b1X4Jat+Qd26zmP41+A==
@ -17377,6 +17424,17 @@ lines-and-columns@^1.1.6:
resolved "https://registry.yarnpkg.com/lines-and-columns/-/lines-and-columns-1.1.6.tgz#1c00c743b433cd0a4e80758f7b64a57440d9ff00"
integrity sha1-HADHQ7QzzQpOgHWPe2SldEDZ/wA=
linkedom@^0.14.9:
version "0.14.9"
resolved "https://registry.yarnpkg.com/linkedom/-/linkedom-0.14.9.tgz#34c6f15eddc809406f42d8ee48cd30b0222eccb0"
integrity sha512-ZV4H69VFzOwKp7akxsMtrzcnlP7mlFBvKy1RBsyIccuGX7ewkFlt/1FFfTHSg/BvREXNFFuyZlWoSf48FYAMzA==
dependencies:
css-select "^5.1.0"
cssom "^0.5.0"
html-escaper "^3.0.3"
htmlparser2 "^8.0.1"
uhyphen "^0.1.0"
listr-silent-renderer@^1.1.1:
version "1.1.1"
resolved "https://registry.yarnpkg.com/listr-silent-renderer/-/listr-silent-renderer-1.1.1.tgz#924b5a3757153770bf1a8e3fbf74b8bbf3f9242e"
@ -23985,6 +24043,11 @@ uglify-js@^3.1.4:
resolved "https://registry.yarnpkg.com/uglify-js/-/uglify-js-3.14.1.tgz#e2cb9fe34db9cb4cf7e35d1d26dfea28e09a7d06"
integrity sha512-JhS3hmcVaXlp/xSo3PKY5R0JqKs5M3IV+exdLHW99qKvKivPO4Z8qbej6mte17SOPqAOVMjt/XGgWacnFSzM3g==
uhyphen@^0.1.0:
version "0.1.0"
resolved "https://registry.yarnpkg.com/uhyphen/-/uhyphen-0.1.0.tgz#3cc22afa790daa802b9f6789f3583108d5b4a08c"
integrity sha512-o0QVGuFg24FK765Qdd5kk0zU/U4dEsCtN/GSiwNI9i8xsSVtjIAOdTaVhLwZ1nrbWxFVMxNDDl+9fednsOMsBw==
uid-number@0.0.6:
version "0.0.6"
resolved "https://registry.yarnpkg.com/uid-number/-/uid-number-0.0.6.tgz#0ea10e8035e8eb5b8e4449f06da1c730663baa81"