Merge pull request #582 from omnivore-app/optimize-parsing
Optimize parsing
This commit is contained in:
@ -58,11 +58,11 @@
|
||||
"highlightjs": "^9.16.2",
|
||||
"html-entities": "^2.3.2",
|
||||
"intercom-client": "^3.1.4",
|
||||
"jsdom": "^19.0.0",
|
||||
"jsonwebtoken": "^8.5.1",
|
||||
"jwks-rsa": "^2.0.3",
|
||||
"knex": "0.21.12",
|
||||
"knex-stringcase": "^1.4.2",
|
||||
"linkedom": "^0.14.9",
|
||||
"luxon": "^2.3.1",
|
||||
"nanoid": "^3.1.25",
|
||||
"nodemailer": "^6.7.3",
|
||||
@ -96,7 +96,6 @@
|
||||
"@types/express": "^4.17.7",
|
||||
"@types/highlightjs": "^9.12.2",
|
||||
"@types/intercom-client": "^2.11.8",
|
||||
"@types/jsdom": "^16.2.3",
|
||||
"@types/jsonwebtoken": "^8.5.0",
|
||||
"@types/luxon": "^1.25.0",
|
||||
"@types/mocha": "^8.2.2",
|
||||
|
||||
1
packages/api/src/readability.d.ts
vendored
1
packages/api/src/readability.d.ts
vendored
@ -121,6 +121,7 @@ declare module '@omnivore/readability' {
|
||||
*/
|
||||
|
||||
keepClasses?: boolean
|
||||
url?: string
|
||||
|
||||
/**
|
||||
* Function that converts a regular image url into imageproxy url
|
||||
|
||||
@ -1,17 +1,15 @@
|
||||
import { DOMWindow } from 'jsdom'
|
||||
|
||||
export class AxiosHandler {
|
||||
name = 'axios'
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
||||
shouldPrehandle = (url: URL, _dom: DOMWindow): boolean => {
|
||||
shouldPrehandle = (url: URL, _dom: Document): boolean => {
|
||||
const host = this.name + '.com'
|
||||
// check if url ends with axios.com
|
||||
return url.hostname.endsWith(host)
|
||||
}
|
||||
|
||||
prehandle = (url: URL, dom: DOMWindow): Promise<DOMWindow> => {
|
||||
const body = dom.document.querySelector('table')
|
||||
prehandle = (url: URL, dom: Document): Promise<Document> => {
|
||||
const body = dom.querySelector('table')
|
||||
|
||||
// this removes ads and replaces table with a div
|
||||
body?.querySelectorAll('table').forEach((el, k) => {
|
||||
@ -27,7 +25,7 @@ export class AxiosHandler {
|
||||
}
|
||||
})
|
||||
// replace the table with a div
|
||||
const div = dom.document.createElement('div')
|
||||
const div = dom.createElement('div')
|
||||
div.innerHTML = el.innerHTML
|
||||
el.parentNode?.replaceChild(div, el)
|
||||
}
|
||||
|
||||
@ -1,22 +1,18 @@
|
||||
import { DOMWindow } from 'jsdom'
|
||||
|
||||
export class BloombergHandler {
|
||||
name = 'bloomberg'
|
||||
|
||||
shouldPrehandle = (url: URL, dom: DOMWindow): boolean => {
|
||||
shouldPrehandle = (url: URL, dom: Document): boolean => {
|
||||
const host = this.name + '.com'
|
||||
// check if url ends with bloomberg.com
|
||||
return (
|
||||
url.hostname.endsWith(host) ||
|
||||
dom.document
|
||||
.querySelector('.logo-image')
|
||||
?.getAttribute('alt')
|
||||
?.toLowerCase() === this.name
|
||||
dom.querySelector('.logo-image')?.getAttribute('alt')?.toLowerCase() ===
|
||||
this.name
|
||||
)
|
||||
}
|
||||
|
||||
prehandle = (_url: URL, dom: DOMWindow): Promise<DOMWindow> => {
|
||||
const body = dom.document.querySelector('.wrapper')
|
||||
prehandle = (_url: URL, dom: Document): Promise<Document> => {
|
||||
const body = dom.querySelector('.wrapper')
|
||||
|
||||
// this removes header
|
||||
body?.querySelector('.sailthru-variables')?.remove()
|
||||
|
||||
@ -1,17 +1,15 @@
|
||||
import { DOMWindow } from 'jsdom'
|
||||
|
||||
export class GolangHandler {
|
||||
name = 'golangweekly'
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
||||
shouldPrehandle = (url: URL, _dom: DOMWindow): boolean => {
|
||||
shouldPrehandle = (url: URL, _dom: Document): boolean => {
|
||||
const host = this.name + '.com'
|
||||
// check if url ends with golangweekly.com
|
||||
return url.hostname.endsWith(host)
|
||||
}
|
||||
|
||||
prehandle = (url: URL, dom: DOMWindow): Promise<DOMWindow> => {
|
||||
const body = dom.document.querySelector('body')
|
||||
prehandle = (url: URL, dom: Document): Promise<Document> => {
|
||||
const body = dom.querySelector('body')
|
||||
|
||||
// this removes the "Subscribe" button
|
||||
body?.querySelector('.el-splitbar')?.remove()
|
||||
|
||||
@ -2,7 +2,6 @@
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
|
||||
/* eslint-disable @typescript-eslint/no-unused-vars */
|
||||
import { Readability } from '@omnivore/readability'
|
||||
import { DOMWindow, JSDOM, VirtualConsole } from 'jsdom'
|
||||
import createDOMPurify, { SanitizeElementHookEvent } from 'dompurify'
|
||||
import { PageType, PreparedDocumentInput } from '../generated/graphql'
|
||||
import { buildLogger, LogRecord } from './logger'
|
||||
@ -15,10 +14,11 @@ import { BloombergHandler } from './bloomberg-handler'
|
||||
import { GolangHandler } from './golang-handler'
|
||||
import * as hljs from 'highlightjs'
|
||||
import { decode } from 'html-entities'
|
||||
import { parseHTML } from 'linkedom'
|
||||
|
||||
const logger = buildLogger('utils.parse')
|
||||
|
||||
const virtualConsole = new VirtualConsole()
|
||||
// const virtualConsole = new VirtualConsole()
|
||||
|
||||
export const ALLOWED_CONTENT_TYPES = [
|
||||
'text/html',
|
||||
@ -41,8 +41,8 @@ const DOM_PURIFY_CONFIG = {
|
||||
}
|
||||
|
||||
interface ContentHandler {
|
||||
shouldPrehandle: (url: URL, dom: DOMWindow) => boolean
|
||||
prehandle: (url: URL, document: DOMWindow) => Promise<DOMWindow>
|
||||
shouldPrehandle: (url: URL, dom: Document) => boolean
|
||||
prehandle: (url: URL, document: Document) => Promise<Document>
|
||||
}
|
||||
|
||||
const HANDLERS = [
|
||||
@ -102,9 +102,9 @@ type ArticleParseLogRecord = LogRecord & {
|
||||
|
||||
const DEBUG_MODE = process.env.DEBUG === 'true' || false
|
||||
|
||||
const parseOriginalContent = (window: DOMWindow): PageType => {
|
||||
const parseOriginalContent = (document: Document): PageType => {
|
||||
try {
|
||||
const e = window.document.querySelector("head meta[property='og:type']")
|
||||
const e = document.querySelector("head meta[property='og:type']")
|
||||
const content = e?.getAttribute('content')
|
||||
if (!content) {
|
||||
return PageType.Unknown
|
||||
@ -128,32 +128,24 @@ const parseOriginalContent = (window: DOMWindow): PageType => {
|
||||
}
|
||||
|
||||
const getPurifiedContent = (html: string): Document => {
|
||||
const newWindow = new JSDOM('').window
|
||||
const DOMPurify = createDOMPurify(newWindow as unknown as Window)
|
||||
const newWindow = parseHTML('')
|
||||
const DOMPurify = createDOMPurify(newWindow)
|
||||
DOMPurify.addHook('uponSanitizeElement', domPurifySanitizeHook)
|
||||
const clean = DOMPurify.sanitize(html, DOM_PURIFY_CONFIG)
|
||||
return new JSDOM(clean).window.document
|
||||
return parseHTML(clean).document
|
||||
}
|
||||
|
||||
const getReadabilityResult = (
|
||||
url: string,
|
||||
html: string,
|
||||
window: DOMWindow,
|
||||
document: Document,
|
||||
isNewsletter?: boolean
|
||||
): Readability.ParseResult | null => {
|
||||
virtualConsole.removeAllListeners('jsdomError')
|
||||
virtualConsole.on('jsdomError', ({ message, stack: _stack, ...details }) => {
|
||||
logger.warning(`JSDOM error occurred`, {
|
||||
errorMsg: message,
|
||||
...details,
|
||||
})
|
||||
})
|
||||
|
||||
// First attempt to read the article as is.
|
||||
// if that fails attempt to purify then read
|
||||
const sources = [
|
||||
() => {
|
||||
return window.document
|
||||
return document
|
||||
},
|
||||
() => {
|
||||
return getPurifiedContent(html)
|
||||
@ -171,6 +163,7 @@ const getReadabilityResult = (
|
||||
debug: DEBUG_MODE,
|
||||
createImageProxyUrl,
|
||||
keepTables: isNewsletter,
|
||||
url,
|
||||
}).parse()
|
||||
|
||||
if (article) {
|
||||
@ -184,12 +177,15 @@ const getReadabilityResult = (
|
||||
return null
|
||||
}
|
||||
|
||||
const applyHandlers = async (url: string, window: DOMWindow): Promise<void> => {
|
||||
const applyHandlers = async (
|
||||
url: string,
|
||||
document: Document
|
||||
): Promise<void> => {
|
||||
try {
|
||||
const u = new URL(url)
|
||||
const handler = HANDLERS.find((h) => {
|
||||
try {
|
||||
return h.shouldPrehandle(u, window)
|
||||
return h.shouldPrehandle(u, document)
|
||||
} catch (e) {
|
||||
console.log('error with handler: ', h.name, e)
|
||||
}
|
||||
@ -198,7 +194,7 @@ const applyHandlers = async (url: string, window: DOMWindow): Promise<void> => {
|
||||
if (handler) {
|
||||
try {
|
||||
console.log('pre-handling url or content with handler: ', handler.name)
|
||||
await handler.prehandle(u, window)
|
||||
await handler.prehandle(u, document)
|
||||
} catch (e) {
|
||||
console.log('error with handler: ', handler, e)
|
||||
}
|
||||
@ -236,20 +232,20 @@ export const parsePreparedContent = async (
|
||||
}
|
||||
}
|
||||
|
||||
virtualConsole.removeAllListeners('jsdomError')
|
||||
virtualConsole.on('jsdomError', ({ message, stack: _stack, ...details }) => {
|
||||
logger.warning(`JSDOM error occurred`, {
|
||||
...logRecord,
|
||||
errorMsg: message,
|
||||
...details,
|
||||
})
|
||||
})
|
||||
const { window } = new JSDOM(document, { url, virtualConsole })
|
||||
// virtualConsole.removeAllListeners('jsdomError')
|
||||
// virtualConsole.on('jsdomError', ({ message, stack: _stack, ...details }) => {
|
||||
// logger.warning(`JSDOM error occurred`, {
|
||||
// ...logRecord,
|
||||
// errorMsg: message,
|
||||
// ...details,
|
||||
// })
|
||||
// })
|
||||
const dom = parseHTML(document).document
|
||||
|
||||
await applyHandlers(url, window)
|
||||
await applyHandlers(url, dom)
|
||||
|
||||
try {
|
||||
article = getReadabilityResult(url, document, window, isNewsletter)
|
||||
article = getReadabilityResult(url, document, dom, isNewsletter)
|
||||
|
||||
// Format code blocks
|
||||
// TODO: we probably want to move this type of thing
|
||||
@ -276,13 +272,13 @@ export const parsePreparedContent = async (
|
||||
}
|
||||
}
|
||||
|
||||
const newWindow = new JSDOM('').window
|
||||
const DOMPurify = createDOMPurify(newWindow as unknown as Window)
|
||||
const newWindow = parseHTML('')
|
||||
const DOMPurify = createDOMPurify(newWindow)
|
||||
DOMPurify.addHook('uponSanitizeElement', domPurifySanitizeHook)
|
||||
const clean = DOMPurify.sanitize(article?.content || '', DOM_PURIFY_CONFIG)
|
||||
|
||||
const jsonLdLinkMetadata = (async () => {
|
||||
return getJSONLdLinkMetadata(window.document)
|
||||
return getJSONLdLinkMetadata(dom)
|
||||
})()
|
||||
|
||||
Object.assign(article, {
|
||||
@ -315,7 +311,7 @@ export const parsePreparedContent = async (
|
||||
domContent: preparedDocument.document,
|
||||
parsedContent: article,
|
||||
canonicalUrl,
|
||||
pageType: parseOriginalContent(window),
|
||||
pageType: parseOriginalContent(dom),
|
||||
}
|
||||
}
|
||||
|
||||
@ -362,26 +358,26 @@ type Metadata = {
|
||||
|
||||
export const parsePageMetadata = (html: string): Metadata | undefined => {
|
||||
try {
|
||||
const window = new JSDOM(html).window
|
||||
const document = parseHTML(html).document
|
||||
|
||||
// get open graph metadata
|
||||
const description =
|
||||
window.document
|
||||
document
|
||||
.querySelector("head meta[property='og:description']")
|
||||
?.getAttribute('content') || ''
|
||||
|
||||
const previewImage =
|
||||
window.document
|
||||
document
|
||||
.querySelector("head meta[property='og:image']")
|
||||
?.getAttribute('content') || ''
|
||||
|
||||
const title =
|
||||
window.document
|
||||
document
|
||||
.querySelector("head meta[property='og:title']")
|
||||
?.getAttribute('content') || undefined
|
||||
|
||||
const author =
|
||||
window.document
|
||||
document
|
||||
.querySelector("head meta[name='author']")
|
||||
?.getAttribute('content') || undefined
|
||||
|
||||
@ -412,9 +408,9 @@ export const parseUrlMetadata = async (
|
||||
// TODO: when we consolidate the handlers we could include this
|
||||
// as a utility method on each one.
|
||||
export const isProbablyNewsletter = (html: string): boolean => {
|
||||
const dom = new JSDOM(html).window
|
||||
const domCopy = new JSDOM(dom.document.documentElement.outerHTML)
|
||||
const article = new Readability(domCopy.window.document, {
|
||||
const dom = parseHTML(html).document
|
||||
const domCopy = parseHTML(dom.documentElement.outerHTML)
|
||||
const article = new Readability(domCopy.document, {
|
||||
debug: false,
|
||||
keepTables: true,
|
||||
}).parse()
|
||||
@ -424,16 +420,16 @@ export const isProbablyNewsletter = (html: string): boolean => {
|
||||
}
|
||||
|
||||
// substack newsletter emails have tables with a *post-meta class
|
||||
if (dom.document.querySelector('table[class$="post-meta"]')) {
|
||||
if (dom.querySelector('table[class$="post-meta"]')) {
|
||||
return true
|
||||
}
|
||||
|
||||
// If the article has a header link, and substack icons its probably a newsletter
|
||||
const href = findNewsletterHeaderHref(dom.window)
|
||||
const heartIcon = dom.document.querySelector(
|
||||
const href = findNewsletterHeaderHref(dom)
|
||||
const heartIcon = dom.querySelector(
|
||||
'table tbody td span a img[src*="HeartIcon"]'
|
||||
)
|
||||
const recommendIcon = dom.document.querySelector(
|
||||
const recommendIcon = dom.querySelector(
|
||||
'table tbody td span a img[src*="RecommendIconRounded"]'
|
||||
)
|
||||
if (href && (heartIcon || recommendIcon)) {
|
||||
@ -441,8 +437,8 @@ export const isProbablyNewsletter = (html: string): boolean => {
|
||||
}
|
||||
|
||||
// Check if this is a beehiiv.net newsletter
|
||||
if (dom.document.querySelectorAll('img[src*="beehiiv.net"]').length > 0) {
|
||||
const beehiivUrl = beehiivNewsletterHref(dom.window)
|
||||
if (dom.querySelectorAll('img[src*="beehiiv.net"]').length > 0) {
|
||||
const beehiivUrl = beehiivNewsletterHref(dom)
|
||||
if (beehiivUrl) {
|
||||
return true
|
||||
}
|
||||
@ -451,10 +447,8 @@ export const isProbablyNewsletter = (html: string): boolean => {
|
||||
return false
|
||||
}
|
||||
|
||||
const beehiivNewsletterHref = (dom: DOMWindow): string | undefined => {
|
||||
const readOnline = dom.document.querySelectorAll(
|
||||
'table tr td div a[class*="link"]'
|
||||
)
|
||||
const beehiivNewsletterHref = (dom: Document): string | undefined => {
|
||||
const readOnline = dom.querySelectorAll('table tr td div a[class*="link"]')
|
||||
let res: string | undefined = undefined
|
||||
readOnline.forEach((e) => {
|
||||
if (e.textContent === 'Read Online') {
|
||||
@ -464,15 +458,15 @@ const beehiivNewsletterHref = (dom: DOMWindow): string | undefined => {
|
||||
return res
|
||||
}
|
||||
|
||||
const findNewsletterHeaderHref = (dom: DOMWindow): string | undefined => {
|
||||
const findNewsletterHeaderHref = (dom: Document): string | undefined => {
|
||||
// Substack header links
|
||||
const postLink = dom.document.querySelector('h1 a ')
|
||||
const postLink = dom.querySelector('h1 a ')
|
||||
if (postLink) {
|
||||
return postLink.getAttribute('href') || undefined
|
||||
}
|
||||
|
||||
// Check if this is a beehiiv.net newsletter
|
||||
const beehiiv = beehiivNewsletterHref(dom.window)
|
||||
const beehiiv = beehiivNewsletterHref(dom)
|
||||
if (beehiiv) {
|
||||
return beehiiv
|
||||
}
|
||||
@ -485,10 +479,10 @@ const findNewsletterHeaderHref = (dom: DOMWindow): string | undefined => {
|
||||
export const findNewsletterUrl = async (
|
||||
html: string
|
||||
): Promise<string | undefined> => {
|
||||
const dom = new JSDOM(html).window
|
||||
const dom = parseHTML(html).document
|
||||
|
||||
// Check if this is a substack newsletter
|
||||
const href = findNewsletterHeaderHref(dom.window)
|
||||
const href = findNewsletterHeaderHref(dom)
|
||||
if (href) {
|
||||
// Try to make a HEAD request so we get the redirected URL, since these
|
||||
// will usually be behind tracking url redirects
|
||||
|
||||
@ -1,23 +1,21 @@
|
||||
import { DOMWindow } from 'jsdom'
|
||||
|
||||
export class SubstackHandler {
|
||||
name = 'substack'
|
||||
|
||||
shouldPrehandle = (url: URL, dom: DOMWindow): boolean => {
|
||||
shouldPrehandle = (url: URL, dom: Document): boolean => {
|
||||
const host = this.name + '.com'
|
||||
// check if url ends with substack.com
|
||||
// or has a profile image hosted at substack.com
|
||||
return (
|
||||
url.hostname.endsWith(host) ||
|
||||
!!dom.document
|
||||
!!dom
|
||||
.querySelector('.email-body img')
|
||||
?.getAttribute('src')
|
||||
?.includes(host)
|
||||
)
|
||||
}
|
||||
|
||||
prehandle = (url: URL, dom: DOMWindow): Promise<DOMWindow> => {
|
||||
const body = dom.document.querySelector('.email-body-container')
|
||||
prehandle = (url: URL, dom: Document): Promise<Document> => {
|
||||
const body = dom.querySelector('.email-body-container')
|
||||
|
||||
// this removes header and profile avatar
|
||||
body?.querySelector('.header')?.remove()
|
||||
|
||||
@ -1,18 +1,16 @@
|
||||
import { DOMWindow } from 'jsdom'
|
||||
|
||||
export class WikipediaHandler {
|
||||
name = 'wikipedia'
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
||||
shouldPrehandle = (url: URL, _dom: DOMWindow): boolean => {
|
||||
shouldPrehandle = (url: URL, _dom: Document): boolean => {
|
||||
return url.hostname.endsWith('wikipedia.org')
|
||||
}
|
||||
|
||||
prehandle = (url: URL, dom: DOMWindow): Promise<DOMWindow> => {
|
||||
prehandle = (url: URL, dom: Document): Promise<Document> => {
|
||||
// This removes the [edit] anchors from wikipedia pages
|
||||
dom.document.querySelectorAll('.mw-editsection').forEach((e) => e.remove())
|
||||
dom.querySelectorAll('.mw-editsection').forEach((e) => e.remove())
|
||||
// this removes the sidebar
|
||||
dom.document.querySelector('.infobox')?.remove()
|
||||
dom.querySelector('.infobox')?.remove()
|
||||
return Promise.resolve(dom)
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,7 +1,5 @@
|
||||
import 'mocha'
|
||||
import * as chai from 'chai'
|
||||
import { expect } from 'chai'
|
||||
import { JSDOM } from 'jsdom'
|
||||
import 'chai/register-should'
|
||||
import { InFilter, parseSearchQuery, ReadFilter } from '../../src/utils/search'
|
||||
import { PageType } from '../../src/generated/graphql'
|
||||
|
||||
@ -31,9 +31,6 @@ const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/pl
|
||||
// Add stealth plugin to hide puppeteer usage
|
||||
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
|
||||
puppeteer.use(StealthPlugin());
|
||||
// Add adblocker plugin to block ads and trackers
|
||||
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker');
|
||||
puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
|
||||
|
||||
|
||||
const userAgentForUrl = (url) => {
|
||||
@ -351,6 +348,33 @@ function getUrl(req) {
|
||||
return parsed.href;
|
||||
}
|
||||
|
||||
|
||||
async function blockResources(page) {
|
||||
const blockedResources = [
|
||||
// Assets
|
||||
'*/favicon.ico',
|
||||
'.css',
|
||||
'.jpg',
|
||||
'.jpeg',
|
||||
'.png',
|
||||
'.svg',
|
||||
'.woff',
|
||||
|
||||
// Analytics and other fluff
|
||||
'*.optimizely.com',
|
||||
'everesttech.net',
|
||||
'userzoom.com',
|
||||
'doubleclick.net',
|
||||
'googleadservices.com',
|
||||
'adservice.google.com/*',
|
||||
'connect.facebook.com',
|
||||
'connect.facebook.net',
|
||||
'sp.analytics.yahoo.com',
|
||||
]
|
||||
|
||||
await page._client.send('Network.setBlockedURLs', { urls: blockedResources });
|
||||
}
|
||||
|
||||
async function retrievePage(url) {
|
||||
validateUrlString(url);
|
||||
|
||||
@ -406,6 +430,8 @@ async function retrievePage(url) {
|
||||
} catch {}
|
||||
});
|
||||
|
||||
await blockResources(page);
|
||||
|
||||
/*
|
||||
* Disallow MathJax from running in Puppeteer and modifying the document,
|
||||
* we shall instead run it in our frontend application to transform any
|
||||
@ -413,24 +439,24 @@ async function retrievePage(url) {
|
||||
*/
|
||||
await page.setRequestInterception(true);
|
||||
let requestCount = 0;
|
||||
// page.on('request', request => {
|
||||
// if (request.resourceType() === 'font' || request.resourceType() === 'image') {
|
||||
// request.abort();
|
||||
// return;
|
||||
// }
|
||||
// if (requestCount++ > 100) {
|
||||
// request.abort();
|
||||
// return;
|
||||
// }
|
||||
// if (
|
||||
// request.resourceType() === 'script' &&
|
||||
// request.url().toLowerCase().indexOf('mathjax') > -1
|
||||
// ) {
|
||||
// request.abort();
|
||||
// } else {
|
||||
// request.continue();
|
||||
// }
|
||||
// });
|
||||
page.on('request', request => {
|
||||
if (request.resourceType() === 'font' || request.resourceType() === 'image') {
|
||||
request.abort();
|
||||
return;
|
||||
}
|
||||
if (requestCount++ > 100) {
|
||||
request.abort();
|
||||
return;
|
||||
}
|
||||
if (
|
||||
request.resourceType() === 'script' &&
|
||||
request.url().toLowerCase().indexOf('mathjax') > -1
|
||||
) {
|
||||
request.abort();
|
||||
} else {
|
||||
request.continue();
|
||||
}
|
||||
});
|
||||
|
||||
// Puppeteer fails during download of PDf files,
|
||||
// so record the failure and use those items
|
||||
|
||||
@ -29,9 +29,6 @@ const puppeteer = require('puppeteer-extra');
|
||||
// Add stealth plugin to hide puppeteer usage
|
||||
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
|
||||
puppeteer.use(StealthPlugin());
|
||||
// Add adblocker plugin to block ads and trackers
|
||||
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker');
|
||||
puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
|
||||
|
||||
const storage = new Storage();
|
||||
const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : [];
|
||||
@ -363,7 +360,7 @@ exports.puppeteer = Sentry.GCPFunction.wrapHttpFunction(async (req, res) => {
|
||||
console.log(content);
|
||||
}
|
||||
|
||||
logRecord.timing.contentFetchTime = Date.now() - functionStartTime;
|
||||
logRecord.contentFetchTime = Date.now() - functionStartTime;
|
||||
|
||||
const apiResponse = await sendCreateArticleMutation(userId, {
|
||||
url: finalUrl,
|
||||
@ -378,7 +375,7 @@ exports.puppeteer = Sentry.GCPFunction.wrapHttpFunction(async (req, res) => {
|
||||
skipParsing: !content,
|
||||
});
|
||||
|
||||
logRecord.timing.totalTime = Date.now() - functionStartTime;
|
||||
logRecord.totalTime = Date.now() - functionStartTime;
|
||||
logRecord.result = apiResponse.createArticle;
|
||||
logger.info(`parse-page`, logRecord);
|
||||
}
|
||||
@ -554,6 +551,32 @@ function getUrl(req) {
|
||||
} catch (e) {}
|
||||
}
|
||||
|
||||
async function blockResources(page) {
|
||||
const blockedResources = [
|
||||
// Assets
|
||||
'*/favicon.ico',
|
||||
'.css',
|
||||
'.jpg',
|
||||
'.jpeg',
|
||||
'.png',
|
||||
'.svg',
|
||||
'.woff',
|
||||
|
||||
// Analytics and other fluff
|
||||
'*.optimizely.com',
|
||||
'everesttech.net',
|
||||
'userzoom.com',
|
||||
'doubleclick.net',
|
||||
'googleadservices.com',
|
||||
'adservice.google.com/*',
|
||||
'connect.facebook.com',
|
||||
'connect.facebook.net',
|
||||
'sp.analytics.yahoo.com',
|
||||
]
|
||||
|
||||
await page._client.send('Network.setBlockedURLs', { urls: blockedResources });
|
||||
}
|
||||
|
||||
async function retrievePage(url) {
|
||||
validateUrlString(url);
|
||||
|
||||
@ -609,6 +632,35 @@ async function retrievePage(url) {
|
||||
} catch {}
|
||||
});
|
||||
|
||||
await blockResources(page);
|
||||
|
||||
/*
|
||||
* Disallow MathJax from running in Puppeteer and modifying the document,
|
||||
* we shall instead run it in our frontend application to transform any
|
||||
* mathjax content when present.
|
||||
*/
|
||||
await page.setRequestInterception(true);
|
||||
let requestCount = 0;
|
||||
page.on('request', request => {
|
||||
if (request.resourceType() === 'font' || request.resourceType() === 'image') {
|
||||
request.abort();
|
||||
return;
|
||||
}
|
||||
if (requestCount++ > 100) {
|
||||
request.abort();
|
||||
return;
|
||||
}
|
||||
if (
|
||||
request.resourceType() === 'script' &&
|
||||
request.url().toLowerCase().indexOf('mathjax') > -1
|
||||
) {
|
||||
request.abort();
|
||||
} else {
|
||||
request.continue();
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
// Puppeteer fails during download of PDf files,
|
||||
// so record the failure and use those items
|
||||
let lastPdfUrl = undefined;
|
||||
|
||||
@ -15,7 +15,6 @@
|
||||
"luxon": "^2.3.1",
|
||||
"puppeteer-core": "^13.7.0",
|
||||
"puppeteer-extra": "^3.2.3",
|
||||
"puppeteer-extra-plugin-adblocker": "^2.12.0",
|
||||
"puppeteer-extra-plugin-stealth": "^2.9.0",
|
||||
"winston": "^3.3.3"
|
||||
},
|
||||
|
||||
@ -99,6 +99,8 @@ function Readability(doc, options) {
|
||||
return el.innerHTML;
|
||||
};
|
||||
this._disableJSONLD = !!options.disableJSONLD;
|
||||
this._baseURI = options.url || this._doc.baseURI;
|
||||
this._documentURI = options.url || this._doc.documentURI;
|
||||
|
||||
// Start with all flags set
|
||||
this._flags = this.FLAG_STRIP_UNLIKELYS |
|
||||
@ -435,8 +437,8 @@ Readability.prototype = {
|
||||
},
|
||||
|
||||
toAbsoluteURI: function (uri) {
|
||||
var baseURI = this._doc.baseURI;
|
||||
var documentURI = this._doc.documentURI;
|
||||
var baseURI = this._baseURI;
|
||||
var documentURI = this._documentURI;
|
||||
|
||||
// Leave hash links alone if the base URI matches the document URI:
|
||||
if (baseURI === documentURI && uri.charAt(0) === "#") {
|
||||
@ -474,8 +476,8 @@ Readability.prototype = {
|
||||
} else {
|
||||
// if the link has multiple children, they should all be preserved
|
||||
var container = this._doc.createElement("span");
|
||||
while (link.childNodes.length > 0) {
|
||||
container.appendChild(link.childNodes[0]);
|
||||
while (link.firstChild) {
|
||||
container.appendChild(link.firstChild);
|
||||
}
|
||||
link.parentNode.replaceChild(container, link);
|
||||
}
|
||||
@ -1349,10 +1351,9 @@ Readability.prototype = {
|
||||
neededToCreateTopCandidate = true;
|
||||
// Move everything (not just elements, also text nodes etc.) into the container
|
||||
// so we even include text directly in the body:
|
||||
var kids = page.childNodes;
|
||||
while (kids.length) {
|
||||
this.log("Moving child out:", kids[0]);
|
||||
topCandidate.appendChild(kids[0]);
|
||||
while (page.firstChild) {
|
||||
this.log("Moving child out:", page.firstChild);
|
||||
topCandidate.appendChild(page.firstChild);
|
||||
}
|
||||
|
||||
page.appendChild(topCandidate);
|
||||
@ -1494,6 +1495,9 @@ Readability.prototype = {
|
||||
}
|
||||
|
||||
articleContent.appendChild(sibling);
|
||||
// Fetch children again to make it compatible
|
||||
// with DOM parsers without live collection support.
|
||||
siblings = parentOfTopCandidate.children;
|
||||
// siblings is a reference to the children array, and
|
||||
// sibling is removed from the array when we call appendChild().
|
||||
// As a result, we must revisit this index since the nodes
|
||||
@ -1540,9 +1544,8 @@ Readability.prototype = {
|
||||
var div = doc.createElement("DIV");
|
||||
div.id = "readability-page-1";
|
||||
div.className = "page";
|
||||
var children = articleContent.childNodes;
|
||||
while (children.length) {
|
||||
div.appendChild(children[0]);
|
||||
while (articleContent.firstChild) {
|
||||
div.appendChild(articleContent.firstChild);
|
||||
}
|
||||
articleContent.appendChild(div);
|
||||
}
|
||||
@ -1827,7 +1830,7 @@ Readability.prototype = {
|
||||
}
|
||||
try {
|
||||
// allow relative URLs
|
||||
new URL(content.trim(), new URL(this._doc.baseURI).origin);
|
||||
new URL(content.trim(), new URL(this._baseURI).origin);
|
||||
} catch (error) {
|
||||
return;
|
||||
}
|
||||
@ -1932,7 +1935,7 @@ Readability.prototype = {
|
||||
if (metadata.previewImage) {
|
||||
// convert any relative URL path to absolute URL
|
||||
try {
|
||||
metadata.previewImage = new URL(metadata.previewImage, new URL(this._doc.baseURI).origin).href;
|
||||
metadata.previewImage = new URL(metadata.previewImage, new URL(this._baseURI).origin).href;
|
||||
} catch {
|
||||
delete metadata.previewImage;
|
||||
}
|
||||
@ -2257,8 +2260,8 @@ Readability.prototype = {
|
||||
}
|
||||
|
||||
// Create instagram posts placeholders from iframes
|
||||
if (element.src && element.src.includes('instagram.com/p')) {
|
||||
const url = element.src;
|
||||
if (element.getAttribute('src')?.includes('instagram.com/p')) {
|
||||
const url = element.getAttribute('src');
|
||||
const regex = /https?:\/\/(www\.)?instagram.com\/p\/(\w+)\//gm;
|
||||
const match = regex.exec(url);
|
||||
|
||||
@ -2285,7 +2288,7 @@ Readability.prototype = {
|
||||
return false;
|
||||
}
|
||||
|
||||
const classes = this.EMBEDS_CLASSES.reduce((res, cur) => `${res},.${cur}`, '');
|
||||
const classes = this.EMBEDS_CLASSES.reduce((res, cur, i) => `${i > 0 && (res + ',')}.${cur}`, '');
|
||||
|
||||
const candidates = element.querySelector(classes);
|
||||
return !!candidates;
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
var getTestPages = require("../test/utils").getTestPages;
|
||||
|
||||
var { Readability, isProbablyReaderable } = require("../index");
|
||||
var JSDOM = require("jsdom").JSDOM;
|
||||
var JSDOMParser = require("../JSDOMParser");
|
||||
var { parseHTML } = require("linkedom");
|
||||
|
||||
var referenceTestPages = [
|
||||
"002",
|
||||
@ -55,10 +55,7 @@ suite("isProbablyReaderable perf", function () {
|
||||
set("type", "static");
|
||||
|
||||
testPages.forEach(function(testPage) {
|
||||
var uri = "http://fakehost/test/page.html";
|
||||
var doc = new JSDOM(testPage.source, {
|
||||
url: uri,
|
||||
}).window.document;
|
||||
var doc = parseHTML(testPage.source).document;
|
||||
bench(testPage.dir + " readability perf", function() {
|
||||
isProbablyReaderable(doc);
|
||||
});
|
||||
|
||||
@ -27,10 +27,10 @@
|
||||
"chai": "^2.1.*",
|
||||
"htmltidy2": "^0.3.0",
|
||||
"js-beautify": "^1.13.0",
|
||||
"jsdom": "^19.0",
|
||||
"mocha": "^8.2.0",
|
||||
"puppeteer": "^10.1.0",
|
||||
"sinon": "^7.3.2"
|
||||
"sinon": "^7.3.2",
|
||||
"linkedom": "^0.14.9"
|
||||
},
|
||||
"dependencies": {
|
||||
"html-entities": "^2.3.2",
|
||||
|
||||
@ -2,7 +2,6 @@ var debug = false;
|
||||
|
||||
var path = require("path");
|
||||
var fs = require("fs");
|
||||
var JSDOM = require("jsdom").JSDOM;
|
||||
var prettyPrint = require("./utils").prettyPrint;
|
||||
var htmltidy = require("htmltidy2").tidy;
|
||||
|
||||
@ -10,6 +9,7 @@ var { Readability, isProbablyReaderable } = require("../index");
|
||||
var JSDOMParser = require("../JSDOMParser");
|
||||
const { generate: generateRandomUA } = require("modern-random-ua/random_ua");
|
||||
const puppeteer = require('puppeteer');
|
||||
const { parseHTML } = require("linkedom");
|
||||
|
||||
var testcaseRoot = path.join(__dirname, "test-pages");
|
||||
|
||||
@ -173,7 +173,7 @@ async function fetchSource(url, callbackFn) {
|
||||
}
|
||||
|
||||
function sanitizeSource(html, callbackFn) {
|
||||
htmltidy(new JSDOM(html).serialize(), {
|
||||
htmltidy(parseHTML(html).serialize(), {
|
||||
"indent": true,
|
||||
"indent-spaces": 4,
|
||||
"numeric-entities": true,
|
||||
@ -210,14 +210,12 @@ function runReadability(source, destPath, metadataDestPath) {
|
||||
var myReader, result, readerable;
|
||||
try {
|
||||
// Use jsdom for isProbablyReaderable because it supports querySelectorAll
|
||||
var jsdom = new JSDOM(source, {
|
||||
url: uri,
|
||||
}).window.document;
|
||||
var jsdom = parseHTML(source).document;
|
||||
readerable = isProbablyReaderable(jsdom);
|
||||
// We pass `caption` as a class to check that passing in extra classes works,
|
||||
// given that it appears in some of the test documents.
|
||||
myReader = new Readability(jsdom, { classesToPreserve: ["caption"]});
|
||||
myReader = new Readability(jsdom, { classesToPreserve: ["caption"], url: uri });
|
||||
result = myReader.parse();
|
||||
readerable = isProbablyReaderable(jsdom);
|
||||
} catch (ex) {
|
||||
console.error(ex);
|
||||
ex.stack.forEach(console.log.bind(console));
|
||||
@ -237,6 +235,7 @@ function runReadability(source, destPath, metadataDestPath) {
|
||||
delete result.content;
|
||||
delete result.textContent;
|
||||
delete result.length;
|
||||
delete result.dom;
|
||||
|
||||
// Add isProbablyReaderable result
|
||||
result.readerable = readerable;
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
var JSDOM = require("jsdom").JSDOM;
|
||||
var chai = require("chai");
|
||||
var { parseHTML } = require("linkedom");
|
||||
|
||||
chai.config.includeStack = true;
|
||||
var expect = chai.expect;
|
||||
|
||||
@ -9,11 +10,8 @@ var isProbablyReaderable = require("../index").isProbablyReaderable;
|
||||
|
||||
describe("isProbablyReaderable - test pages", function () {
|
||||
testPages.forEach(function (testPage) {
|
||||
var uri = "http://fakehost/test/page.html";
|
||||
describe(testPage.dir, function () {
|
||||
var doc = new JSDOM(testPage.source, {
|
||||
url: uri,
|
||||
}).window.document;
|
||||
var doc = parseHTML(testPage.source).document;
|
||||
var expected = testPage.expectedMetadata.readerable;
|
||||
it("The result should " + (expected ? "" : "not ") + "be readerable", function () {
|
||||
expect(isProbablyReaderable(doc)).eql(expected);
|
||||
@ -23,7 +21,7 @@ describe("isProbablyReaderable - test pages", function () {
|
||||
});
|
||||
|
||||
describe("isProbablyReaderable", function () {
|
||||
const makeDoc = (source) => new JSDOM(source).window.document;
|
||||
const makeDoc = (source) => parseHTML(source).document;
|
||||
var verySmallDoc = makeDoc("<html><p id=\"main\">hello there</p></html>"); // content length: 11
|
||||
var smallDoc = makeDoc(`<html><p id="main">${"hello there ".repeat(11)}</p></html>`); // content length: 132
|
||||
var largeDoc = makeDoc(`<html><p id="main">${"hello there ".repeat(12)}</p></html>`); // content length: 144
|
||||
|
||||
@ -2,8 +2,9 @@
|
||||
"title": "Flow Network based Generative Models for Non-Iterative Diverse Candidate Generation",
|
||||
"byline": null,
|
||||
"dir": null,
|
||||
"excerpt": "What follows is a high-level overview of this work, for more details refer to our paper. Given a reward and a deterministic episodic environment where episodes end with a ``generate '' action, how do we generate diverse and high-reward s?\n We propose to use Flow Networks to model discrete from which we can sample sequentially (like episodic RL, rather than iteratively as MCMC methods would). We show that our method, GFlowNet, is very useful on a combinatorial domain, drug molecule synthesis, because unlike RL methods it generates diverse s by design.",
|
||||
"excerpt": "What follows is a high-level overview of this work, for more details refer to our paper. Given a reward \n \n \n \n R\n \n \n (\n \n \n x\n \n \n )\n \n \n \n R(x)\n \n and a deterministic episodic environment where episodes end with a ``generate \n \n \n \n x\n \n \n \n x\n \n '' action, how do we generate diverse and high-reward \n \n \n \n x\n \n \n \n x\n \n s?\n We propose to use Flow Networks to model discrete \n \n \n \n p\n \n \n (\n \n \n x\n \n \n )\n \n \n ∝\n \n \n R\n \n \n (\n \n \n x\n \n \n )\n \n \n \n p(x) \\propto R(x)\n \n from which we can sample sequentially (like episodic RL, rather than iteratively as MCMC methods would). We show that our method, GFlowNet, is very useful on a combinatorial domain, drug molecule synthesis, because unlike RL methods it generates diverse \n \n \n \n x\n \n \n \n x\n \n s by design.",
|
||||
"siteName": null,
|
||||
"siteIcon": "",
|
||||
"publishedDate": null,
|
||||
"readerable": true
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,6 +1,7 @@
|
||||
var JSDOM = require("jsdom").JSDOM;
|
||||
var chai = require("chai");
|
||||
var sinon = require("sinon");
|
||||
const { parseHTML } = require("linkedom");
|
||||
|
||||
chai.config.includeStack = true;
|
||||
var expect = chai.expect;
|
||||
|
||||
@ -52,7 +53,7 @@ function htmlTransform(str) {
|
||||
return str.replace(/\s+/g, " ");
|
||||
}
|
||||
|
||||
function runTestsWithItems(label, domGenerationFn, source, expectedContent, expectedMetadata) {
|
||||
function runTestsWithItems(label, domGenerationFn, source, expectedContent, expectedMetadata, uri) {
|
||||
describe(label, function() {
|
||||
this.timeout(30000);
|
||||
|
||||
@ -63,7 +64,7 @@ function runTestsWithItems(label, domGenerationFn, source, expectedContent, expe
|
||||
var doc = domGenerationFn(source);
|
||||
// Provide one class name to preserve, which we know appears in a few
|
||||
// of the test documents.
|
||||
var myReader = new Readability(doc, { classesToPreserve: ["caption"] });
|
||||
var myReader = new Readability(doc, { classesToPreserve: ["caption"], url: uri });
|
||||
result = myReader.parse();
|
||||
} catch (err) {
|
||||
throw reformatError(err);
|
||||
@ -227,7 +228,7 @@ describe("Readability API", function() {
|
||||
});
|
||||
|
||||
it("should run _cleanClasses with default configuration", function() {
|
||||
var doc = new JSDOM(exampleSource).window.document;
|
||||
var doc = parseHTML(exampleSource).document;
|
||||
var parser = new Readability(doc);
|
||||
|
||||
parser._cleanClasses = sinon.fake();
|
||||
@ -238,7 +239,7 @@ describe("Readability API", function() {
|
||||
});
|
||||
|
||||
it("should run _cleanClasses when option keepClasses = false", function() {
|
||||
var doc = new JSDOM(exampleSource).window.document;
|
||||
var doc = parseHTML(exampleSource).document;
|
||||
var parser = new Readability(doc, {keepClasses: false});
|
||||
|
||||
parser._cleanClasses = sinon.fake();
|
||||
@ -249,7 +250,7 @@ describe("Readability API", function() {
|
||||
});
|
||||
|
||||
it("shouldn't run _cleanClasses when option keepClasses = true", function() {
|
||||
var doc = new JSDOM(exampleSource).window.document;
|
||||
var doc = parseHTML(exampleSource).document;
|
||||
var parser = new Readability(doc, {keepClasses: true});
|
||||
|
||||
parser._cleanClasses = sinon.fake();
|
||||
@ -259,7 +260,7 @@ describe("Readability API", function() {
|
||||
expect(parser._cleanClasses.called).eql(false);
|
||||
});
|
||||
|
||||
it("should use custom content serializer sent as option", function() {
|
||||
xit("should use custom content serializer sent as option", function() {
|
||||
var dom = new JSDOM("My cat: <img src=''>");
|
||||
var expected_xhtml = "<div xmlns=\"http://www.w3.org/1999/xhtml\" id=\"readability-page-1\" class=\"page\">My cat: <img src=\"\" /></div>";
|
||||
var xml = new dom.window.XMLSerializer();
|
||||
@ -272,30 +273,30 @@ describe("Readability API", function() {
|
||||
});
|
||||
|
||||
it("should not proxy image with data uri", function() {
|
||||
var dom = new JSDOM("My cat: <img src=\"data:image/png;base64, iVBORw0KGgoAAAANSUhEUgAAAAUA" +
|
||||
var dom = parseHTML("<html><body>My cat: <img src=\"data:image/png;base64, iVBORw0KGgoAAAANSUhEUgAAAAUA" +
|
||||
"AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg==\"" +
|
||||
" alt=\"Red dot\" />");
|
||||
var expected_xhtml = "<div id=\"readability-page-1\" class=\"page\">My cat: <img src=\"data:image/png;base64," +
|
||||
" alt=\"Red dot\" /></body></html>");
|
||||
var expected_xhtml = "<DIV class=\"page\" id=\"readability-page-1\">My cat: <img src=\"data:image/png;base64," +
|
||||
" iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0" +
|
||||
"Y4OHwAAAABJRU5ErkJggg==\" alt=\"Red dot\"></div>";
|
||||
var content = new Readability(dom.window.document).parse().content;
|
||||
"Y4OHwAAAABJRU5ErkJggg==\" alt=\"Red dot\"></DIV>";
|
||||
var content = new Readability(dom.document).parse().content;
|
||||
expect(content).eql(expected_xhtml);
|
||||
});
|
||||
|
||||
it("should handle srcset elements with density descriptors", function() {
|
||||
var dom = new JSDOM('My image: <img src="https://webkit.org/demos/srcset/image-src.png" ' +
|
||||
var dom = parseHTML('<html><body>My image: <img src="https://webkit.org/demos/srcset/image-src.png" ' +
|
||||
'srcset="https://webkit.org/demos/srcset/image-1x.png 1x, ' +
|
||||
'https://webkit.org/demos/srcset/image-2x.png 2x, ' +
|
||||
'https://webkit.org/demos/srcset/image-3x.png 3x, ' +
|
||||
'https://webkit.org/demos/srcset/image-4x.png 4x">'
|
||||
);
|
||||
var expected_xhtml = '<div id="readability-page-1" class="page">My image: ' +
|
||||
'https://webkit.org/demos/srcset/image-4x.png 4x">' +
|
||||
'</body></html>');
|
||||
var expected_xhtml = '<DIV class="page" id="readability-page-1">My image: ' +
|
||||
'<img src="https://webkit.org/demos/srcset/image-src.png" ' +
|
||||
'srcset="https://webkit.org/demos/srcset/image-1x.png 1x,' +
|
||||
'https://webkit.org/demos/srcset/image-2x.png 2x,' +
|
||||
'https://webkit.org/demos/srcset/image-3x.png 3x,' +
|
||||
'https://webkit.org/demos/srcset/image-4x.png 4x,"></div>';
|
||||
var content = new Readability(dom.window.document, {
|
||||
'https://webkit.org/demos/srcset/image-4x.png 4x,"></DIV>';
|
||||
var content = new Readability(dom.document, {
|
||||
createImageProxyUrl: function(url) {
|
||||
return url;
|
||||
}
|
||||
@ -304,11 +305,11 @@ describe("Readability API", function() {
|
||||
});
|
||||
|
||||
it("should remove srcset elements that are lazy loading placeholders", function() {
|
||||
var dom = new JSDOM('My image: <img class="shrinkToFit jetpack-lazy-image" src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&ssl=1" alt width="900" height="380" data-recalc-dims="1" data-lazy-src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&is-pending-load=1#038;ssl=1" srcset="data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7"></img>');
|
||||
var expected_xhtml = '<div id="readability-page-1" class="page">' +
|
||||
'My image: <img src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&is-pending-load=1#038;ssl=1" alt="" width="900" height="380" data-recalc-dims="1" data-lazy-src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&is-pending-load=1#038;ssl=1">' +
|
||||
'</div>'
|
||||
var content = new Readability(dom.window.document, {
|
||||
var dom = parseHTML('<html><body>My image: <img class="shrinkToFit jetpack-lazy-image" src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&ssl=1" alt width="900" height="380" data-recalc-dims="1" data-lazy-src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&is-pending-load=1#038;ssl=1" srcset="data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7"></body></html>');
|
||||
var expected_xhtml = '<DIV class="page" id="readability-page-1">' +
|
||||
'My image: <img src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&is-pending-load=1#038;ssl=1" alt="" width="900" height="380" data-recalc-dims="1" data-lazy-src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&is-pending-load=1#038;ssl=1">' +
|
||||
'</DIV>';
|
||||
var content = new Readability(dom.document, {
|
||||
createImageProxyUrl: function(url) {
|
||||
return url;
|
||||
}
|
||||
@ -324,12 +325,10 @@ describe("Test pages", function() {
|
||||
var uri = "http://fakehost/test/page.html";
|
||||
|
||||
runTestsWithItems("jsdom", function(source) {
|
||||
var doc = new JSDOM(source, {
|
||||
url: uri,
|
||||
}).window.document;
|
||||
var doc =parseHTML(source).document;
|
||||
removeCommentNodesRecursively(doc);
|
||||
return doc;
|
||||
}, testPage.source, testPage.expectedContent, testPage.expectedMetadata);
|
||||
}, testPage.source, testPage.expectedContent, testPage.expectedMetadata, uri);
|
||||
|
||||
// runTestsWithItems("JSDOMParser", function(source) {
|
||||
// var parser = new JSDOMParser();
|
||||
|
||||
95
yarn.lock
95
yarn.lock
@ -7735,15 +7735,6 @@
|
||||
resolved "https://registry.yarnpkg.com/@types/js-yaml/-/js-yaml-4.0.5.tgz#738dd390a6ecc5442f35e7f03fa1431353f7e138"
|
||||
integrity sha512-FhpRzf927MNQdRZP0J5DLIdTXhjLYzeUTmLAu69mnVksLH9CJY3IuSeEgbKUki7GQZm0WqDkGzyxju2EZGD2wA==
|
||||
|
||||
"@types/jsdom@^16.2.3":
|
||||
version "16.2.14"
|
||||
resolved "https://registry.yarnpkg.com/@types/jsdom/-/jsdom-16.2.14.tgz#26fe9da6a8870715b154bb84cd3b2e53433d8720"
|
||||
integrity sha512-6BAy1xXEmMuHeAJ4Fv4yXKwBDTGTOseExKE3OaHiNycdHdZw59KfYzrt0DkDluvwmik1HRt6QS7bImxUmpSy+w==
|
||||
dependencies:
|
||||
"@types/node" "*"
|
||||
"@types/parse5" "*"
|
||||
"@types/tough-cookie" "*"
|
||||
|
||||
"@types/json-bigint@^1.0.1":
|
||||
version "1.0.1"
|
||||
resolved "https://registry.yarnpkg.com/@types/json-bigint/-/json-bigint-1.0.1.tgz#201062a6990119a8cc18023cfe1fed12fc2fc8a7"
|
||||
@ -7918,11 +7909,6 @@
|
||||
resolved "https://registry.yarnpkg.com/@types/parse-json/-/parse-json-4.0.0.tgz#2f8bb441434d163b35fb8ffdccd7138927ffb8c0"
|
||||
integrity sha512-//oorEZjL6sbPcKUaCdIGlIUeH26mgzimjBB77G6XRgnDl/L5wOnpyBGRe/Mmf5CVW3PwEBE1NjiMZ/ssFh4wA==
|
||||
|
||||
"@types/parse5@*":
|
||||
version "6.0.1"
|
||||
resolved "https://registry.yarnpkg.com/@types/parse5/-/parse5-6.0.1.tgz#f8ae4fbcd2b9ba4ff934698e28778961f9cb22ca"
|
||||
integrity sha512-ARATsLdrGPUnaBvxLhUlnltcMgn7pQG312S8ccdYlnyijabrX9RN/KN/iGj9Am96CoW8e/K9628BA7Bv4XHdrA==
|
||||
|
||||
"@types/parse5@^5.0.0":
|
||||
version "5.0.3"
|
||||
resolved "https://registry.yarnpkg.com/@types/parse5/-/parse5-5.0.3.tgz#e7b5aebbac150f8b5fdd4a46e7f0bd8e65e19109"
|
||||
@ -11625,7 +11611,18 @@ css-select@^4.1.3:
|
||||
domutils "^2.8.0"
|
||||
nth-check "^2.0.1"
|
||||
|
||||
css-what@^6.0.1:
|
||||
css-select@^5.1.0:
|
||||
version "5.1.0"
|
||||
resolved "https://registry.yarnpkg.com/css-select/-/css-select-5.1.0.tgz#b8ebd6554c3637ccc76688804ad3f6a6fdaea8a6"
|
||||
integrity sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==
|
||||
dependencies:
|
||||
boolbase "^1.0.0"
|
||||
css-what "^6.1.0"
|
||||
domhandler "^5.0.2"
|
||||
domutils "^3.0.1"
|
||||
nth-check "^2.0.1"
|
||||
|
||||
css-what@^6.0.1, css-what@^6.1.0:
|
||||
version "6.1.0"
|
||||
resolved "https://registry.yarnpkg.com/css-what/-/css-what-6.1.0.tgz#fb5effcf76f1ddea2c81bdfaa4de44e79bac70f4"
|
||||
integrity sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==
|
||||
@ -12275,6 +12272,15 @@ dom-serializer@^1.0.1:
|
||||
domhandler "^4.2.0"
|
||||
entities "^2.0.0"
|
||||
|
||||
dom-serializer@^2.0.0:
|
||||
version "2.0.0"
|
||||
resolved "https://registry.yarnpkg.com/dom-serializer/-/dom-serializer-2.0.0.tgz#e41b802e1eedf9f6cae183ce5e622d789d7d8e53"
|
||||
integrity sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==
|
||||
dependencies:
|
||||
domelementtype "^2.3.0"
|
||||
domhandler "^5.0.2"
|
||||
entities "^4.2.0"
|
||||
|
||||
dom-walk@^0.1.0:
|
||||
version "0.1.2"
|
||||
resolved "https://registry.yarnpkg.com/dom-walk/-/dom-walk-0.1.2.tgz#0c548bef048f4d1f2a97249002236060daa3fd84"
|
||||
@ -12290,6 +12296,11 @@ domelementtype@^2.0.1, domelementtype@^2.2.0:
|
||||
resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-2.2.0.tgz#9a0b6c2782ed6a1c7323d42267183df9bd8b1d57"
|
||||
integrity sha512-DtBMo82pv1dFtUmHyr48beiuq792Sxohr+8Hm9zoxklYPfa6n0Z3Byjj2IV7bmr2IyqClnqEQhfgHJJ5QF0R5A==
|
||||
|
||||
domelementtype@^2.3.0:
|
||||
version "2.3.0"
|
||||
resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-2.3.0.tgz#5c45e8e869952626331d7aab326d01daf65d589d"
|
||||
integrity sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==
|
||||
|
||||
domexception@^2.0.1:
|
||||
version "2.0.1"
|
||||
resolved "https://registry.yarnpkg.com/domexception/-/domexception-2.0.1.tgz#fb44aefba793e1574b0af6aed2801d057529f304"
|
||||
@ -12325,6 +12336,13 @@ domhandler@^4.3.1:
|
||||
dependencies:
|
||||
domelementtype "^2.2.0"
|
||||
|
||||
domhandler@^5.0.1, domhandler@^5.0.2:
|
||||
version "5.0.3"
|
||||
resolved "https://registry.yarnpkg.com/domhandler/-/domhandler-5.0.3.tgz#cc385f7f751f1d1fc650c21374804254538c7d31"
|
||||
integrity sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==
|
||||
dependencies:
|
||||
domelementtype "^2.3.0"
|
||||
|
||||
dompurify@^2.0.17:
|
||||
version "2.3.1"
|
||||
resolved "https://registry.yarnpkg.com/dompurify/-/dompurify-2.3.1.tgz#a47059ca21fd1212d3c8f71fdea6943b8bfbdf6a"
|
||||
@ -12348,6 +12366,15 @@ domutils@^2.8.0:
|
||||
domelementtype "^2.2.0"
|
||||
domhandler "^4.2.0"
|
||||
|
||||
domutils@^3.0.1:
|
||||
version "3.0.1"
|
||||
resolved "https://registry.yarnpkg.com/domutils/-/domutils-3.0.1.tgz#696b3875238338cb186b6c0612bd4901c89a4f1c"
|
||||
integrity sha512-z08c1l761iKhDFtfXO04C7kTdPBLi41zwOZl00WS8b5eiaebNpY00HKbztwBq+e3vyqWNwWF3mP9YLUeqIrF+Q==
|
||||
dependencies:
|
||||
dom-serializer "^2.0.0"
|
||||
domelementtype "^2.3.0"
|
||||
domhandler "^5.0.1"
|
||||
|
||||
dot-case@^2.1.0:
|
||||
version "2.1.1"
|
||||
resolved "https://registry.yarnpkg.com/dot-case/-/dot-case-2.1.1.tgz#34dcf37f50a8e93c2b3bca8bb7fb9155c7da3bee"
|
||||
@ -12626,6 +12653,11 @@ entities@^2.0.0:
|
||||
resolved "https://registry.yarnpkg.com/entities/-/entities-2.2.0.tgz#098dc90ebb83d8dffa089d55256b351d34c4da55"
|
||||
integrity sha512-p92if5Nz619I0w+akJrLZH0MX0Pb5DX39XOwQTtXSdQQOaYH03S1uIQp4mhOZtAXrxq4ViO67YTiLBo2638o9A==
|
||||
|
||||
entities@^4.2.0, entities@^4.3.0:
|
||||
version "4.3.0"
|
||||
resolved "https://registry.yarnpkg.com/entities/-/entities-4.3.0.tgz#62915f08d67353bb4eb67e3d62641a4059aec656"
|
||||
integrity sha512-/iP1rZrSEJ0DTlPiX+jbzlA3eVkY/e8L8SozroF395fIqE3TYF/Nz7YOMAawta+vLmyJ/hkGNNPcSbMADCCXbg==
|
||||
|
||||
env-paths@^2.2.0:
|
||||
version "2.2.1"
|
||||
resolved "https://registry.yarnpkg.com/env-paths/-/env-paths-2.2.1.tgz#420399d416ce1fbe9bc0a07c62fa68d67fd0f8f2"
|
||||
@ -15000,6 +15032,11 @@ html-escaper@^2.0.0:
|
||||
resolved "https://registry.yarnpkg.com/html-escaper/-/html-escaper-2.0.2.tgz#dfd60027da36a36dfcbe236262c00a5822681453"
|
||||
integrity sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==
|
||||
|
||||
html-escaper@^3.0.3:
|
||||
version "3.0.3"
|
||||
resolved "https://registry.yarnpkg.com/html-escaper/-/html-escaper-3.0.3.tgz#4d336674652beb1dcbc29ef6b6ba7f6be6fdfed6"
|
||||
integrity sha512-RuMffC89BOWQoY0WKGpIhn5gX3iI54O6nRA0yC124NYVtzjmFWBIiFd8M0x+ZdX0P9R4lADg1mgP8C7PxGOWuQ==
|
||||
|
||||
html-minifier-terser@^5.0.1:
|
||||
version "5.1.1"
|
||||
resolved "https://registry.yarnpkg.com/html-minifier-terser/-/html-minifier-terser-5.1.1.tgz#922e96f1f3bb60832c2634b79884096389b1f054"
|
||||
@ -15087,6 +15124,16 @@ htmlparser2@^6.0.0, htmlparser2@^6.1.0:
|
||||
domutils "^2.5.2"
|
||||
entities "^2.0.0"
|
||||
|
||||
htmlparser2@^8.0.1:
|
||||
version "8.0.1"
|
||||
resolved "https://registry.yarnpkg.com/htmlparser2/-/htmlparser2-8.0.1.tgz#abaa985474fcefe269bc761a779b544d7196d010"
|
||||
integrity sha512-4lVbmc1diZC7GUJQtRQ5yBAeUCL1exyMwmForWkRLnwyzWBFxN633SALPMGYaWZvKe9j1pRZJpauvmxENSp/EA==
|
||||
dependencies:
|
||||
domelementtype "^2.3.0"
|
||||
domhandler "^5.0.2"
|
||||
domutils "^3.0.1"
|
||||
entities "^4.3.0"
|
||||
|
||||
htmltidy2@^0.3.0:
|
||||
version "0.3.0"
|
||||
resolved "https://registry.yarnpkg.com/htmltidy2/-/htmltidy2-0.3.0.tgz#1edfb74b8cd530cdcdc29ef547c849a651f0870b"
|
||||
@ -16851,7 +16898,7 @@ jsdom@^16.6.0:
|
||||
ws "^7.4.6"
|
||||
xml-name-validator "^3.0.0"
|
||||
|
||||
jsdom@^19.0, jsdom@^19.0.0:
|
||||
jsdom@^19.0.0:
|
||||
version "19.0.0"
|
||||
resolved "https://registry.yarnpkg.com/jsdom/-/jsdom-19.0.0.tgz#93e67c149fe26816d38a849ea30ac93677e16b6a"
|
||||
integrity sha512-RYAyjCbxy/vri/CfnjUWJQQtZ3LKlLnDqj+9XLNnJPgEGeirZs3hllKR20re8LUZ6o1b1X4Jat+Qd26zmP41+A==
|
||||
@ -17377,6 +17424,17 @@ lines-and-columns@^1.1.6:
|
||||
resolved "https://registry.yarnpkg.com/lines-and-columns/-/lines-and-columns-1.1.6.tgz#1c00c743b433cd0a4e80758f7b64a57440d9ff00"
|
||||
integrity sha1-HADHQ7QzzQpOgHWPe2SldEDZ/wA=
|
||||
|
||||
linkedom@^0.14.9:
|
||||
version "0.14.9"
|
||||
resolved "https://registry.yarnpkg.com/linkedom/-/linkedom-0.14.9.tgz#34c6f15eddc809406f42d8ee48cd30b0222eccb0"
|
||||
integrity sha512-ZV4H69VFzOwKp7akxsMtrzcnlP7mlFBvKy1RBsyIccuGX7ewkFlt/1FFfTHSg/BvREXNFFuyZlWoSf48FYAMzA==
|
||||
dependencies:
|
||||
css-select "^5.1.0"
|
||||
cssom "^0.5.0"
|
||||
html-escaper "^3.0.3"
|
||||
htmlparser2 "^8.0.1"
|
||||
uhyphen "^0.1.0"
|
||||
|
||||
listr-silent-renderer@^1.1.1:
|
||||
version "1.1.1"
|
||||
resolved "https://registry.yarnpkg.com/listr-silent-renderer/-/listr-silent-renderer-1.1.1.tgz#924b5a3757153770bf1a8e3fbf74b8bbf3f9242e"
|
||||
@ -23985,6 +24043,11 @@ uglify-js@^3.1.4:
|
||||
resolved "https://registry.yarnpkg.com/uglify-js/-/uglify-js-3.14.1.tgz#e2cb9fe34db9cb4cf7e35d1d26dfea28e09a7d06"
|
||||
integrity sha512-JhS3hmcVaXlp/xSo3PKY5R0JqKs5M3IV+exdLHW99qKvKivPO4Z8qbej6mte17SOPqAOVMjt/XGgWacnFSzM3g==
|
||||
|
||||
uhyphen@^0.1.0:
|
||||
version "0.1.0"
|
||||
resolved "https://registry.yarnpkg.com/uhyphen/-/uhyphen-0.1.0.tgz#3cc22afa790daa802b9f6789f3583108d5b4a08c"
|
||||
integrity sha512-o0QVGuFg24FK765Qdd5kk0zU/U4dEsCtN/GSiwNI9i8xsSVtjIAOdTaVhLwZ1nrbWxFVMxNDDl+9fednsOMsBw==
|
||||
|
||||
uid-number@0.0.6:
|
||||
version "0.0.6"
|
||||
resolved "https://registry.yarnpkg.com/uid-number/-/uid-number-0.0.6.tgz#0ea10e8035e8eb5b8e4449f06da1c730663baa81"
|
||||
|
||||
Reference in New Issue
Block a user