Rewrite puppeteer-parse in TypeScript

This commit is contained in:
Hongbo Wu
2024-01-15 23:32:26 +08:00
parent 51e586ed3d
commit cd3402b98a
13 changed files with 678 additions and 330 deletions

View File

@ -0,0 +1,75 @@
const { interfaces } = require('mocha');
const { uploadPdf, sendSavePageMutation, sendCreateArticleMutation, sendImportStatusUpdate } = require('./api');
// Payload describing a single item (web page or PDF) to be imported.
interface Item {
  url: string;
  userId: string;
  contentType: string;
  articleSavingRequestId: string;
  state: string;
  labels: string[];
  source: string;
  folder: string;
  rssFeedUrl: string;
  savedAt: string;
  publishedAt: string;
  readabilityResult: string;
  // Optional page metadata — only meaningful for non-PDF items.
  // (The original code referenced `title`/`content` without declaring them.)
  title?: string;
  content?: string;
}

// Retry counter value at which we stop retrying and mark the import failed.
// NOTE(review): assumed from the Cloud Tasks queue config — confirm. Cloud
// Tasks delivers `x-cloudtasks-taskretrycount` as a string header.
const MAX_RETRY_COUNT = '3';

/**
 * Saves an item: PDFs are uploaded and registered via createArticle; all
 * other content is sent through the savePage mutation.
 *
 * Returns:
 *  - `false` when the API call failed (caller should retry),
 *  - `true` when the user no longer exists (do not retry),
 *  - `undefined` otherwise (preserves the original fall-through behavior).
 *
 * @param item item to save
 * @param retryCount optional Cloud Tasks retry counter; the original code
 *   read `req.headers['x-cloudtasks-taskretrycount']`, but no `req` exists
 *   in this scope, so the value is now passed in explicitly.
 */
exports.saveItem = async (item: Item, retryCount?: string) => {
  const {
    url,
    userId,
    contentType,
    articleSavingRequestId,
    state,
    labels,
    source,
    folder,
    rssFeedUrl,
    savedAt,
    publishedAt,
    readabilityResult,
    title,
    content,
  } = item;
  // NOTE(review): importStatus is computed but never sent anywhere, even
  // though `sendImportStatusUpdate` is imported at the top of the file —
  // confirm whether a status-update call is missing here.
  let importStatus: string | undefined;
  try {
    if (contentType === 'application/pdf') {
      const uploadFileId = await uploadPdf(url, userId, articleSavingRequestId);
      const uploadedPdf = await sendCreateArticleMutation(userId, {
        url: encodeURI(url),
        articleSavingRequestId,
        uploadFileId,
        state,
        labels,
        source,
        folder,
        rssFeedUrl,
        savedAt,
        publishedAt,
      });
      if (!uploadedPdf) {
        console.error('error while saving uploaded pdf', url);
        return false;
      }
    } else {
      const apiResponse = await sendSavePageMutation(userId, {
        url,
        // fixed: original line had a stray trailing `h` after this property
        clientRequestId: articleSavingRequestId,
        title,
        originalContent: content,
        parseResult: readabilityResult,
        state,
        labels,
        rssFeedUrl,
        savedAt,
        publishedAt,
        source,
        folder,
      });
      if (!apiResponse) {
        console.error('error while saving page', url);
        return false;
      } else if (apiResponse.error === 'UNAUTHORIZED') {
        console.log('user is deleted, do not retry', userId);
        return true;
      } else {
        importStatus = readabilityResult ? 'imported' : 'failed';
      }
    }
  } catch (error) {
    // Original assigned to an undefined `logRecord`; log directly instead.
    console.error('error while saving item', url, error);
  } finally {
    // mark import failed on the last failed retry
    if (retryCount === MAX_RETRY_COUNT) {
      console.log('max retry count reached');
      importStatus = importStatus || 'failed';
    }
  }
};

View File

@ -36,7 +36,7 @@
"linkedom": "^0.14.16",
"lodash": "^4.17.21",
"luxon": "^3.0.4",
"puppeteer-core": "^19.1.1",
"puppeteer-core": "^20.9.0",
"underscore": "^1.13.6",
"uuid": "^9.0.0"
},

View File

@ -0,0 +1,2 @@
node_modules/
build/

View File

@ -0,0 +1,14 @@
{
"extends": "../../.eslintrc",
"parserOptions": {
"project": "tsconfig.json"
},
"rules": {
"@typescript-eslint/no-floating-promises": [
"error",
{
"ignoreIIFE": true
}
]
}
}

View File

@ -0,0 +1,5 @@
{
"extension": ["ts"],
"spec": "test/**/*.test.ts",
"require": "test/babel-register.js"
}

View File

@ -2,10 +2,14 @@
"name": "@omnivore/puppeteer-parse",
"version": "1.0.0",
"description": "Accepts URL of the article and parses its content",
"main": "index.js",
"main": "build/src/index.js",
"files": [
"build/src"
],
"dependencies": {
"@omnivore/content-handler": "1.0.0",
"@omnivore/readability": "1.0.0",
"axios": "^1.4.0",
"crypto": "^1.0.1",
"dompurify": "^2.4.1",
"linkedom": "^0.14.9",
@ -20,7 +24,10 @@
"mocha": "^10.0.0"
},
"scripts": {
"test": "mocha test/*.js"
"test": "yarn mocha -r ts-node/register --config mocha-config.json",
"test:typecheck": "tsc --noEmit",
"lint": "eslint src --ext ts,js,tsx,jsx",
"build": "tsc"
},
"volta": {
"extends": "../../package.json"

View File

@ -1,99 +1,106 @@
/* eslint-disable no-undef */
/* eslint-disable no-empty */
/* eslint-disable @typescript-eslint/explicit-function-return-type */
/* eslint-disable @typescript-eslint/no-var-requires */
/* eslint-disable @typescript-eslint/no-require-imports */
const { encode } = require("urlsafe-base64");
const crypto = require("crypto");
const Url = require('url');
const os = require('os');
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
import { preHandleContent, preParseContent } from '@omnivore/content-handler'
import { Readability } from '@omnivore/readability'
import axios from 'axios'
import crypto from 'crypto'
import createDOMPurify, { SanitizeElementHookEvent } from 'dompurify'
// const { Storage } = require('@google-cloud/storage');
const { parseHTML } = require('linkedom');
const { preHandleContent, preParseContent } = require("@omnivore/content-handler");
const { Readability } = require("@omnivore/readability");
const puppeteer = require('puppeteer-extra');
import { parseHTML } from 'linkedom'
import path from 'path'
import { Browser, BrowserContext, Page, Protocol } from 'puppeteer-core'
import puppeteer from 'puppeteer-extra'
import AdblockerPlugin from 'puppeteer-extra-plugin-adblocker'
import StealthPlugin from 'puppeteer-extra-plugin-stealth'
import Url from 'url'
import { encode } from 'urlsafe-base64'
// Add stealth plugin to hide puppeteer usage
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
puppeteer.use(StealthPlugin());
puppeteer.use(StealthPlugin())
// Add adblocker plugin to block all ads and trackers (saves bandwidth)
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker');
puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
const createDOMPurify = require("dompurify");
puppeteer.use(AdblockerPlugin({ blockTrackers: true }))
// const storage = new Storage();
const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : [];
// const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS
// ? process.env.ALLOWED_ORIGINS.split(',')
// : []
// const previewBucket = process.env.PREVIEW_IMAGE_BUCKET ? storage.bucket(process.env.PREVIEW_IMAGE_BUCKET) : undefined;
const filePath = `${os.tmpdir()}/previewImage.png`;
// const filePath = `${os.tmpdir()}/previewImage.png`
const MOBILE_USER_AGENT = 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
const DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
const BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
const NON_BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
const MOBILE_USER_AGENT =
'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
const DESKTOP_USER_AGENT =
'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
const BOT_DESKTOP_USER_AGENT =
'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
const NON_BOT_DESKTOP_USER_AGENT =
'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
const NON_BOT_HOSTS = ['bloomberg.com', 'forbes.com']
const NON_SCRIPT_HOSTS= ['medium.com', 'fastcompany.com', 'fortelabs.com'];
const NON_SCRIPT_HOSTS = ['medium.com', 'fastcompany.com', 'fortelabs.com']
const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/plain', 'application/pdf'];
const ALLOWED_CONTENT_TYPES = [
'text/html',
'application/octet-stream',
'text/plain',
'application/pdf',
]
const REQUEST_TIMEOUT = 30000
const userAgentForUrl = (url) => {
const userAgentForUrl = (url: string) => {
try {
const u = new URL(url);
const u = new URL(url)
for (const host of NON_BOT_HOSTS) {
if (u.hostname.endsWith(host)) {
return NON_BOT_DESKTOP_USER_AGENT;
return NON_BOT_DESKTOP_USER_AGENT
}
}
} catch (e) {
console.log('error getting user agent for url', url, e)
}
return DESKTOP_USER_AGENT
};
}
const fetchContentWithScrapingBee = async (url) => {
const fetchContentWithScrapingBee = async (url: string) => {
try {
const response = await axios.get('https://app.scrapingbee.com/api/v1', {
params: {
'api_key': process.env.SCRAPINGBEE_API_KEY,
'url': url,
'render_js': 'false',
'premium_proxy': 'true',
'country_code':'us'
api_key: process.env.SCRAPINGBEE_API_KEY,
url: url,
render_js: 'false',
premium_proxy: 'true',
country_code: 'us',
},
timeout: REQUEST_TIMEOUT,
})
const dom = parseHTML(response.data).document;
const dom = parseHTML(response.data).document
return { title: dom.title, domContent: dom.documentElement.outerHTML, url }
} catch (e) {
console.error('error fetching with scrapingbee', e.message)
console.error('error fetching with scrapingbee', e)
return { title: url, domContent: '', url }
}
}
const enableJavascriptForUrl = (url) => {
const enableJavascriptForUrl = (url: string) => {
try {
const u = new URL(url);
const u = new URL(url)
for (const host of NON_SCRIPT_HOSTS) {
if (u.hostname.endsWith(host)) {
return false;
return false
}
}
} catch (e) {
console.log('error getting hostname for url', url, e)
}
return true
};
}
// launch Puppeteer
const getBrowserPromise = (async () => {
console.log("starting puppeteer browser")
return puppeteer.launch({
console.log('starting puppeteer browser')
return (await puppeteer.launch({
args: [
'--allow-running-insecure-content',
'--autoplay-policy=user-gesture-required',
@ -123,112 +130,141 @@ const getBrowserPromise = (async () => {
height: 1080,
isLandscape: true,
isMobile: false,
width: 1920
width: 1920,
},
executablePath: process.env.CHROMIUM_PATH,
headless: !!process.env.LAUNCH_HEADLESS,
timeout: 120000, // 2 minutes
});
})();
})) as Browser
})()
async function fetchContent(url, locale, timezone) {
let functionStartTime = Date.now();
let logRecord = {
export const fetchContent = async (
url: string,
locale: string,
timezone: string
) => {
const functionStartTime = Date.now()
const logRecord = {
url,
functionStartTime,
locale,
timezone,
}
console.log(`content-fetch request`, logRecord);
console.log(`content-fetch request`, logRecord)
let context, page, finalUrl, title, content, contentType, readabilityResult = null;
let context: BrowserContext | undefined,
page: Page | undefined,
finalUrl: string | undefined,
title: string | undefined,
content: string | undefined,
contentType: string | undefined,
readabilityResult: Readability.ParseResult | null | undefined
try {
url = getUrl(url);
url = getUrl(url)
if (!url) {
throw new Error('Valid URL to parse not specified');
throw new Error('Valid URL to parse not specified')
}
// pre handle url with custom handlers
try {
const browser = await getBrowserPromise;
const result = await preHandleContent(url, browser);
const browser = await getBrowserPromise
const result = await preHandleContent(url, browser)
if (result && result.url) {
validateUrlString(url);
url = result.url;
validateUrlString(url)
url = result.url
}
if (result && result.title) {
title = result.title
}
if (result && result.content) {
content = result.content
}
if (result && result.contentType) {
contentType = result.contentType
}
if (result && result.title) { title = result.title }
if (result && result.content) { content = result.content }
if (result && result.contentType) { contentType = result.contentType }
} catch (e) {
console.info('error with handler: ', e);
console.info('error with handler: ', e)
}
if ((!content || !title) && contentType !== 'application/pdf') {
const result = await retrievePage(url, logRecord, functionStartTime, locale, timezone);
if (result && result.context) { context = result.context }
if (result && result.page) { page = result.page }
if (result && result.finalUrl) { finalUrl = result.finalUrl }
if (result && result.contentType) { contentType = result.contentType }
const result = await retrievePage(
url,
logRecord,
functionStartTime,
locale,
timezone
)
if (result && result.context) {
context = result.context
}
if (result && result.page) {
page = result.page
}
if (result && result.finalUrl) {
finalUrl = result.finalUrl
}
if (result && result.contentType) {
contentType = result.contentType
}
} else {
finalUrl = url
}
if (contentType !== 'application/pdf') {
if (!content || !title) {
const result = await retrieveHtml(page, logRecord);
if (page && (!content || !title)) {
const result = await retrieveHtml(page, logRecord)
if (result.isBlocked) {
const sbResult = await fetchContentWithScrapingBee(url)
title = sbResult.title
content = sbResult.domContent
} else {
title = result.title;
content = result.domContent;
title = result.title
content = result.domContent
}
} else {
console.info('using prefetched content and title');
console.info('using prefetched content and title')
}
}
} catch (e) {
console.error(`Error while retrieving page ${url}`, e);
console.error(`Error while retrieving page ${url}`, e)
// fallback to scrapingbee for non pdf content
if (url && contentType !== 'application/pdf') {
console.info('fallback to scrapingbee', url);
console.info('fallback to scrapingbee', url)
const fetchStartTime = Date.now();
const sbResult = await fetchContentWithScrapingBee(url);
content = sbResult.domContent;
title = sbResult.title;
const sbResult = await fetchContentWithScrapingBee(url)
content = sbResult.domContent
title = sbResult.title
} else {
throw e;
throw e
}
} finally {
// close browser context if it was opened
if (context) {
await context.close();
await context.close()
}
// save non pdf content
if (url && contentType !== 'application/pdf') {
// parse content if it is not empty
if (content) {
let document = parseHTML(content).document;
let document = parseHTML(content).document
// preParse content
const preParsedDom = await preParseContent(url, document)
if (preParsedDom) {
document = preParsedDom
}
readabilityResult = await getReadabilityResult(url, document);
readabilityResult = await getReadabilityResult(url, document)
}
}
console.info(`content-fetch result`, logRecord);
return { finalUrl, title, content, readabilityResult, contentType };
}
console.info(`content-fetch result`, logRecord)
}
function validateUrlString(url) {
const u = new URL(url);
return { finalUrl, title, content, readabilityResult, contentType }
}
function validateUrlString(url: string) {
const u = new URL(url)
// Make sure the URL is http or https
if (u.protocol !== 'http:' && u.protocol !== 'https:') {
throw new Error('Invalid URL protocol check failed')
@ -243,60 +279,75 @@ function validateUrlString(url) {
}
}
function tryParseUrl(urlStr) {
function tryParseUrl(urlStr: string) {
if (!urlStr) {
return null;
return null
}
// a regular expression to match all URLs
const regex = /(https?:\/\/[^\s]+)/g;
const regex = /(https?:\/\/[^\s]+)/g
const matches = urlStr.match(regex);
const matches = urlStr.match(regex)
if (matches) {
return matches[0]; // only return first match
return matches[0] // only return first match
} else {
return null;
return null
}
}
function getUrl(urlStr) {
function getUrl(urlStr: string) {
const url = tryParseUrl(urlStr)
if (!url) {
throw new Error('No URL specified');
throw new Error('No URL specified')
}
validateUrlString(url);
validateUrlString(url)
const parsed = Url.parse(url);
return parsed.href;
const parsed = Url.parse(url)
return parsed.href
}
async function retrievePage(url, logRecord, functionStartTime, locale, timezone) {
validateUrlString(url);
async function retrievePage(
url: string,
logRecord: Record<string, any>,
functionStartTime: number,
locale: string,
timezone: string
) {
validateUrlString(url)
const browser = await getBrowserPromise;
logRecord.timing = { ...logRecord.timing, browserOpened: Date.now() - functionStartTime };
const browser = await getBrowserPromise
logRecord.timing = {
...logRecord.timing,
browserOpened: Date.now() - functionStartTime,
}
const context = await browser.createIncognitoBrowserContext();
const context = await browser.createIncognitoBrowserContext()
const page = await context.newPage()
if (!enableJavascriptForUrl(url)) {
await page.setJavaScriptEnabled(false);
await page.setJavaScriptEnabled(false)
}
await page.setUserAgent(userAgentForUrl(url));
await page.setUserAgent(userAgentForUrl(url))
// set locale for the page
if (locale) {
await page.setExtraHTTPHeaders({ 'Accept-Language': locale });
await page.setExtraHTTPHeaders({ 'Accept-Language': locale })
}
// set timezone for the page
if (timezone) {
await page.emulateTimezone(timezone);
await page.emulateTimezone(timezone)
}
const client = await page.target().createCDPSession();
const client = await page.target().createCDPSession()
const downloadPath = path.resolve('./download_dir/')
await client.send('Page.setDownloadBehavior', {
behavior: 'allow',
downloadPath,
})
// intercept request when response headers was received
await client.send('Network.setRequestInterception', {
@ -307,107 +358,126 @@ async function retrievePage(url, logRecord, functionStartTime, locale, timezone)
interceptionStage: 'HeadersReceived',
},
],
});
const path = require('path');
const download_path = path.resolve('./download_dir/');
await client.send('Page.setDownloadBehavior', {
behavior: 'allow',
userDataDir: './',
downloadPath: download_path,
})
client.on('Network.requestIntercepted', async e => {
const headers = e.responseHeaders || {};
client.on(
'Network.requestIntercepted',
(e: Protocol.Network.RequestInterceptedEvent) => {
;(async () => {
const headers = e.responseHeaders || {}
const [contentType] = (headers['content-type'] || headers['Content-Type'] || '')
const [contentType] = (
headers['content-type'] ||
headers['Content-Type'] ||
''
)
.toLowerCase()
.split(';');
const obj = { interceptionId: e.interceptionId };
.split(';')
const obj: Protocol.Network.ContinueInterceptedRequestRequest = {
interceptionId: e.interceptionId,
}
if (e.responseStatusCode >= 200 && e.responseStatusCode < 300) {
if (
e.responseStatusCode &&
e.responseStatusCode >= 200 &&
e.responseStatusCode < 300
) {
// We only check content-type on success responses
// as it doesn't matter what the content type is for things
// like redirects
if (contentType && !ALLOWED_CONTENT_TYPES.includes(contentType)) {
obj['errorReason'] = 'BlockedByClient';
obj['errorReason'] = 'BlockedByClient'
}
}
try {
await client.send('Network.continueInterceptedRequest', obj);
// eslint-disable-next-line no-empty
} catch {}
});
await client.send('Network.continueInterceptedRequest', obj)
} catch {
// ignore
}
})()
}
)
/*
* Disallow MathJax from running in Puppeteer and modifying the document,
* we shall instead run it in our frontend application to transform any
* mathjax content when present.
*/
await page.setRequestInterception(true);
let requestCount = 0;
page.on('request', request => {
await page.setRequestInterception(true)
let requestCount = 0
page.on('request', (request) => {
;(async () => {
if (request.resourceType() === 'font') {
// Disallow fonts from loading
request.abort();
return;
return request.abort()
}
if (requestCount++ > 100) {
request.abort();
return;
return request.abort()
}
if (
request.resourceType() === 'script' &&
request.url().toLowerCase().indexOf('mathjax') > -1
) {
request.abort();
return
return request.abort()
}
request.continue();
});
await request.continue()
})()
})
// Puppeteer fails during download of PDf files,
// so record the failure and use those items
let lastPdfUrl = undefined;
page.on('response', response => {
let lastPdfUrl = undefined
page.on('response', (response) => {
if (response.headers()['content-type'] === 'application/pdf') {
lastPdfUrl = response.url();
lastPdfUrl = response.url()
}
});
})
try {
const response = await page.goto(url, { timeout: 30 * 1000, waitUntil: ['networkidle2'] });
const finalUrl = response.url();
const contentType = response.headers()['content-type'];
const response = await page.goto(url, {
timeout: 30 * 1000,
waitUntil: ['networkidle2'],
})
if (!response) {
throw new Error('No response from page')
}
logRecord.finalUrl = response.url();
logRecord.contentType = response.headers()['content-type'];
const finalUrl = response.url()
const contentType = response.headers()['content-type']
return { context, page, response, finalUrl, contentType };
logRecord.finalUrl = response.url()
logRecord.contentType = response.headers()['content-type']
return { context, page, response, finalUrl, contentType }
} catch (error) {
if (lastPdfUrl) {
return { context, page, finalUrl: lastPdfUrl, contentType: 'application/pdf' };
return {
context,
page,
finalUrl: lastPdfUrl,
contentType: 'application/pdf',
}
await context.close();
throw error;
}
await context.close()
throw error
}
}
async function retrieveHtml(page, logRecord) {
let domContent = '', title;
async function retrieveHtml(page: Page, logRecord: Record<string, any>) {
let domContent = '',
title
try {
title = await page.title();
logRecord.title = title;
title = await page.title()
logRecord.title = title
const pageScrollingStart = Date.now();
const pageScrollingStart = Date.now()
/* scroll with a 5 seconds timeout */
await Promise.race([
new Promise(resolve => {
(async function () {
try {
await page.evaluate(`(async () => {
await page
.evaluate(
`(async () => {
/* credit: https://github.com/puppeteer/puppeteer/issues/305 */
return new Promise((resolve, reject) => {
let scrollHeight = document.body.scrollHeight;
@ -422,46 +492,56 @@ async function retrieveHtml(page, logRecord) {
}
}, 10);
});
})()`);
} catch (e) {
logRecord.scrollError = true;
} finally {
resolve(true);
}
})();
})()`
)
.catch((e) => {
console.log('error scrolling page', e)
logRecord.scrollError = true
}),
page.waitForTimeout(5000),
]);
logRecord.timing = { ...logRecord.timing, pageScrolled: Date.now() - pageScrollingStart };
new Promise((r) => setTimeout(r, 5000)),
])
const iframes = {};
const urls = [];
const framesPromises = [];
const allowedUrls = /instagram\.com/gi;
logRecord.timing = {
...logRecord.timing,
pageScrolled: Date.now() - pageScrollingStart,
}
const iframes: Record<string, any> = {}
const urls: string[] = []
const framesPromises = []
const allowedUrls = /instagram\.com/gi
for (const frame of page.mainFrame().childFrames()) {
if (frame.url() && allowedUrls.test(frame.url())) {
urls.push(frame.url());
framesPromises.push(frame.evaluate(el => el.innerHTML, await frame.$('body')));
urls.push(frame.url())
framesPromises.push(
frame.evaluate((el) => el?.innerHTML, await frame.$('body'))
)
}
}
(await Promise.all(framesPromises)).forEach((frame, index) => (iframes[urls[index]] = frame));
;(await Promise.all(framesPromises)).forEach(
(frame, index) => (iframes[urls[index]] = frame)
)
const domContentCapturingStart = Date.now();
const domContentCapturingStart = Date.now()
// get document body with all hidden elements removed
domContent = await page.evaluate(iframes => {
const BI_SRC_REGEXP = /url\("(.+?)"\)/gi;
domContent = await page.evaluate((iframes) => {
const BI_SRC_REGEXP = /url\("(.+?)"\)/gi
Array.from(document.body.getElementsByTagName('*')).forEach(el => {
const style = window.getComputedStyle(el);
Array.from(document.body.getElementsByTagName('*')).forEach((el) => {
const style = window.getComputedStyle(el)
const src = el.getAttribute('src')
try {
// Removing blurred images since they are mostly the copies of lazy loaded ones
if (el.tagName && ['img', 'image'].includes(el.tagName.toLowerCase())) {
const filter = style.getPropertyValue('filter');
if (
el.tagName &&
['img', 'image'].includes(el.tagName.toLowerCase())
) {
const filter = style.getPropertyValue('filter')
if (filter && filter.startsWith('blur')) {
el.parentNode && el.parentNode.removeChild(el);
el.parentNode && el.parentNode.removeChild(el)
}
}
} catch (err) {
@ -469,69 +549,80 @@ async function retrieveHtml(page, logRecord) {
}
// convert all nodes with background image to img nodes
if (!['', 'none'].includes(style.getPropertyValue('background-image'))) {
const filter = style.getPropertyValue('filter');
if (
!['', 'none'].includes(style.getPropertyValue('background-image'))
) {
const filter = style.getPropertyValue('filter')
// avoiding image nodes with a blur effect creation
if (filter && filter.startsWith('blur')) {
el && el.parentNode && el.parentNode.removeChild(el);
el && el.parentNode && el.parentNode.removeChild(el)
} else {
const matchedSRC = BI_SRC_REGEXP.exec(style.getPropertyValue('background-image'));
const matchedSRC = BI_SRC_REGEXP.exec(
style.getPropertyValue('background-image')
)
// Using "g" flag with a regex we have to manually break down lastIndex to zero after every usage
// More details here: https://stackoverflow.com/questions/1520800/why-does-a-regexp-with-global-flag-give-wrong-results
BI_SRC_REGEXP.lastIndex = 0;
BI_SRC_REGEXP.lastIndex = 0
if (matchedSRC && matchedSRC[1] && !el.src) {
if (matchedSRC && matchedSRC[1] && !src) {
// Replacing element only of there are no content inside, b/c might remove important div with content.
// Article example: http://www.josiahzayner.com/2017/01/genetic-designer-part-i.html
// DIV with class "content-inner" has `url("https://resources.blogblog.com/blogblog/data/1kt/travel/bg_container.png")` background image.
if (!el.textContent) {
const img = document.createElement('img');
img.src = matchedSRC[1];
el && el.parentNode && el.parentNode.replaceChild(img, el);
const img = document.createElement('img')
img.src = matchedSRC[1]
el && el.parentNode && el.parentNode.replaceChild(img, el)
}
}
}
}
if (el.tagName === 'IFRAME') {
if (iframes[el.src]) {
const newNode = document.createElement('div');
newNode.className = 'omnivore-instagram-embed';
newNode.innerHTML = iframes[el.src];
el && el.parentNode && el.parentNode.replaceChild(newNode, el);
if (src && iframes[src]) {
const newNode = document.createElement('div')
newNode.className = 'omnivore-instagram-embed'
newNode.innerHTML = iframes[src]
el && el.parentNode && el.parentNode.replaceChild(newNode, el)
}
}
});
})
if (document.querySelector('[data-translate="managed_checking_msg"]') ||
document.getElementById('px-block-form-wrapper')) {
if (
document.querySelector('[data-translate="managed_checking_msg"]') ||
document.getElementById('px-block-form-wrapper')
) {
return 'IS_BLOCKED'
}
return document.documentElement.outerHTML;
}, iframes);
logRecord.puppeteerSuccess = true;
return document.documentElement.outerHTML
}, iframes)
logRecord.puppeteerSuccess = true
logRecord.timing = {
...logRecord.timing,
contenCaptured: Date.now() - domContentCapturingStart,
};
}
// [END puppeteer-block]
} catch (e) {
if (e instanceof Error) {
if (e.message.startsWith('net::ERR_BLOCKED_BY_CLIENT at ')) {
logRecord.blockedByClient = true;
logRecord.blockedByClient = true
} else {
logRecord.puppeteerSuccess = false;
logRecord.puppeteerSuccess = false
logRecord.puppeteerError = {
message: e.message,
stack: e.stack,
};
}
}
} else {
logRecord.puppeteerSuccess = false
logRecord.puppeteerError = e
}
}
if (domContent === 'IS_BLOCKED') {
return { isBlocked: true };
return { isBlocked: true }
}
return { domContent, title };
return { domContent, title }
}
// async function preview(req, res) {
@ -669,7 +760,7 @@ const DOM_PURIFY_CONFIG = {
],
}
function domPurifySanitizeHook(node, data) {
function domPurifySanitizeHook(node: Element, data: SanitizeElementHookEvent) {
if (data.tagName === 'iframe') {
const urlRegex = /^(https?:)?\/\/www\.youtube(-nocookie)?\.com\/embed\//i
const src = node.getAttribute('src') || ''
@ -688,7 +779,7 @@ function domPurifySanitizeHook(node, data) {
}
}
function getPurifiedContent(html) {
function getPurifiedContent(html: Document) {
const newWindow = parseHTML('')
const DOMPurify = createDOMPurify(newWindow)
DOMPurify.addHook('uponSanitizeElement', domPurifySanitizeHook)
@ -696,13 +787,16 @@ function getPurifiedContent(html) {
return parseHTML(clean).document
}
function signImageProxyUrl(url) {
function signImageProxyUrl(url: string) {
return encode(
crypto.createHmac('sha256', process.env.IMAGE_PROXY_SECRET).update(url).digest()
crypto
.createHmac('sha256', process.env.IMAGE_PROXY_SECRET || '')
.update(url)
.digest()
)
}
function createImageProxyUrl(url, width = 0, height = 0) {
function createImageProxyUrl(url: string, width = 0, height = 0) {
if (!process.env.IMAGE_PROXY_URL || !process.env.IMAGE_PROXY_SECRET) {
return url
}
@ -713,7 +807,7 @@ function createImageProxyUrl(url, width = 0, height = 0) {
return `${process.env.IMAGE_PROXY_URL}/${width}x${height},s${signature}/${url}`
}
async function getReadabilityResult(url, document) {
async function getReadabilityResult(url: string, document: Document) {
// First attempt to read the article as is.
// if that fails attempt to purify then read
const sources = [
@ -747,9 +841,3 @@ async function getReadabilityResult(url, document) {
return null
}
module.exports = {
fetchContent,
// preview,
};

View File

@ -0,0 +1,173 @@
// Type definitions for non-npm package mozilla-readability 0.2
// Project: https://github.com/mozilla/readability
// Definitions by: Charles Vandevoorde <https://github.com/charlesvdv>, Alex Wendland <https://github.com/awendland>
// Definitions: https://github.com/DefinitelyTyped/DefinitelyTyped
// TypeScript Version: 2.2
declare module '@omnivore/readability' {
/**
* A standalone version of the readability library used for Firefox Reader View.
*
* Note that isProbablyReaderable() was moved into a separate file in https://github.com/mozilla/readability/commit/2620542dd1e8380220d82afa97a2c283ae636e40
* and therefore is no longer part of the Readability class.
*/
class Readability {
/**
* ## Usage on the web
*
* To parse a document, you must create a new Readability object from a
* DOM document object, and then call parse(). Here's an example:
*
* ```js
* var article = new Readability(document).parse();
* ```
*
* If you're using Readability on the web, you will likely be able to
* use a document reference from elsewhere (e.g. fetched via XMLHttpRequest,
* in a same-origin <iframe> you have access to, etc.).
*
* ## Usage from node.js
*
* In node.js, you won't generally have a DOM document object. To obtain one, you can use external
* libraries like [jsdom](https://github.com/tmpvar/jsdom). While this repository contains a parser of
* its own (`JSDOMParser`), that is restricted to reading XML-compatible markup and therefore we do
* not recommend it for general use.
*
* If you're using `jsdom` to create a DOM object, you should ensure that the page doesn't run (page)
* scripts (avoid fetching remote resources etc.) as well as passing it the page's URI as the `url`
* property of the `options` object you pass the `JSDOM` constructor.
*
* ```js
* var JSDOM = require('jsdom').JSDOM;
* var doc = new JSDOM("<body>Here's a bunch of text</body>", {
* url: "https://www.example.com/the-page-i-got-the-source-from",
* });
* let reader = new Readability(doc.window.document);
* let article = reader.parse();
* ```
*/
constructor(doc: Document, options?: Readability.Options)
/**
* Runs readability.
*
* ## Workflow:
*
* 1. Prep the document by removing script tags, css, etc.
* 2. Build readability's DOM tree.
* 3. Grab the article content from the current dom tree.
* 4. Replace the current DOM tree with the new one.
* 5. Read peacefully.
*
* ## Additional notes:
*
* Readability's parse() works by modifying the DOM. This removes some
* elements in the web page. You could avoid this by passing the clone
* of the document object while creating a Readability object.
*
* ```js
* var documentClone = document.cloneNode(true);
* var article = new Readability(documentClone).parse();
* ```
*
* The response will be null if the processing failed (https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L2038)
*/
async parse(): Promise<Readability.ParseResult | null>
}
namespace Readability {
  /** Options accepted by the Readability constructor. All fields are optional. */
  interface Options {
    /**
     * Control whether log messages are sent to the console.
     */
    debug?: boolean
    /**
     * Set a maximum size on the documents that will be processed. This size is
     * checked before any parsing operations occur. If the number of elements in
     * the document exceeds this threshold then an Error will be thrown.
     *
     * See implementation details at https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L2019
     */
    maxElemsToParse?: number
    /**
     * The number of top candidate nodes to consider when analysing how tight
     * the competition is among content candidates (per the upstream
     * mozilla/readability README).
     */
    nbTopCandidates?: number
    /**
     * Minimum number of characters in the extracted textContent in order to
     * consider the article correctly identified. If the threshold is not met then
     * the extraction process will automatically run again with different flags.
     *
     * See implementation details at https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L1208
     *
     * Changed from wordThreshold in https://github.com/mozilla/readability/commit/3ff9a166fb27928f222c4c0722e730eda412658a
     */
    charThreshold?: number
    /**
     * parse() removes the class="" attribute from every element in the given
     * subtree, except those that match CLASSES_TO_PRESERVE and
     * the classesToPreserve array from the options object.
     */
    classesToPreserve?: string[]
    /**
     * By default Readability will strip all classes from the HTML elements in the
     * processed article. By setting this to `true` the classes will be retained.
     *
     * This is a blanket alternative to `classesToPreserve`.
     *
     * Added in https://github.com/mozilla/readability/commit/2982216913af2c66b0690e88606b03116553ad92
     */
    keepClasses?: boolean
    /**
     * NOTE(review): not documented in the upstream library — presumably the
     * source URL of the document, used to resolve relative links; confirm
     * against this fork's parser implementation.
     */
    url?: string
    /**
     * Function that converts a regular image url into an imageproxy url.
     * @param url the original image URL
     * @param width optional target width hint, in pixels
     * @param height optional target height hint, in pixels
     * @returns the proxied image URL
     */
    createImageProxyUrl?: (
      url: string,
      width?: number,
      height?: number
    ) => string
    /**
     * By default, Readability will clean all tables from the HTML elements in the
     * processed article. But newsletters in emails use tables to display their content.
     * By setting this to `true`, these tables will be retained.
     */
    keepTables?: boolean
    /**
     * NOTE(review): fork-specific flag — presumably skips the link-density
     * heuristic when scoring candidate nodes; confirm against the parser
     * implementation before relying on it.
     */
    ignoreLinkDensity?: boolean
  }
  /** Result object produced by Readability's parse(). */
  interface ParseResult {
    /** Article title */
    title: string
    /** Author metadata */
    byline?: string | null
    /** Content direction */
    dir?: string | null
    /** HTML string of processed article content */
    content: string
    /** non-HTML version of `content` */
    textContent: string
    /** Length of an article, in characters */
    length: number
    /** Article description, or short excerpt from the content */
    excerpt: string
    /** Article site name */
    siteName?: string | null
    /** Article site icon */
    siteIcon?: string | null
    /** Article preview image */
    previewImage?: string | null
    /** Article published date */
    publishedDate?: Date | null
    /**
     * NOTE(review): presumably the detected article language — confirm the
     * value format (e.g. BCP 47 tag) against the parser implementation.
     */
    language?: string | null
  }
}
export { Readability }
}

View File

@ -0,0 +1,3 @@
// Bootstrap @babel/register so mocha can execute TypeScript/JSX test files
// directly: every subsequent require of a .ts/.tsx/.js/.jsx file is
// transpiled on the fly.
require('@babel/register').default({
  extensions: ['.ts', '.tsx', '.js', '.jsx'],
})

View File

@ -1,9 +0,0 @@
// Placeholder spec so the mocha suite always contains at least one
// passing test.
const { expect } = require("chai");

describe('Stub test', function () {
  it('should pass', function () {
    expect(true).to.be.true
  })
})

View File

@ -0,0 +1,8 @@
import 'mocha'
import { expect } from 'chai'

// Placeholder spec: keeps the test runner green until real tests exist.
describe('stub test', function () {
  it('should pass', function () {
    expect(true).to.be.true
  })
})

View File

@ -0,0 +1,8 @@
{
"extends": "../../tsconfig.json",
"compilerOptions": {
"outDir": "build",
"rootDir": "."
},
"include": ["src"]
}

View File

@ -12416,13 +12416,6 @@ cron-parser@^4.6.0:
dependencies:
luxon "^3.2.1"
cross-fetch@3.1.5, cross-fetch@^3.0.6, cross-fetch@^3.1.5:
version "3.1.5"
resolved "https://registry.yarnpkg.com/cross-fetch/-/cross-fetch-3.1.5.tgz#e1389f44d9e7ba767907f7af8454787952ab534f"
integrity sha512-lvb1SBsI0Z7GDwmuid+mU3kWVBwTVUbe7S0H52yaaAdQOXq2YktTCZdlAcNKFzE6QtRz0snpw9bNiPeOIkkQvw==
dependencies:
node-fetch "2.6.7"
cross-fetch@4.0.0:
version "4.0.0"
resolved "https://registry.yarnpkg.com/cross-fetch/-/cross-fetch-4.0.0.tgz#f037aef1580bb3a1a35164ea2a848ba81b445983"
@ -12430,6 +12423,13 @@ cross-fetch@4.0.0:
dependencies:
node-fetch "^2.6.12"
cross-fetch@^3.0.6, cross-fetch@^3.1.5:
version "3.1.5"
resolved "https://registry.yarnpkg.com/cross-fetch/-/cross-fetch-3.1.5.tgz#e1389f44d9e7ba767907f7af8454787952ab534f"
integrity sha512-lvb1SBsI0Z7GDwmuid+mU3kWVBwTVUbe7S0H52yaaAdQOXq2YktTCZdlAcNKFzE6QtRz0snpw9bNiPeOIkkQvw==
dependencies:
node-fetch "2.6.7"
cross-spawn@^6.0.0:
version "6.0.5"
resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-6.0.5.tgz#4a5ec7c64dfae22c3a14124dbacdee846d80cbc4"
@ -13171,11 +13171,6 @@ detect-port@^1.3.0:
address "^1.0.1"
debug "^2.6.0"
devtools-protocol@0.0.1045489:
version "0.0.1045489"
resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1045489.tgz#f959ad560b05acd72d55644bc3fb8168a83abf28"
integrity sha512-D+PTmWulkuQW4D1NTiCRCFxF7pQPn0hgp4YyX4wAQ6xYXKOadSWPR3ENGDQ47MW/Ewc9v2rpC/UEEGahgBYpSQ==
devtools-protocol@0.0.1147663:
version "0.0.1147663"
resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1147663.tgz#4ec5610b39a6250d1f87e6b9c7e16688ed0ac78e"
@ -17239,14 +17234,6 @@ https-browserify@^1.0.0:
resolved "https://registry.yarnpkg.com/https-browserify/-/https-browserify-1.0.0.tgz#ec06c10e0a34c0f2faf199f7fd7fc78fffd03c73"
integrity sha1-7AbBDgo0wPL68Zn3/X/Hj//QPHM=
https-proxy-agent@5.0.1, https-proxy-agent@^5.0.0:
version "5.0.1"
resolved "https://registry.yarnpkg.com/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz#c59ef224a04fe8b754f3db0063a25ea30d0005d6"
integrity sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA==
dependencies:
agent-base "6"
debug "4"
https-proxy-agent@^4.0.0:
version "4.0.0"
resolved "https://registry.yarnpkg.com/https-proxy-agent/-/https-proxy-agent-4.0.0.tgz#702b71fb5520a132a66de1f67541d9e62154d82b"
@ -17255,6 +17242,14 @@ https-proxy-agent@^4.0.0:
agent-base "5"
debug "4"
https-proxy-agent@^5.0.0:
version "5.0.1"
resolved "https://registry.yarnpkg.com/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz#c59ef224a04fe8b754f3db0063a25ea30d0005d6"
integrity sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA==
dependencies:
agent-base "6"
debug "4"
https-proxy-agent@^7.0.0, https-proxy-agent@^7.0.1:
version "7.0.1"
resolved "https://registry.yarnpkg.com/https-proxy-agent/-/https-proxy-agent-7.0.1.tgz#0277e28f13a07d45c663633841e20a40aaafe0ab"
@ -24911,7 +24906,7 @@ proxy-from-env@1.0.0:
resolved "https://registry.yarnpkg.com/proxy-from-env/-/proxy-from-env-1.0.0.tgz#33c50398f70ea7eb96d21f7b817630a55791c7ee"
integrity sha1-M8UDmPcOp+uW0h97gXYwpVeRx+4=
proxy-from-env@1.1.0, proxy-from-env@^1.1.0:
proxy-from-env@^1.1.0:
version "1.1.0"
resolved "https://registry.yarnpkg.com/proxy-from-env/-/proxy-from-env-1.1.0.tgz#e102f16ca355424865755d2c9e8ea4f24d58c3e2"
integrity sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==
@ -25009,22 +25004,6 @@ pupa@^2.1.1:
dependencies:
escape-goat "^2.0.0"
puppeteer-core@^19.1.1:
version "19.1.1"
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-19.1.1.tgz#6416ff925a9cc78523c490482a17a2998f7c0626"
integrity sha512-jV26Ke0VFel4MoXLjqm50uAW2uwksTP6Md1tvtXqWqXM5FyboKI6E9YYJ1qEQilUAqlhgGq8xLN5+SL8bPz/kw==
dependencies:
cross-fetch "3.1.5"
debug "4.3.4"
devtools-protocol "0.0.1045489"
extract-zip "2.0.1"
https-proxy-agent "5.0.1"
proxy-from-env "1.1.0"
rimraf "3.0.2"
tar-fs "2.1.1"
unbzip2-stream "1.4.3"
ws "8.9.0"
puppeteer-core@^20.9.0:
version "20.9.0"
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-20.9.0.tgz#6f4b420001b64419deab38d398a4d9cd071040e6"
@ -26751,13 +26730,6 @@ rfdc@^1.3.0:
resolved "https://registry.yarnpkg.com/rfdc/-/rfdc-1.3.0.tgz#d0b7c441ab2720d05dc4cf26e01c89631d9da08b"
integrity sha512-V2hovdzFbOi77/WajaSMXk2OLm+xNIeQdMMuB7icj7bk6zi2F8GGAxigcnDFpJHbNyNcgyJDiP+8nOrY5cZGrA==
rimraf@3.0.2, rimraf@^3.0.0, rimraf@^3.0.2:
version "3.0.2"
resolved "https://registry.yarnpkg.com/rimraf/-/rimraf-3.0.2.tgz#f1a5402ba6220ad52cc1282bac1ae3aa49fd061a"
integrity sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==
dependencies:
glob "^7.1.3"
rimraf@^2.2.8, rimraf@^2.5.4, rimraf@^2.6.1, rimraf@^2.6.3:
version "2.7.1"
resolved "https://registry.yarnpkg.com/rimraf/-/rimraf-2.7.1.tgz#35797f13a7fdadc566142c29d4f07ccad483e3ec"
@ -26765,6 +26737,13 @@ rimraf@^2.2.8, rimraf@^2.5.4, rimraf@^2.6.1, rimraf@^2.6.3:
dependencies:
glob "^7.1.3"
rimraf@^3.0.0, rimraf@^3.0.2:
version "3.0.2"
resolved "https://registry.yarnpkg.com/rimraf/-/rimraf-3.0.2.tgz#f1a5402ba6220ad52cc1282bac1ae3aa49fd061a"
integrity sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==
dependencies:
glob "^7.1.3"
rimraf@^4.4.1:
version "4.4.1"
resolved "https://registry.yarnpkg.com/rimraf/-/rimraf-4.4.1.tgz#bd33364f67021c5b79e93d7f4fa0568c7c21b755"
@ -28533,16 +28512,6 @@ tapable@^2.0.0, tapable@^2.1.1, tapable@^2.2.0:
resolved "https://registry.yarnpkg.com/tapable/-/tapable-2.2.1.tgz#1967a73ef4060a82f12ab96af86d52fdb76eeca0"
integrity sha512-GNzQvQTOIP6RyTfE2Qxb8ZVlNmw0n88vp1szwWRimP02mnTsx3Wtn5qRdqY9w2XduFNUgvOwhNnQsjwCp+kqaQ==
tar-fs@2.1.1, tar-fs@^2.0.0:
version "2.1.1"
resolved "https://registry.yarnpkg.com/tar-fs/-/tar-fs-2.1.1.tgz#489a15ab85f1f0befabb370b7de4f9eb5cbe8784"
integrity sha512-V0r2Y9scmbDRLCNex/+hYzvp/zyYjvFbHPNgVTKfQvVrb6guiE/fxP+XblDNR011utopbkex2nM4dHNV6GDsng==
dependencies:
chownr "^1.1.1"
mkdirp-classic "^0.5.2"
pump "^3.0.0"
tar-stream "^2.1.4"
tar-fs@3.0.4, tar-fs@^3.0.4:
version "3.0.4"
resolved "https://registry.yarnpkg.com/tar-fs/-/tar-fs-3.0.4.tgz#a21dc60a2d5d9f55e0089ccd78124f1d3771dbbf"
@ -28552,6 +28521,16 @@ tar-fs@3.0.4, tar-fs@^3.0.4:
pump "^3.0.0"
tar-stream "^3.1.5"
tar-fs@^2.0.0:
version "2.1.1"
resolved "https://registry.yarnpkg.com/tar-fs/-/tar-fs-2.1.1.tgz#489a15ab85f1f0befabb370b7de4f9eb5cbe8784"
integrity sha512-V0r2Y9scmbDRLCNex/+hYzvp/zyYjvFbHPNgVTKfQvVrb6guiE/fxP+XblDNR011utopbkex2nM4dHNV6GDsng==
dependencies:
chownr "^1.1.1"
mkdirp-classic "^0.5.2"
pump "^3.0.0"
tar-stream "^2.1.4"
tar-stream@^2.1.4, tar-stream@~2.2.0:
version "2.2.0"
resolved "https://registry.yarnpkg.com/tar-stream/-/tar-stream-2.2.0.tgz#acad84c284136b060dc3faa64474aa9aebd77287"
@ -30935,11 +30914,6 @@ ws@8.13.0:
resolved "https://registry.yarnpkg.com/ws/-/ws-8.13.0.tgz#9a9fb92f93cf41512a0735c8f4dd09b8a1211cd0"
integrity sha512-x9vcZYTrFPC7aSIbj7sRCYo7L/Xb8Iy+pW0ng0wt2vCJv7M9HOMy0UoN3rr+IFC7hb7vXoqS+P9ktyLLLhO+LA==
ws@8.9.0:
version "8.9.0"
resolved "https://registry.yarnpkg.com/ws/-/ws-8.9.0.tgz#2a994bb67144be1b53fe2d23c53c028adeb7f45e"
integrity sha512-Ja7nszREasGaYUYCI2k4lCKIRTt+y7XuqVoHR44YpI49TtryyqbqvDMn5eqfW7e6HzTukDRIsXqzVHScqRcafg==
"ws@^5.2.0 || ^6.0.0 || ^7.0.0", ws@^7.3.1, ws@^7.4.6:
version "7.5.7"
resolved "https://registry.yarnpkg.com/ws/-/ws-7.5.7.tgz#9e0ac77ee50af70d58326ecff7e85eb3fa375e67"