rewrite puppeteer in typescript
This commit is contained in:
75
packages/content-fetch/item.js
Normal file
75
packages/content-fetch/item.js
Normal file
@ -0,0 +1,75 @@
|
||||
const { interfaces } = require('mocha');
|
||||
const { uploadPdf, sendSavePageMutation, sendCreateArticleMutation, sendImportStatusUpdate } = require('./api');
|
||||
|
||||
/**
 * Payload describing a fetched item that is ready to be saved for a user.
 */
interface Item {
  url: string;
  userId: string;
  contentType: string;
  articleSavingRequestId: string;
  state: string;
  labels: string[];
  source: string;
  folder: string;
  rssFeedUrl: string;
  savedAt: string;
  publishedAt: string;
  readabilityResult: string;
  // Title and raw HTML forwarded to the save-page mutation. Optional so that
  // callers which already build an Item without them keep type-checking.
  title?: string;
  content?: string;
}

/**
 * Saves a fetched item through the API.
 *
 * PDFs are uploaded first and then registered with the create-article
 * mutation; everything else goes through the save-page mutation.
 *
 * NOTE(review): the previous body also read `req`, `MAX_RETRY_COUNT`,
 * `logRecord` and `importStatus`, none of which are declared in this module,
 * so every call ended in a ReferenceError. Retry counting and import-status
 * reporting need the incoming request and are therefore left to the caller —
 * confirm that this is the intended split.
 *
 * @param item the item to save
 * @returns `true` when the item was saved or must not be retried (the API
 *          answered UNAUTHORIZED), `false` when saving failed
 */
exports.saveItem = async (item: Item): Promise<boolean> => {
  const {
    url,
    userId,
    contentType,
    articleSavingRequestId,
    state,
    labels,
    source,
    folder,
    rssFeedUrl,
    savedAt,
    publishedAt,
    readabilityResult,
    title,
    content,
  } = item;

  try {
    if (contentType === 'application/pdf') {
      const uploadFileId = await uploadPdf(url, userId, articleSavingRequestId);
      const uploadedPdf = await sendCreateArticleMutation(userId, {
        url: encodeURI(url),
        articleSavingRequestId,
        uploadFileId,
        state,
        labels,
        source,
        folder,
        rssFeedUrl,
        savedAt,
        publishedAt,
      });
      if (!uploadedPdf) {
        console.error('error while saving uploaded pdf', url);
        return false;
      }
      return true;
    }

    const apiResponse = await sendSavePageMutation(userId, {
      url,
      clientRequestId: articleSavingRequestId,
      title,
      originalContent: content,
      parseResult: readabilityResult,
      state,
      labels,
      rssFeedUrl,
      savedAt,
      publishedAt,
      source,
      folder,
    });
    if (!apiResponse) {
      console.error('error while saving page', url);
      return false;
    }
    if (apiResponse.error === 'UNAUTHORIZED') {
      // Nothing can be saved for this user, so report success to stop retries.
      console.log('user is deleted, do not retry', userId);
    }
    return true;
  } catch (error) {
    // Catch variables are `unknown`; narrow before reading `message`.
    console.error(
      'error while saving item',
      url,
      error instanceof Error ? error.message : error
    );
    return false;
  }
};
|
||||
@ -36,7 +36,7 @@
|
||||
"linkedom": "^0.14.16",
|
||||
"lodash": "^4.17.21",
|
||||
"luxon": "^3.0.4",
|
||||
"puppeteer-core": "^19.1.1",
|
||||
"puppeteer-core": "^20.9.0",
|
||||
"underscore": "^1.13.6",
|
||||
"uuid": "^9.0.0"
|
||||
},
|
||||
|
||||
2
packages/puppeteer-parse/.eslintignore
Normal file
2
packages/puppeteer-parse/.eslintignore
Normal file
@ -0,0 +1,2 @@
|
||||
node_modules/
|
||||
build/
|
||||
14
packages/puppeteer-parse/.eslintrc
Normal file
14
packages/puppeteer-parse/.eslintrc
Normal file
@ -0,0 +1,14 @@
|
||||
{
|
||||
"extends": "../../.eslintrc",
|
||||
"parserOptions": {
|
||||
"project": "tsconfig.json"
|
||||
},
|
||||
"rules": {
|
||||
"@typescript-eslint/no-floating-promises": [
|
||||
"error",
|
||||
{
|
||||
"ignoreIIFE": true
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
5
packages/puppeteer-parse/mocha-config.json
Normal file
5
packages/puppeteer-parse/mocha-config.json
Normal file
@ -0,0 +1,5 @@
|
||||
{
|
||||
"extension": ["ts"],
|
||||
"spec": "test/**/*.test.ts",
|
||||
"require": "test/babel-register.js"
|
||||
}
|
||||
@ -2,10 +2,14 @@
|
||||
"name": "@omnivore/puppeteer-parse",
|
||||
"version": "1.0.0",
|
||||
"description": "Accepts URL of the article and parses its content",
|
||||
"main": "index.js",
|
||||
"main": "build/src/index.js",
|
||||
"files": [
|
||||
"build/src"
|
||||
],
|
||||
"dependencies": {
|
||||
"@omnivore/content-handler": "1.0.0",
|
||||
"@omnivore/readability": "1.0.0",
|
||||
"axios": "^1.4.0",
|
||||
"crypto": "^1.0.1",
|
||||
"dompurify": "^2.4.1",
|
||||
"linkedom": "^0.14.9",
|
||||
@ -20,7 +24,10 @@
|
||||
"mocha": "^10.0.0"
|
||||
},
|
||||
"scripts": {
|
||||
"test": "mocha test/*.js"
|
||||
"test": "yarn mocha -r ts-node/register --config mocha-config.json",
|
||||
"test:typecheck": "tsc --noEmit",
|
||||
"lint": "eslint src --ext ts,js,tsx,jsx",
|
||||
"build": "tsc"
|
||||
},
|
||||
"volta": {
|
||||
"extends": "../../package.json"
|
||||
|
||||
@ -1,99 +1,106 @@
|
||||
/* eslint-disable no-undef */
|
||||
/* eslint-disable no-empty */
|
||||
/* eslint-disable @typescript-eslint/explicit-function-return-type */
|
||||
/* eslint-disable @typescript-eslint/no-var-requires */
|
||||
/* eslint-disable @typescript-eslint/no-require-imports */
|
||||
const { encode } = require("urlsafe-base64");
|
||||
const crypto = require("crypto");
|
||||
|
||||
const Url = require('url');
|
||||
const os = require('os');
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
|
||||
import { preHandleContent, preParseContent } from '@omnivore/content-handler'
|
||||
import { Readability } from '@omnivore/readability'
|
||||
import axios from 'axios'
|
||||
import crypto from 'crypto'
|
||||
import createDOMPurify, { SanitizeElementHookEvent } from 'dompurify'
|
||||
// const { Storage } = require('@google-cloud/storage');
|
||||
const { parseHTML } = require('linkedom');
|
||||
const { preHandleContent, preParseContent } = require("@omnivore/content-handler");
|
||||
const { Readability } = require("@omnivore/readability");
|
||||
|
||||
const puppeteer = require('puppeteer-extra');
|
||||
import { parseHTML } from 'linkedom'
|
||||
import path from 'path'
|
||||
import { Browser, BrowserContext, Page, Protocol } from 'puppeteer-core'
|
||||
import puppeteer from 'puppeteer-extra'
|
||||
import AdblockerPlugin from 'puppeteer-extra-plugin-adblocker'
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth'
|
||||
import Url from 'url'
|
||||
import { encode } from 'urlsafe-base64'
|
||||
|
||||
// Add stealth plugin to hide puppeteer usage
|
||||
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
|
||||
puppeteer.use(StealthPlugin());
|
||||
|
||||
puppeteer.use(StealthPlugin())
|
||||
// Add adblocker plugin to block all ads and trackers (saves bandwidth)
|
||||
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker');
|
||||
puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
|
||||
|
||||
const createDOMPurify = require("dompurify");
|
||||
puppeteer.use(AdblockerPlugin({ blockTrackers: true }))
|
||||
|
||||
// const storage = new Storage();
// const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS
//   ? process.env.ALLOWED_ORIGINS.split(',')
//   : []
// const previewBucket = process.env.PREVIEW_IMAGE_BUCKET ? storage.bucket(process.env.PREVIEW_IMAGE_BUCKET) : undefined;

// const filePath = `${os.tmpdir()}/previewImage.png`

// Android Chrome user agent that additionally identifies itself as Googlebot.
const MOBILE_USER_AGENT =
  'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
// NOTE(review): the three desktop user agents below are currently the same
// string; confirm whether the bot / non-bot variants are meant to differ.
const DESKTOP_USER_AGENT =
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
const BOT_DESKTOP_USER_AGENT =
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
const NON_BOT_DESKTOP_USER_AGENT =
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
// Hosts that are served NON_BOT_DESKTOP_USER_AGENT by userAgentForUrl.
const NON_BOT_HOSTS = ['bloomberg.com', 'forbes.com']
// Hosts for which enableJavascriptForUrl turns page JavaScript off.
const NON_SCRIPT_HOSTS = ['medium.com', 'fastcompany.com', 'fortelabs.com']

// Content types let through by the response interception; successful
// responses carrying any other content type are blocked.
const ALLOWED_CONTENT_TYPES = [
  'text/html',
  'application/octet-stream',
  'text/plain',
  'application/pdf',
]
// Timeout, in milliseconds, passed to axios for the ScrapingBee request.
const REQUEST_TIMEOUT = 30000
|
||||
|
||||
/**
 * Picks the user agent string to use when loading `url`.
 *
 * @param url absolute URL of the page about to be fetched
 * @returns NON_BOT_DESKTOP_USER_AGENT when the host is one of NON_BOT_HOSTS
 *          (or a subdomain of one), DESKTOP_USER_AGENT otherwise — including
 *          when `url` cannot be parsed
 */
const userAgentForUrl = (url: string): string => {
  try {
    const { hostname } = new URL(url)
    // Match the host itself or a real subdomain. A bare suffix test would
    // also accept unrelated domains such as `notforbes.com`.
    const isNonBotHost = NON_BOT_HOSTS.some(
      (host) => hostname === host || hostname.endsWith(`.${host}`)
    )
    if (isNonBotHost) {
      return NON_BOT_DESKTOP_USER_AGENT
    }
  } catch (e) {
    console.log('error getting user agent for url', url, e)
  }
  return DESKTOP_USER_AGENT
}
|
||||
|
||||
/**
 * Fetches `url` through the ScrapingBee HTTP API (JavaScript rendering off,
 * premium proxy, US exit) and parses the response with linkedom.
 *
 * Never throws: on any failure the error is logged and an empty document is
 * returned with the URL standing in for the title.
 *
 * @param url page to fetch
 * @returns the document title, its serialized HTML and the requested URL
 */
const fetchContentWithScrapingBee = async (url: string) => {
  try {
    const response = await axios.get('https://app.scrapingbee.com/api/v1', {
      params: {
        api_key: process.env.SCRAPINGBEE_API_KEY,
        url: url,
        render_js: 'false',
        premium_proxy: 'true',
        country_code: 'us',
      },
      timeout: REQUEST_TIMEOUT,
    })

    const dom = parseHTML(response.data).document
    return { title: dom.title, domContent: dom.documentElement.outerHTML, url }
  } catch (e) {
    // Log only the message: an axios error carries the request config, whose
    // params include the ScrapingBee API key.
    console.error(
      'error fetching with scrapingbee',
      e instanceof Error ? e.message : e
    )

    return { title: url, domContent: '', url }
  }
}
|
||||
|
||||
/**
 * Decides whether page JavaScript should stay enabled while loading `url`.
 *
 * @param url absolute URL of the page about to be fetched
 * @returns `false` when the host is one of NON_SCRIPT_HOSTS (or a subdomain
 *          of one), `true` otherwise — including when `url` cannot be parsed
 */
const enableJavascriptForUrl = (url: string): boolean => {
  try {
    const { hostname } = new URL(url)
    // Match the host itself or a real subdomain. A bare suffix test would
    // also accept unrelated domains such as `notmedium.com`.
    const isNonScriptHost = NON_SCRIPT_HOSTS.some(
      (host) => hostname === host || hostname.endsWith(`.${host}`)
    )
    if (isNonScriptHost) {
      return false
    }
  } catch (e) {
    console.log('error getting hostname for url', url, e)
  }
  return true
}
|
||||
|
||||
// launch Puppeteer
|
||||
const getBrowserPromise = (async () => {
|
||||
console.log("starting puppeteer browser")
|
||||
return puppeteer.launch({
|
||||
console.log('starting puppeteer browser')
|
||||
return (await puppeteer.launch({
|
||||
args: [
|
||||
'--allow-running-insecure-content',
|
||||
'--autoplay-policy=user-gesture-required',
|
||||
@ -123,112 +130,141 @@ const getBrowserPromise = (async () => {
|
||||
height: 1080,
|
||||
isLandscape: true,
|
||||
isMobile: false,
|
||||
width: 1920
|
||||
width: 1920,
|
||||
},
|
||||
executablePath: process.env.CHROMIUM_PATH,
|
||||
headless: !!process.env.LAUNCH_HEADLESS,
|
||||
timeout: 120000, // 2 minutes
|
||||
});
|
||||
})();
|
||||
})) as Browser
|
||||
})()
|
||||
|
||||
/**
 * Fetches a page and extracts its readable content.
 *
 * Steps, in order:
 *  1. normalize and validate `url`;
 *  2. give the custom content handlers a chance to supply url / title /
 *     content / content type up front;
 *  3. if title or content is still missing (and the item is not a PDF), load
 *     the page in the browser and capture its HTML, switching to ScrapingBee
 *     when the page reports itself as blocked;
 *  4. on any error, fall back to ScrapingBee for non-PDF content, otherwise
 *     rethrow;
 *  5. always close the browser context and, for non-PDF content, run the
 *     pre-parse hook and readability over whatever HTML was obtained.
 *
 * @param url      URL (or text containing a URL) of the page to fetch
 * @param locale   sent as `Accept-Language` when non-empty
 * @param timezone timezone emulated by the page when non-empty
 * @returns the final URL, title, raw HTML, readability result and content
 *          type; any of them may be undefined when it could not be determined
 * @throws when no valid URL is given, or when retrieval fails for PDF content
 */
export const fetchContent = async (
  url: string,
  locale: string,
  timezone: string
) => {
  const functionStartTime = Date.now()
  const logRecord = {
    url,
    functionStartTime,
    locale,
    timezone,
  }
  console.log(`content-fetch request`, logRecord)

  let context: BrowserContext | undefined,
    page: Page | undefined,
    finalUrl: string | undefined,
    title: string | undefined,
    content: string | undefined,
    contentType: string | undefined,
    readabilityResult: Readability.ParseResult | null | undefined
  try {
    url = getUrl(url)
    if (!url) {
      throw new Error('Valid URL to parse not specified')
    }

    // pre handle url with custom handlers
    try {
      const browser = await getBrowserPromise
      const result = await preHandleContent(url, browser)
      if (result && result.url) {
        // Validate the handler-supplied URL before adopting it; the current
        // `url` has already been validated by getUrl above.
        validateUrlString(result.url)
        url = result.url
      }
      if (result && result.title) {
        title = result.title
      }
      if (result && result.content) {
        content = result.content
      }
      if (result && result.contentType) {
        contentType = result.contentType
      }
    } catch (e) {
      console.info('error with handler: ', e)
    }

    if ((!content || !title) && contentType !== 'application/pdf') {
      const result = await retrievePage(
        url,
        logRecord,
        functionStartTime,
        locale,
        timezone
      )
      if (result && result.context) {
        context = result.context
      }
      if (result && result.page) {
        page = result.page
      }
      if (result && result.finalUrl) {
        finalUrl = result.finalUrl
      }
      if (result && result.contentType) {
        contentType = result.contentType
      }
    } else {
      finalUrl = url
    }

    if (contentType !== 'application/pdf') {
      if (page && (!content || !title)) {
        const result = await retrieveHtml(page, logRecord)
        if (result.isBlocked) {
          const sbResult = await fetchContentWithScrapingBee(url)
          title = sbResult.title
          content = sbResult.domContent
        } else {
          title = result.title
          content = result.domContent
        }
      } else {
        console.info('using prefetched content and title')
      }
    }
  } catch (e) {
    console.error(`Error while retrieving page ${url}`, e)

    // fallback to scrapingbee for non pdf content
    if (url && contentType !== 'application/pdf') {
      console.info('fallback to scrapingbee', url)

      const sbResult = await fetchContentWithScrapingBee(url)
      content = sbResult.domContent
      title = sbResult.title
    } else {
      throw e
    }
  } finally {
    // close browser context if it was opened
    if (context) {
      await context.close()
    }
    // save non pdf content
    if (url && contentType !== 'application/pdf') {
      // parse content if it is not empty
      if (content) {
        let document = parseHTML(content).document
        // preParse content
        const preParsedDom = await preParseContent(url, document)
        if (preParsedDom) {
          document = preParsedDom
        }
        readabilityResult = await getReadabilityResult(url, document)
      }
    }

    console.info(`content-fetch result`, logRecord)
  }

  // Returned after the `finally` block so that a rethrown error is not
  // replaced by a normal return value.
  return { finalUrl, title, content, readabilityResult, contentType }
}
|
||||
|
||||
function validateUrlString(url) {
|
||||
const u = new URL(url);
|
||||
function validateUrlString(url: string) {
|
||||
const u = new URL(url)
|
||||
// Make sure the URL is http or https
|
||||
if (u.protocol !== 'http:' && u.protocol !== 'https:') {
|
||||
throw new Error('Invalid URL protocol check failed')
|
||||
@ -243,60 +279,75 @@ function validateUrlString(url) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Extracts the first http(s) URL found in `urlStr`.
 *
 * A URL is taken to be `http://` or `https://` followed by a run of
 * non-whitespace characters.
 *
 * @param urlStr free-form text that may contain a URL
 * @returns the first URL found, or `null` when `urlStr` is empty or has none
 */
function tryParseUrl(urlStr: string): string | null {
  if (!urlStr) {
    return null
  }

  // Only the first match is used, so a non-global regex is sufficient.
  const firstMatch = /https?:\/\/[^\s]+/.exec(urlStr)
  return firstMatch ? firstMatch[0] : null
}
|
||||
|
||||
/**
 * Extracts the first URL from `urlStr`, validates it and returns the `href`
 * produced by `Url.parse`.
 *
 * @param urlStr free-form text expected to contain an http(s) URL
 * @returns the parsed URL's `href`
 * @throws Error('No URL specified') when no URL is found; errors raised by
 *         validateUrlString propagate unchanged
 */
function getUrl(urlStr: string): string {
  const url = tryParseUrl(urlStr)
  if (!url) {
    throw new Error('No URL specified')
  }

  validateUrlString(url)

  const parsed = Url.parse(url)
  return parsed.href
}
|
||||
|
||||
async function retrievePage(url, logRecord, functionStartTime, locale, timezone) {
|
||||
validateUrlString(url);
|
||||
async function retrievePage(
|
||||
url: string,
|
||||
logRecord: Record<string, any>,
|
||||
functionStartTime: number,
|
||||
locale: string,
|
||||
timezone: string
|
||||
) {
|
||||
validateUrlString(url)
|
||||
|
||||
const browser = await getBrowserPromise;
|
||||
logRecord.timing = { ...logRecord.timing, browserOpened: Date.now() - functionStartTime };
|
||||
const browser = await getBrowserPromise
|
||||
logRecord.timing = {
|
||||
...logRecord.timing,
|
||||
browserOpened: Date.now() - functionStartTime,
|
||||
}
|
||||
|
||||
const context = await browser.createIncognitoBrowserContext();
|
||||
const context = await browser.createIncognitoBrowserContext()
|
||||
const page = await context.newPage()
|
||||
|
||||
if (!enableJavascriptForUrl(url)) {
|
||||
await page.setJavaScriptEnabled(false);
|
||||
await page.setJavaScriptEnabled(false)
|
||||
}
|
||||
await page.setUserAgent(userAgentForUrl(url));
|
||||
await page.setUserAgent(userAgentForUrl(url))
|
||||
|
||||
// set locale for the page
|
||||
if (locale) {
|
||||
await page.setExtraHTTPHeaders({ 'Accept-Language': locale });
|
||||
await page.setExtraHTTPHeaders({ 'Accept-Language': locale })
|
||||
}
|
||||
|
||||
// set timezone for the page
|
||||
if (timezone) {
|
||||
await page.emulateTimezone(timezone);
|
||||
await page.emulateTimezone(timezone)
|
||||
}
|
||||
|
||||
const client = await page.target().createCDPSession();
|
||||
const client = await page.target().createCDPSession()
|
||||
|
||||
const downloadPath = path.resolve('./download_dir/')
|
||||
await client.send('Page.setDownloadBehavior', {
|
||||
behavior: 'allow',
|
||||
downloadPath,
|
||||
})
|
||||
|
||||
// intercept request when response headers was received
|
||||
await client.send('Network.setRequestInterception', {
|
||||
@ -307,107 +358,126 @@ async function retrievePage(url, logRecord, functionStartTime, locale, timezone)
|
||||
interceptionStage: 'HeadersReceived',
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const path = require('path');
|
||||
const download_path = path.resolve('./download_dir/');
|
||||
|
||||
await client.send('Page.setDownloadBehavior', {
|
||||
behavior: 'allow',
|
||||
userDataDir: './',
|
||||
downloadPath: download_path,
|
||||
})
|
||||
|
||||
client.on('Network.requestIntercepted', async e => {
|
||||
const headers = e.responseHeaders || {};
|
||||
client.on(
|
||||
'Network.requestIntercepted',
|
||||
(e: Protocol.Network.RequestInterceptedEvent) => {
|
||||
;(async () => {
|
||||
const headers = e.responseHeaders || {}
|
||||
|
||||
const [contentType] = (headers['content-type'] || headers['Content-Type'] || '')
|
||||
.toLowerCase()
|
||||
.split(';');
|
||||
const obj = { interceptionId: e.interceptionId };
|
||||
const [contentType] = (
|
||||
headers['content-type'] ||
|
||||
headers['Content-Type'] ||
|
||||
''
|
||||
)
|
||||
.toLowerCase()
|
||||
.split(';')
|
||||
const obj: Protocol.Network.ContinueInterceptedRequestRequest = {
|
||||
interceptionId: e.interceptionId,
|
||||
}
|
||||
|
||||
if (e.responseStatusCode >= 200 && e.responseStatusCode < 300) {
|
||||
// We only check content-type on success responses
|
||||
// as it doesn't matter what the content type is for things
|
||||
// like redirects
|
||||
if (contentType && !ALLOWED_CONTENT_TYPES.includes(contentType)) {
|
||||
obj['errorReason'] = 'BlockedByClient';
|
||||
}
|
||||
if (
|
||||
e.responseStatusCode &&
|
||||
e.responseStatusCode >= 200 &&
|
||||
e.responseStatusCode < 300
|
||||
) {
|
||||
// We only check content-type on success responses
|
||||
// as it doesn't matter what the content type is for things
|
||||
// like redirects
|
||||
if (contentType && !ALLOWED_CONTENT_TYPES.includes(contentType)) {
|
||||
obj['errorReason'] = 'BlockedByClient'
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
await client.send('Network.continueInterceptedRequest', obj)
|
||||
} catch {
|
||||
// ignore
|
||||
}
|
||||
})()
|
||||
}
|
||||
|
||||
try {
|
||||
await client.send('Network.continueInterceptedRequest', obj);
|
||||
// eslint-disable-next-line no-empty
|
||||
} catch {}
|
||||
});
|
||||
)
|
||||
|
||||
/*
|
||||
* Disallow MathJax from running in Puppeteer and modifying the document,
|
||||
* we shall instead run it in our frontend application to transform any
|
||||
* mathjax content when present.
|
||||
*/
|
||||
await page.setRequestInterception(true);
|
||||
let requestCount = 0;
|
||||
page.on('request', request => {
|
||||
if (request.resourceType() === 'font') {
|
||||
// Disallow fonts from loading
|
||||
request.abort();
|
||||
return;
|
||||
}
|
||||
if (requestCount++ > 100) {
|
||||
request.abort();
|
||||
return;
|
||||
}
|
||||
if (
|
||||
request.resourceType() === 'script' &&
|
||||
request.url().toLowerCase().indexOf('mathjax') > -1
|
||||
) {
|
||||
request.abort();
|
||||
return
|
||||
}
|
||||
request.continue();
|
||||
});
|
||||
* Disallow MathJax from running in Puppeteer and modifying the document,
|
||||
* we shall instead run it in our frontend application to transform any
|
||||
* mathjax content when present.
|
||||
*/
|
||||
await page.setRequestInterception(true)
|
||||
let requestCount = 0
|
||||
page.on('request', (request) => {
|
||||
;(async () => {
|
||||
if (request.resourceType() === 'font') {
|
||||
// Disallow fonts from loading
|
||||
return request.abort()
|
||||
}
|
||||
if (requestCount++ > 100) {
|
||||
return request.abort()
|
||||
}
|
||||
if (
|
||||
request.resourceType() === 'script' &&
|
||||
request.url().toLowerCase().indexOf('mathjax') > -1
|
||||
) {
|
||||
return request.abort()
|
||||
}
|
||||
|
||||
await request.continue()
|
||||
})()
|
||||
})
|
||||
|
||||
// Puppeteer fails during download of PDf files,
|
||||
// so record the failure and use those items
|
||||
let lastPdfUrl = undefined;
|
||||
page.on('response', response => {
|
||||
let lastPdfUrl = undefined
|
||||
page.on('response', (response) => {
|
||||
if (response.headers()['content-type'] === 'application/pdf') {
|
||||
lastPdfUrl = response.url();
|
||||
lastPdfUrl = response.url()
|
||||
}
|
||||
});
|
||||
})
|
||||
|
||||
try {
|
||||
const response = await page.goto(url, { timeout: 30 * 1000, waitUntil: ['networkidle2'] });
|
||||
const finalUrl = response.url();
|
||||
const contentType = response.headers()['content-type'];
|
||||
const response = await page.goto(url, {
|
||||
timeout: 30 * 1000,
|
||||
waitUntil: ['networkidle2'],
|
||||
})
|
||||
if (!response) {
|
||||
throw new Error('No response from page')
|
||||
}
|
||||
|
||||
logRecord.finalUrl = response.url();
|
||||
logRecord.contentType = response.headers()['content-type'];
|
||||
const finalUrl = response.url()
|
||||
const contentType = response.headers()['content-type']
|
||||
|
||||
return { context, page, response, finalUrl, contentType };
|
||||
logRecord.finalUrl = response.url()
|
||||
logRecord.contentType = response.headers()['content-type']
|
||||
|
||||
return { context, page, response, finalUrl, contentType }
|
||||
} catch (error) {
|
||||
if (lastPdfUrl) {
|
||||
return { context, page, finalUrl: lastPdfUrl, contentType: 'application/pdf' };
|
||||
return {
|
||||
context,
|
||||
page,
|
||||
finalUrl: lastPdfUrl,
|
||||
contentType: 'application/pdf',
|
||||
}
|
||||
}
|
||||
await context.close();
|
||||
throw error;
|
||||
await context.close()
|
||||
throw error
|
||||
}
|
||||
}
|
||||
|
||||
async function retrieveHtml(page, logRecord) {
|
||||
let domContent = '', title;
|
||||
async function retrieveHtml(page: Page, logRecord: Record<string, any>) {
|
||||
let domContent = '',
|
||||
title
|
||||
try {
|
||||
title = await page.title();
|
||||
logRecord.title = title;
|
||||
title = await page.title()
|
||||
logRecord.title = title
|
||||
|
||||
const pageScrollingStart = Date.now();
|
||||
const pageScrollingStart = Date.now()
|
||||
/* scroll with a 5 seconds timeout */
|
||||
await Promise.race([
|
||||
new Promise(resolve => {
|
||||
(async function () {
|
||||
try {
|
||||
await page.evaluate(`(async () => {
|
||||
await page
|
||||
.evaluate(
|
||||
`(async () => {
|
||||
/* credit: https://github.com/puppeteer/puppeteer/issues/305 */
|
||||
return new Promise((resolve, reject) => {
|
||||
let scrollHeight = document.body.scrollHeight;
|
||||
@ -422,46 +492,56 @@ async function retrieveHtml(page, logRecord) {
|
||||
}
|
||||
}, 10);
|
||||
});
|
||||
})()`);
|
||||
} catch (e) {
|
||||
logRecord.scrollError = true;
|
||||
} finally {
|
||||
resolve(true);
|
||||
}
|
||||
})();
|
||||
}),
|
||||
page.waitForTimeout(5000),
|
||||
]);
|
||||
logRecord.timing = { ...logRecord.timing, pageScrolled: Date.now() - pageScrollingStart };
|
||||
})()`
|
||||
)
|
||||
.catch((e) => {
|
||||
console.log('error scrolling page', e)
|
||||
logRecord.scrollError = true
|
||||
}),
|
||||
new Promise((r) => setTimeout(r, 5000)),
|
||||
])
|
||||
|
||||
const iframes = {};
|
||||
const urls = [];
|
||||
const framesPromises = [];
|
||||
const allowedUrls = /instagram\.com/gi;
|
||||
logRecord.timing = {
|
||||
...logRecord.timing,
|
||||
pageScrolled: Date.now() - pageScrollingStart,
|
||||
}
|
||||
|
||||
const iframes: Record<string, any> = {}
|
||||
const urls: string[] = []
|
||||
const framesPromises = []
|
||||
const allowedUrls = /instagram\.com/gi
|
||||
|
||||
for (const frame of page.mainFrame().childFrames()) {
|
||||
if (frame.url() && allowedUrls.test(frame.url())) {
|
||||
urls.push(frame.url());
|
||||
framesPromises.push(frame.evaluate(el => el.innerHTML, await frame.$('body')));
|
||||
urls.push(frame.url())
|
||||
framesPromises.push(
|
||||
frame.evaluate((el) => el?.innerHTML, await frame.$('body'))
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
(await Promise.all(framesPromises)).forEach((frame, index) => (iframes[urls[index]] = frame));
|
||||
;(await Promise.all(framesPromises)).forEach(
|
||||
(frame, index) => (iframes[urls[index]] = frame)
|
||||
)
|
||||
|
||||
const domContentCapturingStart = Date.now();
|
||||
const domContentCapturingStart = Date.now()
|
||||
// get document body with all hidden elements removed
|
||||
domContent = await page.evaluate(iframes => {
|
||||
const BI_SRC_REGEXP = /url\("(.+?)"\)/gi;
|
||||
domContent = await page.evaluate((iframes) => {
|
||||
const BI_SRC_REGEXP = /url\("(.+?)"\)/gi
|
||||
|
||||
Array.from(document.body.getElementsByTagName('*')).forEach(el => {
|
||||
const style = window.getComputedStyle(el);
|
||||
Array.from(document.body.getElementsByTagName('*')).forEach((el) => {
|
||||
const style = window.getComputedStyle(el)
|
||||
const src = el.getAttribute('src')
|
||||
|
||||
try {
|
||||
// Removing blurred images since they are mostly the copies of lazy loaded ones
|
||||
if (el.tagName && ['img', 'image'].includes(el.tagName.toLowerCase())) {
|
||||
const filter = style.getPropertyValue('filter');
|
||||
if (
|
||||
el.tagName &&
|
||||
['img', 'image'].includes(el.tagName.toLowerCase())
|
||||
) {
|
||||
const filter = style.getPropertyValue('filter')
|
||||
if (filter && filter.startsWith('blur')) {
|
||||
el.parentNode && el.parentNode.removeChild(el);
|
||||
el.parentNode && el.parentNode.removeChild(el)
|
||||
}
|
||||
}
|
||||
} catch (err) {
|
||||
@ -469,69 +549,80 @@ async function retrieveHtml(page, logRecord) {
|
||||
}
|
||||
|
||||
// convert all nodes with background image to img nodes
|
||||
if (!['', 'none'].includes(style.getPropertyValue('background-image'))) {
|
||||
const filter = style.getPropertyValue('filter');
|
||||
if (
|
||||
!['', 'none'].includes(style.getPropertyValue('background-image'))
|
||||
) {
|
||||
const filter = style.getPropertyValue('filter')
|
||||
// avoiding image nodes with a blur effect creation
|
||||
if (filter && filter.startsWith('blur')) {
|
||||
el && el.parentNode && el.parentNode.removeChild(el);
|
||||
el && el.parentNode && el.parentNode.removeChild(el)
|
||||
} else {
|
||||
const matchedSRC = BI_SRC_REGEXP.exec(style.getPropertyValue('background-image'));
|
||||
const matchedSRC = BI_SRC_REGEXP.exec(
|
||||
style.getPropertyValue('background-image')
|
||||
)
|
||||
// Using "g" flag with a regex we have to manually break down lastIndex to zero after every usage
|
||||
// More details here: https://stackoverflow.com/questions/1520800/why-does-a-regexp-with-global-flag-give-wrong-results
|
||||
BI_SRC_REGEXP.lastIndex = 0;
|
||||
BI_SRC_REGEXP.lastIndex = 0
|
||||
|
||||
if (matchedSRC && matchedSRC[1] && !el.src) {
|
||||
if (matchedSRC && matchedSRC[1] && !src) {
|
||||
// Replacing element only of there are no content inside, b/c might remove important div with content.
|
||||
// Article example: http://www.josiahzayner.com/2017/01/genetic-designer-part-i.html
|
||||
// DIV with class "content-inner" has `url("https://resources.blogblog.com/blogblog/data/1kt/travel/bg_container.png")` background image.
|
||||
if (!el.textContent) {
|
||||
const img = document.createElement('img');
|
||||
img.src = matchedSRC[1];
|
||||
el && el.parentNode && el.parentNode.replaceChild(img, el);
|
||||
const img = document.createElement('img')
|
||||
img.src = matchedSRC[1]
|
||||
el && el.parentNode && el.parentNode.replaceChild(img, el)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (el.tagName === 'IFRAME') {
|
||||
if (iframes[el.src]) {
|
||||
const newNode = document.createElement('div');
|
||||
newNode.className = 'omnivore-instagram-embed';
|
||||
newNode.innerHTML = iframes[el.src];
|
||||
el && el.parentNode && el.parentNode.replaceChild(newNode, el);
|
||||
if (src && iframes[src]) {
|
||||
const newNode = document.createElement('div')
|
||||
newNode.className = 'omnivore-instagram-embed'
|
||||
newNode.innerHTML = iframes[src]
|
||||
el && el.parentNode && el.parentNode.replaceChild(newNode, el)
|
||||
}
|
||||
}
|
||||
});
|
||||
})
|
||||
|
||||
if (document.querySelector('[data-translate="managed_checking_msg"]') ||
|
||||
document.getElementById('px-block-form-wrapper')) {
|
||||
if (
|
||||
document.querySelector('[data-translate="managed_checking_msg"]') ||
|
||||
document.getElementById('px-block-form-wrapper')
|
||||
) {
|
||||
return 'IS_BLOCKED'
|
||||
}
|
||||
|
||||
return document.documentElement.outerHTML;
|
||||
}, iframes);
|
||||
logRecord.puppeteerSuccess = true;
|
||||
return document.documentElement.outerHTML
|
||||
}, iframes)
|
||||
logRecord.puppeteerSuccess = true
|
||||
logRecord.timing = {
|
||||
...logRecord.timing,
|
||||
contenCaptured: Date.now() - domContentCapturingStart,
|
||||
};
|
||||
}
|
||||
|
||||
// [END puppeteer-block]
|
||||
} catch (e) {
|
||||
if (e.message.startsWith('net::ERR_BLOCKED_BY_CLIENT at ')) {
|
||||
logRecord.blockedByClient = true;
|
||||
if (e instanceof Error) {
|
||||
if (e.message.startsWith('net::ERR_BLOCKED_BY_CLIENT at ')) {
|
||||
logRecord.blockedByClient = true
|
||||
} else {
|
||||
logRecord.puppeteerSuccess = false
|
||||
logRecord.puppeteerError = {
|
||||
message: e.message,
|
||||
stack: e.stack,
|
||||
}
|
||||
}
|
||||
} else {
|
||||
logRecord.puppeteerSuccess = false;
|
||||
logRecord.puppeteerError = {
|
||||
message: e.message,
|
||||
stack: e.stack,
|
||||
};
|
||||
logRecord.puppeteerSuccess = false
|
||||
logRecord.puppeteerError = e
|
||||
}
|
||||
}
|
||||
if (domContent === 'IS_BLOCKED') {
|
||||
return { isBlocked: true };
|
||||
return { isBlocked: true }
|
||||
}
|
||||
return { domContent, title };
|
||||
return { domContent, title }
|
||||
}
|
||||
|
||||
// async function preview(req, res) {
|
||||
@ -669,7 +760,7 @@ const DOM_PURIFY_CONFIG = {
|
||||
],
|
||||
}
|
||||
|
||||
function domPurifySanitizeHook(node, data) {
|
||||
function domPurifySanitizeHook(node: Element, data: SanitizeElementHookEvent) {
|
||||
if (data.tagName === 'iframe') {
|
||||
const urlRegex = /^(https?:)?\/\/www\.youtube(-nocookie)?\.com\/embed\//i
|
||||
const src = node.getAttribute('src') || ''
|
||||
@ -688,7 +779,7 @@ function domPurifySanitizeHook(node, data) {
|
||||
}
|
||||
}
|
||||
|
||||
function getPurifiedContent(html) {
|
||||
function getPurifiedContent(html: Document) {
|
||||
const newWindow = parseHTML('')
|
||||
const DOMPurify = createDOMPurify(newWindow)
|
||||
DOMPurify.addHook('uponSanitizeElement', domPurifySanitizeHook)
|
||||
@ -696,13 +787,16 @@ function getPurifiedContent(html) {
|
||||
return parseHTML(clean).document
|
||||
}
|
||||
|
||||
function signImageProxyUrl(url) {
|
||||
function signImageProxyUrl(url: string) {
|
||||
return encode(
|
||||
crypto.createHmac('sha256', process.env.IMAGE_PROXY_SECRET).update(url).digest()
|
||||
crypto
|
||||
.createHmac('sha256', process.env.IMAGE_PROXY_SECRET || '')
|
||||
.update(url)
|
||||
.digest()
|
||||
)
|
||||
}
|
||||
|
||||
function createImageProxyUrl(url, width = 0, height = 0) {
|
||||
function createImageProxyUrl(url: string, width = 0, height = 0) {
|
||||
if (!process.env.IMAGE_PROXY_URL || !process.env.IMAGE_PROXY_SECRET) {
|
||||
return url
|
||||
}
|
||||
@ -713,7 +807,7 @@ function createImageProxyUrl(url, width = 0, height = 0) {
|
||||
return `${process.env.IMAGE_PROXY_URL}/${width}x${height},s${signature}/${url}`
|
||||
}
|
||||
|
||||
async function getReadabilityResult(url, document) {
|
||||
async function getReadabilityResult(url: string, document: Document) {
|
||||
// First attempt to read the article as is.
|
||||
// if that fails attempt to purify then read
|
||||
const sources = [
|
||||
@ -747,9 +841,3 @@ async function getReadabilityResult(url, document) {
|
||||
|
||||
return null
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
fetchContent,
|
||||
// preview,
|
||||
};
|
||||
|
||||
173
packages/puppeteer-parse/src/readability.d.ts
vendored
Normal file
173
packages/puppeteer-parse/src/readability.d.ts
vendored
Normal file
@ -0,0 +1,173 @@
|
||||
// Type definitions for non-npm package mozilla-readability 0.2
|
||||
// Project: https://github.com/mozilla/readability
|
||||
// Definitions by: Charles Vandevoorde <https://github.com/charlesvdv>, Alex Wendland <https://github.com/awendland>
|
||||
// Definitions: https://github.com/DefinitelyTyped/DefinitelyTyped
|
||||
// TypeScript Version: 2.2
|
||||
|
||||
declare module '@omnivore/readability' {
  /**
   * A standalone version of the readability library used for Firefox Reader View.
   *
   * Note that isProbablyReaderable() was moved into a separate file in https://github.com/mozilla/readability/commit/2620542dd1e8380220d82afa97a2c283ae636e40
   * and therefore is no longer part of the Readability class.
   */
  class Readability {
    /**
     * ## Usage on the web
     *
     * To parse a document, you must create a new Readability object from a
     * DOM document object, and then call parse(). Because parse() is typed
     * here as returning a Promise, the result must be awaited. Here's an example:
     *
     * ```js
     * const article = await new Readability(document).parse();
     * ```
     *
     * If you're using Readability on the web, you will likely be able to
     * use a document reference from elsewhere (e.g. fetched via XMLHttpRequest,
     * in a same-origin <iframe> you have access to, etc.).
     *
     * ## Usage from node.js
     *
     * In node.js, you won't generally have a DOM document object. To obtain one, you can use external
     * libraries like [jsdom](https://github.com/tmpvar/jsdom). While this repository contains a parser of
     * its own (`JSDOMParser`), that is restricted to reading XML-compatible markup and therefore we do
     * not recommend it for general use.
     *
     * If you're using `jsdom` to create a DOM object, you should ensure that the page doesn't run (page)
     * scripts (avoid fetching remote resources etc.) as well as passing it the page's URI as the `url`
     * property of the `options` object you pass the `JSDOM` constructor.
     *
     * ```js
     * const JSDOM = require('jsdom').JSDOM;
     * const doc = new JSDOM("<body>Here's a bunch of text</body>", {
     *   url: "https://www.example.com/the-page-i-got-the-source-from",
     * });
     * const reader = new Readability(doc.window.document);
     * const article = await reader.parse();
     * ```
     *
     * @param doc DOM document to extract the article from
     * @param options optional settings, see {@link Readability.Options}
     */
    constructor(doc: Document, options?: Readability.Options)

    /**
     * Runs readability.
     *
     * ## Workflow:
     *
     * 1. Prep the document by removing script tags, css, etc.
     * 2. Build readability's DOM tree.
     * 3. Grab the article content from the current dom tree.
     * 4. Replace the current DOM tree with the new one.
     * 5. Read peacefully.
     *
     * ## Additional notes:
     *
     * Readability's parse() works by modifying the DOM. This removes some
     * elements in the web page. You could avoid this by passing the clone
     * of the document object while creating a Readability object.
     *
     * ```js
     * const documentClone = document.cloneNode(true);
     * const article = await new Readability(documentClone).parse();
     * ```
     *
     * The `async` modifier is intentionally absent: it is not permitted on
     * ambient declarations; the Promise return type alone expresses that the
     * call is asynchronous.
     *
     * @returns a Promise of the parse result; it resolves to null if the processing failed
     * (https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L2038)
     */
    parse(): Promise<Readability.ParseResult | null>
  }

  namespace Readability {
    interface Options {
      /**
       * Control whether log messages are sent to the console
       */
      debug?: boolean

      /**
       * Set a maximum size on the documents that will be processed. This size is
       * checked before any parsing operations occur. If the number of elements in
       * the document exceeds this threshold then an Error will be thrown.
       *
       * See implementation details at https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L2019
       */
      maxElemsToParse?: number

      /**
       * NOTE(review): undocumented in the original typings. Presumably the number
       * of top-scoring candidate nodes kept while picking the article root —
       * confirm against the Readability implementation.
       */
      nbTopCandidates?: number

      /**
       * Minimum number of characters in the extracted textContent in order to
       * consider the article correctly identified. If the threshold is not met then
       * the extraction process will automatically run again with different flags.
       *
       * See implementation details at https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L1208
       *
       * Changed from wordThreshold in https://github.com/mozilla/readability/commit/3ff9a166fb27928f222c4c0722e730eda412658a
       */
      charThreshold?: number

      /**
       * parse() removes the class="" attribute from every element in the given
       * subtree, except those that match CLASSES_TO_PRESERVE and
       * the classesToPreserve array from the options object.
       */
      classesToPreserve?: string[]

      /**
       * By default Readability will strip all classes from the HTML elements in the
       * processed article. By setting this to `true` the classes will be retained.
       *
       * This is a blanket alternative to `classesToPreserve`.
       *
       * Added in https://github.com/mozilla/readability/commit/2982216913af2c66b0690e88606b03116553ad92
       */
      keepClasses?: boolean

      /**
       * NOTE(review): undocumented in the original typings. Presumably the URL of
       * the document being parsed — confirm against the Readability implementation.
       */
      url?: string

      /**
       * Function that converts a regular image URL into an image-proxy URL.
       * @param url original image URL
       * @param width optional width passed to the proxy
       * @param height optional height passed to the proxy
       * @returns the URL to use in place of the original image URL
       */
      createImageProxyUrl?: (
        url: string,
        width?: number,
        height?: number
      ) => string

      /**
       * By default, Readability will clean all tables from the HTML elements in the
       * processed article. But newsletters in emails use tables to display their content.
       * By setting this to `true`, these tables will be retained.
       */
      keepTables?: boolean

      /**
       * NOTE(review): undocumented in the original typings. Presumably disables
       * link-density based filtering of content — confirm against the
       * Readability implementation.
       */
      ignoreLinkDensity?: boolean
    }

    interface ParseResult {
      /** Article title */
      title: string
      /** Author metadata */
      byline?: string | null
      /** Content direction */
      dir?: string | null
      /** HTML string of processed article content */
      content: string
      /** non-HTML version of `content` */
      textContent: string
      /** Length of an article, in characters */
      length: number
      /** Article description, or short excerpt from the content */
      excerpt: string
      /** Article site name */
      siteName?: string | null
      /** Article site icon */
      siteIcon?: string | null
      /** Article preview image */
      previewImage?: string | null
      /** Article published date */
      publishedDate?: Date | null
      /** Article language. NOTE(review): format (e.g. language code) is not specified here — confirm. */
      language?: string | null
    }
  }

  export { Readability }
}
|
||||
3
packages/puppeteer-parse/test/babel-register.js
Normal file
3
packages/puppeteer-parse/test/babel-register.js
Normal file
@ -0,0 +1,3 @@
|
||||
// Install the @babel/register require hook for the listed file extensions.
const BABEL_EXTENSIONS = ['.ts', '.tsx', '.js', '.jsx']

require('@babel/register').default({ extensions: BABEL_EXTENSIONS })
|
||||
@ -1,9 +0,0 @@
|
||||
const chai = require("chai");
|
||||
|
||||
const expect = chai.expect;
|
||||
|
||||
describe('Stub test', () => {
|
||||
it('should pass', () => {
|
||||
expect(true).to.be.true
|
||||
})
|
||||
})
|
||||
8
packages/puppeteer-parse/test/stub.test.ts
Normal file
8
packages/puppeteer-parse/test/stub.test.ts
Normal file
@ -0,0 +1,8 @@
|
||||
import 'mocha'
|
||||
import { expect } from 'chai'
|
||||
|
||||
// Placeholder suite: a single always-passing case.
describe('stub test', function () {
  it('should pass', function () {
    // Explicit strict-equality form of the original `to.be.true` check.
    expect(true).to.equal(true)
  })
})
|
||||
8
packages/puppeteer-parse/tsconfig.json
Normal file
8
packages/puppeteer-parse/tsconfig.json
Normal file
@ -0,0 +1,8 @@
|
||||
{
|
||||
"extends": "./../../tsconfig.json",
|
||||
"compilerOptions": {
|
||||
"outDir": "build",
|
||||
"rootDir": "."
|
||||
},
|
||||
"include": ["src"]
|
||||
}
|
||||
92
yarn.lock
92
yarn.lock
@ -12416,13 +12416,6 @@ cron-parser@^4.6.0:
|
||||
dependencies:
|
||||
luxon "^3.2.1"
|
||||
|
||||
cross-fetch@3.1.5, cross-fetch@^3.0.6, cross-fetch@^3.1.5:
|
||||
version "3.1.5"
|
||||
resolved "https://registry.yarnpkg.com/cross-fetch/-/cross-fetch-3.1.5.tgz#e1389f44d9e7ba767907f7af8454787952ab534f"
|
||||
integrity sha512-lvb1SBsI0Z7GDwmuid+mU3kWVBwTVUbe7S0H52yaaAdQOXq2YktTCZdlAcNKFzE6QtRz0snpw9bNiPeOIkkQvw==
|
||||
dependencies:
|
||||
node-fetch "2.6.7"
|
||||
|
||||
cross-fetch@4.0.0:
|
||||
version "4.0.0"
|
||||
resolved "https://registry.yarnpkg.com/cross-fetch/-/cross-fetch-4.0.0.tgz#f037aef1580bb3a1a35164ea2a848ba81b445983"
|
||||
@ -12430,6 +12423,13 @@ cross-fetch@4.0.0:
|
||||
dependencies:
|
||||
node-fetch "^2.6.12"
|
||||
|
||||
cross-fetch@^3.0.6, cross-fetch@^3.1.5:
|
||||
version "3.1.5"
|
||||
resolved "https://registry.yarnpkg.com/cross-fetch/-/cross-fetch-3.1.5.tgz#e1389f44d9e7ba767907f7af8454787952ab534f"
|
||||
integrity sha512-lvb1SBsI0Z7GDwmuid+mU3kWVBwTVUbe7S0H52yaaAdQOXq2YktTCZdlAcNKFzE6QtRz0snpw9bNiPeOIkkQvw==
|
||||
dependencies:
|
||||
node-fetch "2.6.7"
|
||||
|
||||
cross-spawn@^6.0.0:
|
||||
version "6.0.5"
|
||||
resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-6.0.5.tgz#4a5ec7c64dfae22c3a14124dbacdee846d80cbc4"
|
||||
@ -13171,11 +13171,6 @@ detect-port@^1.3.0:
|
||||
address "^1.0.1"
|
||||
debug "^2.6.0"
|
||||
|
||||
devtools-protocol@0.0.1045489:
|
||||
version "0.0.1045489"
|
||||
resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1045489.tgz#f959ad560b05acd72d55644bc3fb8168a83abf28"
|
||||
integrity sha512-D+PTmWulkuQW4D1NTiCRCFxF7pQPn0hgp4YyX4wAQ6xYXKOadSWPR3ENGDQ47MW/Ewc9v2rpC/UEEGahgBYpSQ==
|
||||
|
||||
devtools-protocol@0.0.1147663:
|
||||
version "0.0.1147663"
|
||||
resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1147663.tgz#4ec5610b39a6250d1f87e6b9c7e16688ed0ac78e"
|
||||
@ -17239,14 +17234,6 @@ https-browserify@^1.0.0:
|
||||
resolved "https://registry.yarnpkg.com/https-browserify/-/https-browserify-1.0.0.tgz#ec06c10e0a34c0f2faf199f7fd7fc78fffd03c73"
|
||||
integrity sha1-7AbBDgo0wPL68Zn3/X/Hj//QPHM=
|
||||
|
||||
https-proxy-agent@5.0.1, https-proxy-agent@^5.0.0:
|
||||
version "5.0.1"
|
||||
resolved "https://registry.yarnpkg.com/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz#c59ef224a04fe8b754f3db0063a25ea30d0005d6"
|
||||
integrity sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA==
|
||||
dependencies:
|
||||
agent-base "6"
|
||||
debug "4"
|
||||
|
||||
https-proxy-agent@^4.0.0:
|
||||
version "4.0.0"
|
||||
resolved "https://registry.yarnpkg.com/https-proxy-agent/-/https-proxy-agent-4.0.0.tgz#702b71fb5520a132a66de1f67541d9e62154d82b"
|
||||
@ -17255,6 +17242,14 @@ https-proxy-agent@^4.0.0:
|
||||
agent-base "5"
|
||||
debug "4"
|
||||
|
||||
https-proxy-agent@^5.0.0:
|
||||
version "5.0.1"
|
||||
resolved "https://registry.yarnpkg.com/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz#c59ef224a04fe8b754f3db0063a25ea30d0005d6"
|
||||
integrity sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA==
|
||||
dependencies:
|
||||
agent-base "6"
|
||||
debug "4"
|
||||
|
||||
https-proxy-agent@^7.0.0, https-proxy-agent@^7.0.1:
|
||||
version "7.0.1"
|
||||
resolved "https://registry.yarnpkg.com/https-proxy-agent/-/https-proxy-agent-7.0.1.tgz#0277e28f13a07d45c663633841e20a40aaafe0ab"
|
||||
@ -24911,7 +24906,7 @@ proxy-from-env@1.0.0:
|
||||
resolved "https://registry.yarnpkg.com/proxy-from-env/-/proxy-from-env-1.0.0.tgz#33c50398f70ea7eb96d21f7b817630a55791c7ee"
|
||||
integrity sha1-M8UDmPcOp+uW0h97gXYwpVeRx+4=
|
||||
|
||||
proxy-from-env@1.1.0, proxy-from-env@^1.1.0:
|
||||
proxy-from-env@^1.1.0:
|
||||
version "1.1.0"
|
||||
resolved "https://registry.yarnpkg.com/proxy-from-env/-/proxy-from-env-1.1.0.tgz#e102f16ca355424865755d2c9e8ea4f24d58c3e2"
|
||||
integrity sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==
|
||||
@ -25009,22 +25004,6 @@ pupa@^2.1.1:
|
||||
dependencies:
|
||||
escape-goat "^2.0.0"
|
||||
|
||||
puppeteer-core@^19.1.1:
|
||||
version "19.1.1"
|
||||
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-19.1.1.tgz#6416ff925a9cc78523c490482a17a2998f7c0626"
|
||||
integrity sha512-jV26Ke0VFel4MoXLjqm50uAW2uwksTP6Md1tvtXqWqXM5FyboKI6E9YYJ1qEQilUAqlhgGq8xLN5+SL8bPz/kw==
|
||||
dependencies:
|
||||
cross-fetch "3.1.5"
|
||||
debug "4.3.4"
|
||||
devtools-protocol "0.0.1045489"
|
||||
extract-zip "2.0.1"
|
||||
https-proxy-agent "5.0.1"
|
||||
proxy-from-env "1.1.0"
|
||||
rimraf "3.0.2"
|
||||
tar-fs "2.1.1"
|
||||
unbzip2-stream "1.4.3"
|
||||
ws "8.9.0"
|
||||
|
||||
puppeteer-core@^20.9.0:
|
||||
version "20.9.0"
|
||||
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-20.9.0.tgz#6f4b420001b64419deab38d398a4d9cd071040e6"
|
||||
@ -26751,13 +26730,6 @@ rfdc@^1.3.0:
|
||||
resolved "https://registry.yarnpkg.com/rfdc/-/rfdc-1.3.0.tgz#d0b7c441ab2720d05dc4cf26e01c89631d9da08b"
|
||||
integrity sha512-V2hovdzFbOi77/WajaSMXk2OLm+xNIeQdMMuB7icj7bk6zi2F8GGAxigcnDFpJHbNyNcgyJDiP+8nOrY5cZGrA==
|
||||
|
||||
rimraf@3.0.2, rimraf@^3.0.0, rimraf@^3.0.2:
|
||||
version "3.0.2"
|
||||
resolved "https://registry.yarnpkg.com/rimraf/-/rimraf-3.0.2.tgz#f1a5402ba6220ad52cc1282bac1ae3aa49fd061a"
|
||||
integrity sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==
|
||||
dependencies:
|
||||
glob "^7.1.3"
|
||||
|
||||
rimraf@^2.2.8, rimraf@^2.5.4, rimraf@^2.6.1, rimraf@^2.6.3:
|
||||
version "2.7.1"
|
||||
resolved "https://registry.yarnpkg.com/rimraf/-/rimraf-2.7.1.tgz#35797f13a7fdadc566142c29d4f07ccad483e3ec"
|
||||
@ -26765,6 +26737,13 @@ rimraf@^2.2.8, rimraf@^2.5.4, rimraf@^2.6.1, rimraf@^2.6.3:
|
||||
dependencies:
|
||||
glob "^7.1.3"
|
||||
|
||||
rimraf@^3.0.0, rimraf@^3.0.2:
|
||||
version "3.0.2"
|
||||
resolved "https://registry.yarnpkg.com/rimraf/-/rimraf-3.0.2.tgz#f1a5402ba6220ad52cc1282bac1ae3aa49fd061a"
|
||||
integrity sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==
|
||||
dependencies:
|
||||
glob "^7.1.3"
|
||||
|
||||
rimraf@^4.4.1:
|
||||
version "4.4.1"
|
||||
resolved "https://registry.yarnpkg.com/rimraf/-/rimraf-4.4.1.tgz#bd33364f67021c5b79e93d7f4fa0568c7c21b755"
|
||||
@ -28533,16 +28512,6 @@ tapable@^2.0.0, tapable@^2.1.1, tapable@^2.2.0:
|
||||
resolved "https://registry.yarnpkg.com/tapable/-/tapable-2.2.1.tgz#1967a73ef4060a82f12ab96af86d52fdb76eeca0"
|
||||
integrity sha512-GNzQvQTOIP6RyTfE2Qxb8ZVlNmw0n88vp1szwWRimP02mnTsx3Wtn5qRdqY9w2XduFNUgvOwhNnQsjwCp+kqaQ==
|
||||
|
||||
tar-fs@2.1.1, tar-fs@^2.0.0:
|
||||
version "2.1.1"
|
||||
resolved "https://registry.yarnpkg.com/tar-fs/-/tar-fs-2.1.1.tgz#489a15ab85f1f0befabb370b7de4f9eb5cbe8784"
|
||||
integrity sha512-V0r2Y9scmbDRLCNex/+hYzvp/zyYjvFbHPNgVTKfQvVrb6guiE/fxP+XblDNR011utopbkex2nM4dHNV6GDsng==
|
||||
dependencies:
|
||||
chownr "^1.1.1"
|
||||
mkdirp-classic "^0.5.2"
|
||||
pump "^3.0.0"
|
||||
tar-stream "^2.1.4"
|
||||
|
||||
tar-fs@3.0.4, tar-fs@^3.0.4:
|
||||
version "3.0.4"
|
||||
resolved "https://registry.yarnpkg.com/tar-fs/-/tar-fs-3.0.4.tgz#a21dc60a2d5d9f55e0089ccd78124f1d3771dbbf"
|
||||
@ -28552,6 +28521,16 @@ tar-fs@3.0.4, tar-fs@^3.0.4:
|
||||
pump "^3.0.0"
|
||||
tar-stream "^3.1.5"
|
||||
|
||||
tar-fs@^2.0.0:
|
||||
version "2.1.1"
|
||||
resolved "https://registry.yarnpkg.com/tar-fs/-/tar-fs-2.1.1.tgz#489a15ab85f1f0befabb370b7de4f9eb5cbe8784"
|
||||
integrity sha512-V0r2Y9scmbDRLCNex/+hYzvp/zyYjvFbHPNgVTKfQvVrb6guiE/fxP+XblDNR011utopbkex2nM4dHNV6GDsng==
|
||||
dependencies:
|
||||
chownr "^1.1.1"
|
||||
mkdirp-classic "^0.5.2"
|
||||
pump "^3.0.0"
|
||||
tar-stream "^2.1.4"
|
||||
|
||||
tar-stream@^2.1.4, tar-stream@~2.2.0:
|
||||
version "2.2.0"
|
||||
resolved "https://registry.yarnpkg.com/tar-stream/-/tar-stream-2.2.0.tgz#acad84c284136b060dc3faa64474aa9aebd77287"
|
||||
@ -30935,11 +30914,6 @@ ws@8.13.0:
|
||||
resolved "https://registry.yarnpkg.com/ws/-/ws-8.13.0.tgz#9a9fb92f93cf41512a0735c8f4dd09b8a1211cd0"
|
||||
integrity sha512-x9vcZYTrFPC7aSIbj7sRCYo7L/Xb8Iy+pW0ng0wt2vCJv7M9HOMy0UoN3rr+IFC7hb7vXoqS+P9ktyLLLhO+LA==
|
||||
|
||||
ws@8.9.0:
|
||||
version "8.9.0"
|
||||
resolved "https://registry.yarnpkg.com/ws/-/ws-8.9.0.tgz#2a994bb67144be1b53fe2d23c53c028adeb7f45e"
|
||||
integrity sha512-Ja7nszREasGaYUYCI2k4lCKIRTt+y7XuqVoHR44YpI49TtryyqbqvDMn5eqfW7e6HzTukDRIsXqzVHScqRcafg==
|
||||
|
||||
"ws@^5.2.0 || ^6.0.0 || ^7.0.0", ws@^7.3.1, ws@^7.4.6:
|
||||
version "7.5.7"
|
||||
resolved "https://registry.yarnpkg.com/ws/-/ws-7.5.7.tgz#9e0ac77ee50af70d58326ecff7e85eb3fa375e67"
|
||||
|
||||
Reference in New Issue
Block a user