From cd3402b98a79a5a259aa23d52ed7fd68b436731d Mon Sep 17 00:00:00 2001
From: Hongbo Wu
Date: Mon, 15 Jan 2024 23:32:26 +0800
Subject: [PATCH] rewrite puppeteer-parse in TypeScript

---
 packages/content-fetch/item.js                |  75 +++
 packages/content-handler/package.json         |   2 +-
 packages/puppeteer-parse/.eslintignore        |   2 +
 packages/puppeteer-parse/.eslintrc            |  14 +
 packages/puppeteer-parse/mocha-config.json    |   5 +
 packages/puppeteer-parse/package.json         |  11 +-
 .../{index.js => src/index.ts}                | 606 ++++++++++--------
 packages/puppeteer-parse/src/readability.d.ts | 173 +++++
 .../puppeteer-parse/test/babel-register.js    |   3 +
 packages/puppeteer-parse/test/stub.test.js    |   9 -
 packages/puppeteer-parse/test/stub.test.ts    |   8 +
 packages/puppeteer-parse/tsconfig.json        |   8 +
 yarn.lock                                     |  92 +--
 13 files changed, 678 insertions(+), 330 deletions(-)
 create mode 100644 packages/content-fetch/item.js
 create mode 100644 packages/puppeteer-parse/.eslintignore
 create mode 100644 packages/puppeteer-parse/.eslintrc
 create mode 100644 packages/puppeteer-parse/mocha-config.json
 rename packages/puppeteer-parse/{index.js => src/index.ts} (54%)
 create mode 100644 packages/puppeteer-parse/src/readability.d.ts
 create mode 100644 packages/puppeteer-parse/test/babel-register.js
 delete mode 100644 packages/puppeteer-parse/test/stub.test.js
 create mode 100644 packages/puppeteer-parse/test/stub.test.ts
 create mode 100644 packages/puppeteer-parse/tsconfig.json

diff --git a/packages/content-fetch/item.js b/packages/content-fetch/item.js
new file mode 100644
index 000000000..5703358e2
--- /dev/null
+++ b/packages/content-fetch/item.js
@@ -0,0 +1,75 @@
+const { uploadPdf, sendSavePageMutation, sendCreateArticleMutation, sendImportStatusUpdate } = require('./api');
+
+// Assumed limit; the original constant lived in the service configuration.
+const MAX_RETRY_COUNT = 3;
+
+/**
+ * @typedef {Object} Item
+ * @property {string} url
+ * @property {string} userId
+ * @property {string} contentType
+ * @property {string} articleSavingRequestId
+ * @property {string} state
+ * @property {string[]} labels
+ * @property {string} source
+ * @property {string} folder
+ * @property {string} rssFeedUrl
+ * @property {string} savedAt
+ * @property {string} publishedAt
+ * @property {string} [title]
+ * @property {string} [content]
+ * @property {string} [readabilityResult]
+ */
+
+/**
+ * Saves a fetched item via the API, uploading PDFs separately.
+ * @param {Item} item
+ * @param {number} retryCount taken from the `x-cloudtasks-taskretrycount` header by the caller
+ * @returns {Promise<boolean>} false when the save failed and should be retried
+ */
+exports.saveItem = async (item, retryCount) => {
+  const { url, userId, contentType, articleSavingRequestId, state, labels, source, folder, rssFeedUrl, savedAt, publishedAt, title, content, readabilityResult } = item;
+  let importStatus;
+  try {
+    if (contentType === 'application/pdf') {
+      const uploadFileId = await uploadPdf(url, userId, articleSavingRequestId);
+      const uploadedPdf = await sendCreateArticleMutation(userId, {
+        url: encodeURI(url),
+        articleSavingRequestId,
+        uploadFileId,
+        state,
+        labels,
+        source,
+        folder,
+        rssFeedUrl,
+        savedAt,
+        publishedAt,
+      });
+      if (!uploadedPdf) {
+        console.error('error while saving uploaded pdf', url);
+        return false;
+      }
+    } else {
+      const apiResponse = await sendSavePageMutation(userId, {
+        url,
+        clientRequestId: articleSavingRequestId,
+        title,
+        originalContent: content,
+        parseResult: readabilityResult,
+        state,
+        labels,
+        rssFeedUrl,
+        savedAt,
+        publishedAt,
+        source,
+        folder,
+      });
+      if (!apiResponse) {
+        console.error('error while saving page', url);
+        return false;
+      }
+      if (apiResponse.error === 'UNAUTHORIZED') {
+        console.log('user is deleted, do not retry', userId);
+        return true;
+      }
+      importStatus = readabilityResult ? 'imported' : 'failed';
+    }
+    return true;
+  } catch (error) {
+    console.error('error while saving item', url, error);
+    return false;
+  } finally {
+    // mark import failed on the last failed retry
+    if (retryCount === MAX_RETRY_COUNT) {
+      console.log('max retry count reached');
+      importStatus = importStatus || 'failed';
+    }
+    if (importStatus) {
+      // report the final status back to the API (call signature assumed from ./api)
+      await sendImportStatusUpdate(userId, articleSavingRequestId, importStatus);
+    }
+  }
+}
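+
+// Usage sketch (hypothetical caller, not part of this patch): the HTTP handler
+// reads the Cloud Tasks retry header and lets a falsy result trigger a retry:
+//
+//   const retryCount = Number(req.headers['x-cloudtasks-taskretrycount'] || 0);
+//   const ok = await saveItem(item, retryCount);
+//   if (!ok) {
+//     return res.sendStatus(500); // a non-2xx response makes Cloud Tasks retry
+//   }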
diff --git a/packages/content-handler/package.json b/packages/content-handler/package.json
index ae5017f0a..bc6269811 100644
--- a/packages/content-handler/package.json
+++ b/packages/content-handler/package.json
@@ -36,7 +36,7 @@
     "linkedom": "^0.14.16",
     "lodash": "^4.17.21",
     "luxon": "^3.0.4",
-    "puppeteer-core": "^19.1.1",
+    "puppeteer-core": "^20.9.0",
     "underscore": "^1.13.6",
     "uuid": "^9.0.0"
   },
diff --git a/packages/puppeteer-parse/.eslintignore b/packages/puppeteer-parse/.eslintignore
new file mode 100644
index 000000000..b38db2f29
--- /dev/null
+++ b/packages/puppeteer-parse/.eslintignore
@@ -0,0 +1,2 @@
+node_modules/
+build/
diff --git a/packages/puppeteer-parse/.eslintrc b/packages/puppeteer-parse/.eslintrc
new file mode 100644
index 000000000..301be9795
--- /dev/null
+++ b/packages/puppeteer-parse/.eslintrc
@@ -0,0 +1,14 @@
+{
+  "extends": "../../.eslintrc",
+  "parserOptions": {
+    "project": "tsconfig.json"
+  },
+  "rules": {
+    "@typescript-eslint/no-floating-promises": [
+      "error",
+      {
+        "ignoreIIFE": true
+      }
+    ]
+  }
+}
diff --git a/packages/puppeteer-parse/mocha-config.json b/packages/puppeteer-parse/mocha-config.json
new file mode 100644
index 000000000..44d1d24c1
--- /dev/null
+++ b/packages/puppeteer-parse/mocha-config.json
@@ -0,0 +1,5 @@
+{
+  "extension": ["ts"],
+  "spec": "test/**/*.test.ts",
+  "require": "test/babel-register.js"
+}
\ No newline at end of file
diff --git a/packages/puppeteer-parse/package.json b/packages/puppeteer-parse/package.json
index ab6fadfd8..87e2943d5 100644
--- a/packages/puppeteer-parse/package.json
+++ b/packages/puppeteer-parse/package.json
@@ -2,10 +2,14 @@
   "name": "@omnivore/puppeteer-parse",
   "version": "1.0.0",
   "description": "Accepts URL of the article and parses its content",
-  "main": "index.js",
+  "main": "build/src/index.js",
+  "files": [
+    "build/src"
+  ],
   "dependencies": {
     "@omnivore/content-handler": "1.0.0",
     "@omnivore/readability": "1.0.0",
+    "axios": "^1.4.0",
     "crypto": "^1.0.1",
     "dompurify": "^2.4.1",
     "linkedom": "^0.14.9",
@@ -20,7 +24,10 @@
     "mocha": "^10.0.0"
   },
   "scripts": {
-    "test": "mocha test/*.js"
+    "test": "yarn mocha -r ts-node/register --config mocha-config.json",
+    "test:typecheck": "tsc --noEmit",
+    "lint": "eslint src --ext ts,js,tsx,jsx",
+    "build": "tsc"
   },
   "volta": {
     "extends": "../../package.json"
diff --git a/packages/puppeteer-parse/index.js b/packages/puppeteer-parse/src/index.ts
similarity index 54%
rename from packages/puppeteer-parse/index.js
rename to packages/puppeteer-parse/src/index.ts
index 99af85cbe..fc2c4f5f4 100644
--- a/packages/puppeteer-parse/index.js
+++ b/packages/puppeteer-parse/src/index.ts
@@ -1,99 +1,106 @@
-/* eslint-disable no-undef */
-/* eslint-disable no-empty */
-/* eslint-disable @typescript-eslint/explicit-function-return-type */
-/* eslint-disable @typescript-eslint/no-var-requires */
-/* eslint-disable @typescript-eslint/no-require-imports */
-const { encode } = require("urlsafe-base64");
-const crypto = require("crypto");
-
-const Url = require('url');
-const os = require('os');
+/* eslint-disable @typescript-eslint/no-unsafe-member-access */
+/* eslint-disable @typescript-eslint/no-unsafe-assignment */
+import { preHandleContent, preParseContent } from '@omnivore/content-handler'
+import { Readability } from '@omnivore/readability'
+import axios from 'axios'
+import crypto from 'crypto'
+import createDOMPurify, { SanitizeElementHookEvent } from 'dompurify'
 // const { Storage } = require('@google-cloud/storage');
-const { parseHTML } = require('linkedom');
-const { preHandleContent, preParseContent } = require("@omnivore/content-handler");
-const { Readability } = require("@omnivore/readability");
-
-const puppeteer = require('puppeteer-extra');
+import { parseHTML } from 'linkedom'
+import path from 'path'
+import { Browser, BrowserContext, Page, Protocol } from 'puppeteer-core'
+import puppeteer from 'puppeteer-extra'
+import AdblockerPlugin from 'puppeteer-extra-plugin-adblocker'
+import StealthPlugin from 'puppeteer-extra-plugin-stealth'
+import Url from 'url'
+import { encode } from 'urlsafe-base64'
 
 // Add stealth plugin to hide puppeteer usage
-const StealthPlugin = require('puppeteer-extra-plugin-stealth');
-puppeteer.use(StealthPlugin());
-
+puppeteer.use(StealthPlugin())
 // Add adblocker plugin to block all ads and trackers (saves bandwidth)
-const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker');
-puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
-
-const createDOMPurify = require("dompurify");
+puppeteer.use(AdblockerPlugin({ blockTrackers: true }))
 
 // const storage = new Storage();
-const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : [];
+// const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS
+//   ? process.env.ALLOWED_ORIGINS.split(',')
+//   : []
 // const previewBucket = process.env.PREVIEW_IMAGE_BUCKET ? storage.bucket(process.env.PREVIEW_IMAGE_BUCKET) : undefined;
-const filePath = `${os.tmpdir()}/previewImage.png`;
+// const filePath = `${os.tmpdir()}/previewImage.png`
 
-const MOBILE_USER_AGENT = 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
-const DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
-const BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
-const NON_BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
+const MOBILE_USER_AGENT =
+  'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
+const DESKTOP_USER_AGENT =
+  'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
+const BOT_DESKTOP_USER_AGENT =
+  'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
+const NON_BOT_DESKTOP_USER_AGENT =
+  'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
 
 const NON_BOT_HOSTS = ['bloomberg.com', 'forbes.com']
-const NON_SCRIPT_HOSTS= ['medium.com', 'fastcompany.com', 'fortelabs.com'];
+const NON_SCRIPT_HOSTS = ['medium.com', 'fastcompany.com', 'fortelabs.com']
 
-const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/plain', 'application/pdf'];
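+// Responses with any other content-type are cancelled at the CDP interception
+// layer (see the Network.requestIntercepted handler below) before their
+// bodies are downloaded.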
+const ALLOWED_CONTENT_TYPES = [
+  'text/html',
+  'application/octet-stream',
+  'text/plain',
+  'application/pdf',
+]
+const REQUEST_TIMEOUT = 30000
 
-const userAgentForUrl = (url) => {
+const userAgentForUrl = (url: string) => {
   try {
-    const u = new URL(url);
+    const u = new URL(url)
     for (const host of NON_BOT_HOSTS) {
       if (u.hostname.endsWith(host)) {
-        return NON_BOT_DESKTOP_USER_AGENT;
+        return NON_BOT_DESKTOP_USER_AGENT
       }
     }
   } catch (e) {
     console.log('error getting user agent for url', url, e)
   }
   return DESKTOP_USER_AGENT
-};
+}
 
-const fetchContentWithScrapingBee = async (url) => {
+const fetchContentWithScrapingBee = async (url: string) => {
   try {
     const response = await axios.get('https://app.scrapingbee.com/api/v1', {
       params: {
-        'api_key': process.env.SCRAPINGBEE_API_KEY,
-        'url': url,
-        'render_js': 'false',
-        'premium_proxy': 'true',
-        'country_code':'us'
+        api_key: process.env.SCRAPINGBEE_API_KEY,
+        url: url,
+        render_js: 'false',
+        premium_proxy: 'true',
+        country_code: 'us',
       },
       timeout: REQUEST_TIMEOUT,
     })
-
-    const dom = parseHTML(response.data).document;
+
+    const dom = parseHTML(response.data).document
     return { title: dom.title, domContent: dom.documentElement.outerHTML, url }
   } catch (e) {
-    console.error('error fetching with scrapingbee', e.message)
+    console.error('error fetching with scrapingbee', e)
 
     return { title: url, domContent: '', url }
   }
 }
 
-const enableJavascriptForUrl = (url) => {
+const enableJavascriptForUrl = (url: string) => {
   try {
-    const u = new URL(url);
+    const u = new URL(url)
     for (const host of NON_SCRIPT_HOSTS) {
       if (u.hostname.endsWith(host)) {
-        return false;
+        return false
       }
     }
   } catch (e) {
     console.log('error getting hostname for url', url, e)
   }
   return true
-};
+}
 
 // launch Puppeteer
 const getBrowserPromise = (async () => {
-  console.log("starting puppeteer browser")
-  return puppeteer.launch({
+  console.log('starting puppeteer browser')
+  return (await puppeteer.launch({
     args: [
       '--allow-running-insecure-content',
       '--autoplay-policy=user-gesture-required',
@@ -123,112 +130,141 @@ const getBrowserPromise = (async () => {
       height: 1080,
       isLandscape: true,
       isMobile: false,
-      width: 1920
+      width: 1920,
     },
     executablePath: process.env.CHROMIUM_PATH,
     headless: !!process.env.LAUNCH_HEADLESS,
     timeout: 120000, // 2 minutes
-  });
-})();
+  })) as Browser
+})()
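+
+// The browser is launched once per process and shared by every fetch; each
+// request gets its own incognito context in retrievePage, so cookies and
+// storage never leak between jobs.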
 
-async function fetchContent(url, locale, timezone) {
-  let functionStartTime = Date.now();
-  let logRecord = {
+export const fetchContent = async (
+  url: string,
+  locale: string,
+  timezone: string
+) => {
+  const functionStartTime = Date.now()
+  const logRecord = {
     url,
     functionStartTime,
     locale,
     timezone,
   }
 
-  console.log(`content-fetch request`, logRecord);
+  console.log(`content-fetch request`, logRecord)
 
-  let context, page, finalUrl, title, content, contentType, readabilityResult = null;
+  let context: BrowserContext | undefined,
+    page: Page | undefined,
+    finalUrl: string | undefined,
+    title: string | undefined,
+    content: string | undefined,
+    contentType: string | undefined,
+    readabilityResult: Readability.ParseResult | null | undefined
 
   try {
-    url = getUrl(url);
+    url = getUrl(url)
     if (!url) {
-      throw new Error('Valid URL to parse not specified');
+      throw new Error('Valid URL to parse not specified')
    }
 
     // pre handle url with custom handlers
     try {
-      const browser = await getBrowserPromise;
-      const result = await preHandleContent(url, browser);
+      const browser = await getBrowserPromise
+      const result = await preHandleContent(url, browser)
       if (result && result.url) {
-        validateUrlString(url);
-        url = result.url;
+        validateUrlString(url)
+        url = result.url
+      }
+      if (result && result.title) {
+        title = result.title
+      }
+      if (result && result.content) {
+        content = result.content
+      }
+      if (result && result.contentType) {
+        contentType = result.contentType
       }
-      if (result && result.title) { title = result.title }
-      if (result && result.content) { content = result.content }
-      if (result && result.contentType) { contentType = result.contentType }
     } catch (e) {
-      console.info('error with handler: ', e);
+      console.info('error with handler: ', e)
     }
 
     if ((!content || !title) && contentType !== 'application/pdf') {
-      const result = await retrievePage(url, logRecord, functionStartTime, locale, timezone);
-      if (result && result.context) { context = result.context }
-      if (result && result.page) { page = result.page }
-      if (result && result.finalUrl) { finalUrl = result.finalUrl }
-      if (result && result.contentType) { contentType = result.contentType }
+      const result = await retrievePage(
+        url,
+        logRecord,
+        functionStartTime,
+        locale,
+        timezone
+      )
+      if (result && result.context) {
+        context = result.context
+      }
+      if (result && result.page) {
+        page = result.page
+      }
+      if (result && result.finalUrl) {
+        finalUrl = result.finalUrl
+      }
+      if (result && result.contentType) {
+        contentType = result.contentType
+      }
     } else {
       finalUrl = url
     }
 
     if (contentType !== 'application/pdf') {
-      if (!content || !title) {
-        const result = await retrieveHtml(page, logRecord);
+      if (page && (!content || !title)) {
+        const result = await retrieveHtml(page, logRecord)
         if (result.isBlocked) {
           const sbResult = await fetchContentWithScrapingBee(url)
           title = sbResult.title
           content = sbResult.domContent
         } else {
-          title = result.title;
-          content = result.domContent;
+          title = result.title
+          content = result.domContent
         }
       } else {
-        console.info('using prefetched content and title');
+        console.info('using prefetched content and title')
      }
    }
  } catch (e) {
-    console.error(`Error while retrieving page ${url}`, e);
+    console.error(`Error while retrieving page ${url}`, e)
 
     // fallback to scrapingbee for non pdf content
     if (url && contentType !== 'application/pdf') {
-      console.info('fallback to scrapingbee', url);
+      console.info('fallback to scrapingbee', url)
 
-      const fetchStartTime = Date.now();
-      const sbResult = await fetchContentWithScrapingBee(url);
-      content = sbResult.domContent;
-      title = sbResult.title;
+      const sbResult = await fetchContentWithScrapingBee(url)
+      content = sbResult.domContent
+      title = sbResult.title
     } else {
-      throw e;
+      throw e
     }
   } finally {
     // close browser context if it was opened
     if (context) {
-      await context.close();
+      await context.close()
     }
 
     // save non pdf content
     if (url && contentType !== 'application/pdf') {
       // parse content if it is not empty
       if (content) {
-        let document = parseHTML(content).document;
+        let document = parseHTML(content).document
         // preParse content
         const preParsedDom = await preParseContent(url, document)
         if (preParsedDom) {
           document = preParsedDom
         }
 
-        readabilityResult = await getReadabilityResult(url, document);
+        readabilityResult = await getReadabilityResult(url, document)
       }
     }
 
-    console.info(`content-fetch result`, logRecord);
-
-    return { finalUrl, title, content, readabilityResult, contentType };
+    console.info(`content-fetch result`, logRecord)
   }
+
+  return { finalUrl, title, content, readabilityResult, contentType }
 }
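+
+// Usage sketch (assumed caller, e.g. the content-fetch service):
+//
+//   import { fetchContent } from '@omnivore/puppeteer-parse'
+//
+//   const { finalUrl, title, content, contentType, readabilityResult } =
+//     await fetchContent('https://example.com/article', 'en-US', 'America/New_York')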
 
-function validateUrlString(url) {
-  const u = new URL(url);
+function validateUrlString(url: string) {
+  const u = new URL(url)
   // Make sure the URL is http or https
   if (u.protocol !== 'http:' && u.protocol !== 'https:') {
     throw new Error('Invalid URL protocol check failed')
   }
@@ -243,60 +279,75 @@ function validateUrlString(url) {
   }
 }
 
-function tryParseUrl(urlStr) {
+function tryParseUrl(urlStr: string) {
   if (!urlStr) {
-    return null;
+    return null
   }
-  
+
   // a regular expression to match all URLs
-  const regex = /(https?:\/\/[^\s]+)/g;
-  
-  const matches = urlStr.match(regex);
-  
+  const regex = /(https?:\/\/[^\s]+)/g
+
+  const matches = urlStr.match(regex)
+
   if (matches) {
-    return matches[0]; // only return first match
+    return matches[0] // only return first match
   } else {
-    return null;
+    return null
   }
 }
 
-function getUrl(urlStr) {
+function getUrl(urlStr: string) {
   const url = tryParseUrl(urlStr)
   if (!url) {
-    throw new Error('No URL specified');
+    throw new Error('No URL specified')
   }
 
-  validateUrlString(url);
+  validateUrlString(url)
 
-  const parsed = Url.parse(url);
-  return parsed.href;
+  const parsed = Url.parse(url)
+  return parsed.href
 }
 
-async function retrievePage(url, logRecord, functionStartTime, locale, timezone) {
-  validateUrlString(url);
+async function retrievePage(
+  url: string,
+  logRecord: Record<string, any>,
+  functionStartTime: number,
+  locale: string,
+  timezone: string
+) {
+  validateUrlString(url)
 
-  const browser = await getBrowserPromise;
-  logRecord.timing = { ...logRecord.timing, browserOpened: Date.now() - functionStartTime };
+  const browser = await getBrowserPromise
+  logRecord.timing = {
+    ...logRecord.timing,
+    browserOpened: Date.now() - functionStartTime,
+  }
 
-  const context = await browser.createIncognitoBrowserContext();
+  const context = await browser.createIncognitoBrowserContext()
   const page = await context.newPage()
 
   if (!enableJavascriptForUrl(url)) {
-    await page.setJavaScriptEnabled(false);
+    await page.setJavaScriptEnabled(false)
   }
-  await page.setUserAgent(userAgentForUrl(url));
+  await page.setUserAgent(userAgentForUrl(url))
 
   // set locale for the page
   if (locale) {
-    await page.setExtraHTTPHeaders({ 'Accept-Language': locale });
+    await page.setExtraHTTPHeaders({ 'Accept-Language': locale })
   }
 
   // set timezone for the page
   if (timezone) {
-    await page.emulateTimezone(timezone);
+    await page.emulateTimezone(timezone)
   }
 
-  const client = await page.target().createCDPSession();
+  const client = await page.target().createCDPSession()
+
+  const downloadPath = path.resolve('./download_dir/')
+  await client.send('Page.setDownloadBehavior', {
+    behavior: 'allow',
+    downloadPath,
+  })
 
   // intercept request when response headers was received
   await client.send('Network.setRequestInterception', {
@@ -307,107 +358,126 @@ async function retrievePage(url, logRecord, functionStartTime, locale, timezone)
       interceptionStage: 'HeadersReceived',
     },
   ],
-  });
-
-  const path = require('path');
-  const download_path = path.resolve('./download_dir/');
-
-  await client.send('Page.setDownloadBehavior', {
-    behavior: 'allow',
-    userDataDir: './',
-    downloadPath: download_path,
   })
 
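+  // Intercepting at the HeadersReceived stage lets us cancel responses whose
+  // content-type is outside ALLOWED_CONTENT_TYPES before the body downloads.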
-  client.on('Network.requestIntercepted', async e => {
-    const headers = e.responseHeaders || {};
+  client.on(
+    'Network.requestIntercepted',
+    (e: Protocol.Network.RequestInterceptedEvent) => {
+      ;(async () => {
+        const headers = e.responseHeaders || {}
 
-    const [contentType] = (headers['content-type'] || headers['Content-Type'] || '')
-      .toLowerCase()
-      .split(';');
-    const obj = { interceptionId: e.interceptionId };
+        const [contentType] = (
+          headers['content-type'] ||
+          headers['Content-Type'] ||
+          ''
+        )
+          .toLowerCase()
+          .split(';')
+        const obj: Protocol.Network.ContinueInterceptedRequestRequest = {
+          interceptionId: e.interceptionId,
+        }
 
-    if (e.responseStatusCode >= 200 && e.responseStatusCode < 300) {
-      // We only check content-type on success responses
-      // as it doesn't matter what the content type is for things
-      // like redirects
-      if (contentType && !ALLOWED_CONTENT_TYPES.includes(contentType)) {
-        obj['errorReason'] = 'BlockedByClient';
-      }
-    }
+        if (
+          e.responseStatusCode &&
+          e.responseStatusCode >= 200 &&
+          e.responseStatusCode < 300
+        ) {
+          // We only check content-type on success responses
+          // as it doesn't matter what the content type is for things
+          // like redirects
+          if (contentType && !ALLOWED_CONTENT_TYPES.includes(contentType)) {
+            obj['errorReason'] = 'BlockedByClient'
+          }
+        }
 
-    try {
-      await client.send('Network.continueInterceptedRequest', obj);
-      // eslint-disable-next-line no-empty
-    } catch {}
-  });
+        try {
+          await client.send('Network.continueInterceptedRequest', obj)
+        } catch {
+          // ignore
+        }
+      })()
+    }
+  )
 
   /*
-  * Disallow MathJax from running in Puppeteer and modifying the document,
-  * we shall instead run it in our frontend application to transform any
-  * mathjax content when present.
-  */
-  await page.setRequestInterception(true);
-  let requestCount = 0;
-  page.on('request', request => {
-    if (request.resourceType() === 'font') {
-      // Disallow fonts from loading
-      request.abort();
-      return;
-    }
-    if (requestCount++ > 100) {
-      request.abort();
-      return;
-    }
-    if (
-      request.resourceType() === 'script' &&
-      request.url().toLowerCase().indexOf('mathjax') > -1
-    ) {
-      request.abort();
-      return
-    }
-    request.continue();
-  });
+   * Disallow MathJax from running in Puppeteer and modifying the document,
+   * we shall instead run it in our frontend application to transform any
+   * mathjax content when present.
+   */
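+  // Besides MathJax, the handler below also blocks font loads and aborts all
+  // traffic after roughly 100 requests to keep heavy pages bounded.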
+  await page.setRequestInterception(true)
+  let requestCount = 0
+  page.on('request', (request) => {
+    ;(async () => {
+      if (request.resourceType() === 'font') {
+        // Disallow fonts from loading
+        return request.abort()
+      }
+      if (requestCount++ > 100) {
+        return request.abort()
+      }
+      if (
+        request.resourceType() === 'script' &&
+        request.url().toLowerCase().indexOf('mathjax') > -1
+      ) {
+        return request.abort()
+      }
+
+      await request.continue()
+    })()
+  })
 
   // Puppeteer fails during download of PDf files,
   // so record the failure and use those items
-  let lastPdfUrl = undefined;
-  page.on('response', response => {
+  let lastPdfUrl = undefined
+  page.on('response', (response) => {
     if (response.headers()['content-type'] === 'application/pdf') {
-      lastPdfUrl = response.url();
+      lastPdfUrl = response.url()
     }
-  });
+  })
 
   try {
-    const response = await page.goto(url, { timeout: 30 * 1000, waitUntil: ['networkidle2'] });
-    const finalUrl = response.url();
-    const contentType = response.headers()['content-type'];
-
-    logRecord.finalUrl = response.url();
-    logRecord.contentType = response.headers()['content-type'];
-
-    return { context, page, response, finalUrl, contentType };
+    const response = await page.goto(url, {
+      timeout: 30 * 1000,
+      waitUntil: ['networkidle2'],
+    })
+    if (!response) {
+      throw new Error('No response from page')
+    }
+
+    const finalUrl = response.url()
+    const contentType = response.headers()['content-type']
+
+    logRecord.finalUrl = response.url()
+    logRecord.contentType = response.headers()['content-type']
+
+    return { context, page, response, finalUrl, contentType }
   } catch (error) {
     if (lastPdfUrl) {
-      return { context, page, finalUrl: lastPdfUrl, contentType: 'application/pdf' };
+      return {
+        context,
+        page,
+        finalUrl: lastPdfUrl,
+        contentType: 'application/pdf',
+      }
    }
-    await context.close();
-    throw error;
+    await context.close()
+    throw error
   }
 }
 
-async function retrieveHtml(page, logRecord) {
-  let domContent = '', title;
+async function retrieveHtml(page: Page, logRecord: Record<string, any>) {
+  let domContent = '',
+    title
   try {
-    title = await page.title();
-    logRecord.title = title;
+    title = await page.title()
+    logRecord.title = title
 
-    const pageScrollingStart = Date.now();
+    const pageScrollingStart = Date.now()
     /* scroll with a 5 seconds timeout */
     await Promise.race([
-      new Promise(resolve => {
-        (async function () {
-          try {
-            await page.evaluate(`(async () => {
+      page
+        .evaluate(
+          `(async () => {
           /* credit: https://github.com/puppeteer/puppeteer/issues/305 */
           return new Promise((resolve, reject) => {
            let scrollHeight = document.body.scrollHeight;
@@ -422,46 +492,56 @@ async function retrieveHtml(page, logRecord) {
            }
          }, 10);
        });
-        })()`);
-          } catch (e) {
-            logRecord.scrollError = true;
-          } finally {
-            resolve(true);
-          }
-        })();
-      }),
-      page.waitForTimeout(5000),
-    ]);
-    logRecord.timing = { ...logRecord.timing, pageScrolled: Date.now() - pageScrollingStart };
+        })()`
+        )
+        .catch((e) => {
+          console.log('error scrolling page', e)
+          logRecord.scrollError = true
+        }),
+      new Promise((r) => setTimeout(r, 5000)),
+    ])
 
+    logRecord.timing = {
+      ...logRecord.timing,
+      pageScrolled: Date.now() - pageScrollingStart,
+    }
 
-    const iframes = {};
-    const urls = [];
-    const framesPromises = [];
-    const allowedUrls = /instagram\.com/gi;
+    const iframes: Record<string, string> = {}
+    const urls: string[] = []
+    const framesPromises = []
+    const allowedUrls = /instagram\.com/gi
 
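+    // Instagram embeds live in iframes whose content would be lost in the
+    // main-document snapshot, so their inner HTML is captured here and then
+    // inlined as a div during the page.evaluate() pass below.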
     for (const frame of page.mainFrame().childFrames()) {
       if (frame.url() && allowedUrls.test(frame.url())) {
-        urls.push(frame.url());
-        framesPromises.push(frame.evaluate(el => el.innerHTML, await frame.$('body')));
+        urls.push(frame.url())
+        framesPromises.push(
+          frame.evaluate((el) => el?.innerHTML, await frame.$('body'))
+        )
       }
     }
-    (await Promise.all(framesPromises)).forEach((frame, index) => (iframes[urls[index]] = frame));
+    ;(await Promise.all(framesPromises)).forEach(
+      (frame, index) => (iframes[urls[index]] = frame)
+    )
 
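+    // The in-page snapshot below drops blurred placeholder images, converts
+    // CSS background-images into <img> nodes, inlines the captured iframe
+    // embeds, and bails out early when a bot-block page is detected.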
-    const domContentCapturingStart = Date.now();
+    const domContentCapturingStart = Date.now()
     // get document body with all hidden elements removed
-    domContent = await page.evaluate(iframes => {
-      const BI_SRC_REGEXP = /url\("(.+?)"\)/gi;
+    domContent = await page.evaluate((iframes) => {
+      const BI_SRC_REGEXP = /url\("(.+?)"\)/gi
 
-      Array.from(document.body.getElementsByTagName('*')).forEach(el => {
-        const style = window.getComputedStyle(el);
+      Array.from(document.body.getElementsByTagName('*')).forEach((el) => {
+        const style = window.getComputedStyle(el)
+        const src = el.getAttribute('src')
 
         try {
           // Removing blurred images since they are mostly the copies of lazy loaded ones
-          if (el.tagName && ['img', 'image'].includes(el.tagName.toLowerCase())) {
-            const filter = style.getPropertyValue('filter');
+          if (
+            el.tagName &&
+            ['img', 'image'].includes(el.tagName.toLowerCase())
+          ) {
+            const filter = style.getPropertyValue('filter')
             if (filter && filter.startsWith('blur')) {
-              el.parentNode && el.parentNode.removeChild(el);
+              el.parentNode && el.parentNode.removeChild(el)
            }
          }
        } catch (err) {
@@ -469,69 +549,80 @@ async function retrieveHtml(page, logRecord) {
        }
 
         // convert all nodes with background image to img nodes
-        if (!['', 'none'].includes(style.getPropertyValue('background-image'))) {
-          const filter = style.getPropertyValue('filter');
+        if (
+          !['', 'none'].includes(style.getPropertyValue('background-image'))
+        ) {
+          const filter = style.getPropertyValue('filter')
           // avoiding image nodes with a blur effect creation
           if (filter && filter.startsWith('blur')) {
-            el && el.parentNode && el.parentNode.removeChild(el);
+            el && el.parentNode && el.parentNode.removeChild(el)
          } else {
-            const matchedSRC = BI_SRC_REGEXP.exec(style.getPropertyValue('background-image'));
+            const matchedSRC = BI_SRC_REGEXP.exec(
+              style.getPropertyValue('background-image')
+            )
             // Using "g" flag with a regex we have to manually break down lastIndex to zero after every usage
             // More details here: https://stackoverflow.com/questions/1520800/why-does-a-regexp-with-global-flag-give-wrong-results
-            BI_SRC_REGEXP.lastIndex = 0;
+            BI_SRC_REGEXP.lastIndex = 0
 
-            if (matchedSRC && matchedSRC[1] && !el.src) {
+            if (matchedSRC && matchedSRC[1] && !src) {
               // Replacing element only of there are no content inside, b/c might remove important div with content.
               // Article example: http://www.josiahzayner.com/2017/01/genetic-designer-part-i.html
               // DIV with class "content-inner" has `url("https://resources.blogblog.com/blogblog/data/1kt/travel/bg_container.png")` background image.
               if (!el.textContent) {
-                const img = document.createElement('img');
-                img.src = matchedSRC[1];
-                el && el.parentNode && el.parentNode.replaceChild(img, el);
+                const img = document.createElement('img')
+                img.src = matchedSRC[1]
+                el && el.parentNode && el.parentNode.replaceChild(img, el)
              }
            }
          }
        }
 
         if (el.tagName === 'IFRAME') {
-          if (iframes[el.src]) {
-            const newNode = document.createElement('div');
-            newNode.className = 'omnivore-instagram-embed';
-            newNode.innerHTML = iframes[el.src];
-            el && el.parentNode && el.parentNode.replaceChild(newNode, el);
+          if (src && iframes[src]) {
+            const newNode = document.createElement('div')
+            newNode.className = 'omnivore-instagram-embed'
+            newNode.innerHTML = iframes[src]
+            el && el.parentNode && el.parentNode.replaceChild(newNode, el)
          }
        }
-      });
+      })
 
-      if (document.querySelector('[data-translate="managed_checking_msg"]') ||
-        document.getElementById('px-block-form-wrapper')) {
+      if (
+        document.querySelector('[data-translate="managed_checking_msg"]') ||
+        document.getElementById('px-block-form-wrapper')
+      ) {
         return 'IS_BLOCKED'
      }
-      return document.documentElement.outerHTML;
-    }, iframes);
-    logRecord.puppeteerSuccess = true;
+      return document.documentElement.outerHTML
+    }, iframes)
+    logRecord.puppeteerSuccess = true
     logRecord.timing = {
       ...logRecord.timing,
       contenCaptured: Date.now() - domContentCapturingStart,
-    };
+    }
     // [END puppeteer-block]
   } catch (e) {
-    if (e.message.startsWith('net::ERR_BLOCKED_BY_CLIENT at ')) {
-      logRecord.blockedByClient = true;
+    if (e instanceof Error) {
+      if (e.message.startsWith('net::ERR_BLOCKED_BY_CLIENT at ')) {
+        logRecord.blockedByClient = true
+      } else {
+        logRecord.puppeteerSuccess = false
+        logRecord.puppeteerError = {
+          message: e.message,
+          stack: e.stack,
+        }
+      }
    } else {
-      logRecord.puppeteerSuccess = false;
-      logRecord.puppeteerError = {
-        message: e.message,
-        stack: e.stack,
-      };
+      logRecord.puppeteerSuccess = false
+      logRecord.puppeteerError = e
    }
  }
 
   if (domContent === 'IS_BLOCKED') {
-    return { isBlocked: true };
+    return { isBlocked: true }
   }
-  return { domContent, title };
+  return { domContent, title }
 }
 
 // async function preview(req, res) {
@@ -669,7 +760,7 @@ const DOM_PURIFY_CONFIG = {
   ],
 }
 
-function domPurifySanitizeHook(node, data) {
+function domPurifySanitizeHook(node: Element, data: SanitizeElementHookEvent) {
   if (data.tagName === 'iframe') {
     const urlRegex = /^(https?:)?\/\/www\.youtube(-nocookie)?\.com\/embed\//i
     const src = node.getAttribute('src') || ''
@@ -688,7 +779,7 @@ function domPurifySanitizeHook(node, data) {
   }
 }
 
-function getPurifiedContent(html) {
+function getPurifiedContent(html: Document) {
   const newWindow = parseHTML('')
   const DOMPurify = createDOMPurify(newWindow)
   DOMPurify.addHook('uponSanitizeElement', domPurifySanitizeHook)
@@ -696,13 +787,16 @@ function getPurifiedContent(html) {
   return parseHTML(clean).document
 }
 
-function signImageProxyUrl(url) {
+function signImageProxyUrl(url: string) {
   return encode(
-    crypto.createHmac('sha256', process.env.IMAGE_PROXY_SECRET).update(url).digest()
+    crypto
+      .createHmac('sha256', process.env.IMAGE_PROXY_SECRET || '')
+      .update(url)
+      .digest()
   )
 }
 
-function createImageProxyUrl(url, width = 0, height = 0) {
+function createImageProxyUrl(url: string, width = 0, height = 0) {
   if (!process.env.IMAGE_PROXY_URL || !process.env.IMAGE_PROXY_SECRET) {
     return url
   }
@@ -713,7 +807,7 @@ function createImageProxyUrl(url, width = 0, height = 0) {
   return `${process.env.IMAGE_PROXY_URL}/${width}x${height},s${signature}/${url}`
 }
 
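+// Example (assumed configuration): with IMAGE_PROXY_URL=https://img.example.com
+// and a signing secret set, createImageProxyUrl('https://site.com/a.png', 400, 300)
+// returns 'https://img.example.com/400x300,s<signature>/https://site.com/a.png'.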
-async function getReadabilityResult(url, document) {
+async function getReadabilityResult(url: string, document: Document) {
   // First attempt to read the article as is.
   // if that fails attempt to purify then read
   const sources = [
@@ -747,9 +841,3 @@ async function getReadabilityResult(url, document) {
 
   return null
 }
-
-module.exports = {
-  fetchContent,
-  // preview,
-};
-
diff --git a/packages/puppeteer-parse/src/readability.d.ts b/packages/puppeteer-parse/src/readability.d.ts
new file mode 100644
index 000000000..4722588cd
--- /dev/null
+++ b/packages/puppeteer-parse/src/readability.d.ts
@@ -0,0 +1,173 @@
+// Type definitions for non-npm package mozilla-readability 0.2
+// Project: https://github.com/mozilla/readability
+// Definitions by: Charles Vandevoorde, Alex Wendland
+// Definitions: https://github.com/DefinitelyTyped/DefinitelyTyped
+// TypeScript Version: 2.2
+
+declare module '@omnivore/readability' {
+  /**
+   * A standalone version of the readability library used for Firefox Reader View.
+   *
+   * Note that isProbablyReaderable() was moved into a separate file in https://github.com/mozilla/readability/commit/2620542dd1e8380220d82afa97a2c283ae636e40
+   * and therefore is no longer part of the Readability class.
+   */
+  class Readability {
+    /**
+     * ## Usage on the web
+     *
+     * To parse a document, you must create a new Readability object from a
+     * DOM document object, and then call parse(). Here's an example:
+     *
+     * ```js
+     * var article = new Readability(document).parse();
+     * ```
+     *
+     * If you're using Readability on the web, you will likely be able to
+     * use a document reference from elsewhere (e.g. fetched via XMLHttpRequest,
+     * in a same-origin