From 484676750e509f8859857dde32b5ea1a120e012d Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Tue, 14 May 2024 13:04:17 +0800 Subject: [PATCH 1/2] reconnect/restart browser if it crashed/lost connections --- packages/puppeteer-parse/src/index.ts | 86 ++++++++++++++++----------- 1 file changed, 51 insertions(+), 35 deletions(-) diff --git a/packages/puppeteer-parse/src/index.ts b/packages/puppeteer-parse/src/index.ts index 19e877a07..486ee9cbd 100644 --- a/packages/puppeteer-parse/src/index.ts +++ b/packages/puppeteer-parse/src/index.ts @@ -57,25 +57,19 @@ const userAgentForUrl = (url: string) => { } const fetchContentWithScrapingBee = async (url: string) => { - try { - const response = await axios.get('https://app.scrapingbee.com/api/v1', { - params: { - api_key: process.env.SCRAPINGBEE_API_KEY, - url: url, - render_js: 'false', - premium_proxy: 'true', - country_code: 'us', - }, - timeout: REQUEST_TIMEOUT, - }) + const response = await axios.get('https://app.scrapingbee.com/api/v1', { + params: { + api_key: process.env.SCRAPINGBEE_API_KEY, + url: url, + render_js: 'false', + premium_proxy: 'true', + country_code: 'us', + }, + timeout: REQUEST_TIMEOUT, + }) - const dom = parseHTML(response.data).document - return { title: dom.title, domContent: dom.documentElement.outerHTML, url } - } catch (e) { - console.error('error fetching with scrapingbee', e) - - return { title: url, domContent: '', url } - } + const dom = parseHTML(response.data).document + return { title: dom.title, domContent: dom.documentElement.outerHTML, url } } const enableJavascriptForUrl = (url: string) => { @@ -92,11 +86,13 @@ const enableJavascriptForUrl = (url: string) => { return true } +let browser: Browser + // launch Puppeteer -const getBrowserPromise = (async () => { +const launchBrowser = async () => { console.log('starting puppeteer browser') - const browser = (await puppeteer.launch({ + browser = (await puppeteer.launch({ args: [ '--allow-running-insecure-content', '--autoplay-policy=user-gesture-required', @@ -134,10 +130,28 @@ const getBrowserPromise = (async () => { dumpio: true, // show console logs in the terminal })) as Browser - console.log('browser started') + const version = await browser.version() + console.log('browser started', version) + + // eslint-disable-next-line @typescript-eslint/no-misused-promises + browser.on('disconnected', async () => { + console.log('browser disconnected, reconnecting...') + + const childProcess = browser.process() + if (childProcess) { + childProcess.kill('SIGINT') + console.log('browser child process killed') + } + + browser = await launchBrowser() + console.log('browser reconnected') + }) return browser -})() +} + +// initialize Puppeteer +;(async () => await launchBrowser())() export const fetchContent = async ( url: string, @@ -224,8 +238,11 @@ export const fetchContent = async ( } } catch (e) { console.error(`Error while retrieving page ${url}`, e) - const browser = await getBrowserPromise - console.log(browser.debugInfo.pendingProtocolErrors) + + console.error( + 'pendingProtocolErrors', + browser.debugInfo.pendingProtocolErrors + ) // fallback to scrapingbee for non pdf content if (url && contentType !== 'application/pdf') { @@ -310,7 +327,6 @@ async function retrievePage( ) { validateUrlString(url) - const browser = await getBrowserPromise logRecord.timing = { ...logRecord.timing, browserOpened: Date.now() - functionStartTime, @@ -469,9 +485,9 @@ async function retrieveHtml(page: Page, logRecord: Record) { const pageScrollingStart = Date.now() /* scroll with a 5 seconds timeout */ - await Promise.race([ - await page - .evaluate( + try { + await Promise.race([ + page.evaluate( `(async () => { /* credit: https://github.com/puppeteer/puppeteer/issues/305 */ return new Promise((resolve, reject) => { @@ -488,13 +504,13 @@ async function retrieveHtml(page: Page, logRecord: Record) { }, 10); }); })()` - ) - .catch((e) => { - console.log('error scrolling page', e) - logRecord.scrollError = true - }), - new Promise((r) => setTimeout(r, 5000)), - ]) + ), + new Promise((r) => setTimeout(r, 5000)), + ]) + } catch (error) { + console.error('Error scrolling page', error) + logRecord.scrollError = true + } logRecord.timing = { ...logRecord.timing, From cd315fa6c6f8ded7825df648f60d9a6a5441dd7a Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Tue, 14 May 2024 13:07:42 +0800 Subject: [PATCH 2/2] remove redundant assignment --- packages/puppeteer-parse/src/index.ts | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/packages/puppeteer-parse/src/index.ts b/packages/puppeteer-parse/src/index.ts index 486ee9cbd..b622cfd36 100644 --- a/packages/puppeteer-parse/src/index.ts +++ b/packages/puppeteer-parse/src/index.ts @@ -143,11 +143,9 @@ const launchBrowser = async () => { console.log('browser child process killed') } - browser = await launchBrowser() + await launchBrowser() console.log('browser reconnected') }) - - return browser } // initialize Puppeteer