Merge pull request #3950 from omnivore-app/fix/puppeteer-connection-timeout

reconnect/restart browser if it crashed/lost connections
This commit is contained in:
Hongbo Wu
2024-05-14 14:19:37 +08:00
committed by GitHub

View File

@ -57,25 +57,19 @@ const userAgentForUrl = (url: string) => {
}
const fetchContentWithScrapingBee = async (url: string) => {
try {
const response = await axios.get('https://app.scrapingbee.com/api/v1', {
params: {
api_key: process.env.SCRAPINGBEE_API_KEY,
url: url,
render_js: 'false',
premium_proxy: 'true',
country_code: 'us',
},
timeout: REQUEST_TIMEOUT,
})
const response = await axios.get('https://app.scrapingbee.com/api/v1', {
params: {
api_key: process.env.SCRAPINGBEE_API_KEY,
url: url,
render_js: 'false',
premium_proxy: 'true',
country_code: 'us',
},
timeout: REQUEST_TIMEOUT,
})
const dom = parseHTML(response.data).document
return { title: dom.title, domContent: dom.documentElement.outerHTML, url }
} catch (e) {
console.error('error fetching with scrapingbee', e)
return { title: url, domContent: '', url }
}
const dom = parseHTML(response.data).document
return { title: dom.title, domContent: dom.documentElement.outerHTML, url }
}
const enableJavascriptForUrl = (url: string) => {
@ -92,11 +86,13 @@ const enableJavascriptForUrl = (url: string) => {
return true
}
let browser: Browser
// launch Puppeteer
const getBrowserPromise = (async () => {
const launchBrowser = async () => {
console.log('starting puppeteer browser')
const browser = (await puppeteer.launch({
browser = (await puppeteer.launch({
args: [
'--allow-running-insecure-content',
'--autoplay-policy=user-gesture-required',
@ -134,10 +130,26 @@ const getBrowserPromise = (async () => {
dumpio: true, // show console logs in the terminal
})) as Browser
console.log('browser started')
const version = await browser.version()
console.log('browser started', version)
return browser
})()
// eslint-disable-next-line @typescript-eslint/no-misused-promises
browser.on('disconnected', async () => {
console.log('browser disconnected, reconnecting...')
const childProcess = browser.process()
if (childProcess) {
childProcess.kill('SIGINT')
console.log('browser child process killed')
}
await launchBrowser()
console.log('browser reconnected')
})
}
// initialize Puppeteer
;(async () => await launchBrowser())()
export const fetchContent = async (
url: string,
@ -224,8 +236,11 @@ export const fetchContent = async (
}
} catch (e) {
console.error(`Error while retrieving page ${url}`, e)
const browser = await getBrowserPromise
console.log(browser.debugInfo.pendingProtocolErrors)
console.error(
'pendingProtocolErrors',
browser.debugInfo.pendingProtocolErrors
)
// fallback to scrapingbee for non pdf content
if (url && contentType !== 'application/pdf') {
@ -310,7 +325,6 @@ async function retrievePage(
) {
validateUrlString(url)
const browser = await getBrowserPromise
logRecord.timing = {
...logRecord.timing,
browserOpened: Date.now() - functionStartTime,
@ -469,9 +483,9 @@ async function retrieveHtml(page: Page, logRecord: Record<string, any>) {
const pageScrollingStart = Date.now()
/* scroll with a 5 seconds timeout */
await Promise.race([
await page
.evaluate(
try {
await Promise.race([
page.evaluate(
`(async () => {
/* credit: https://github.com/puppeteer/puppeteer/issues/305 */
return new Promise((resolve, reject) => {
@ -488,13 +502,13 @@ async function retrieveHtml(page: Page, logRecord: Record<string, any>) {
}, 10);
});
})()`
)
.catch((e) => {
console.log('error scrolling page', e)
logRecord.scrollError = true
}),
new Promise((r) => setTimeout(r, 5000)),
])
),
new Promise((r) => setTimeout(r, 5000)),
])
} catch (error) {
console.error('Error scrolling page', error)
logRecord.scrollError = true
}
logRecord.timing = {
...logRecord.timing,