From a4449e6f931bc44b54d917efab89850ca4e50df5 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 12 Jun 2023 21:05:54 +0800 Subject: [PATCH] better handling error in puppeteer --- packages/puppeteer-parse/index.js | 181 ++++++++++++++++++------------ 1 file changed, 110 insertions(+), 71 deletions(-) diff --git a/packages/puppeteer-parse/index.js b/packages/puppeteer-parse/index.js index 7c5bda58a..14bbfde27 100644 --- a/packages/puppeteer-parse/index.js +++ b/packages/puppeteer-parse/index.js @@ -46,6 +46,8 @@ const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/pl const IMPORTER_METRICS_COLLECTOR_URL = process.env.IMPORTER_METRICS_COLLECTOR_URL; +const REQUEST_TIMEOUT = 30000; + const userAgentForUrl = (url) => { try { const u = new URL(url); @@ -69,7 +71,8 @@ const fetchContentWithScrapingBee = async (url) => { 'render_js': 'false', 'premium_proxy': 'true', 'country_code':'us' - } + }, + timeout: REQUEST_TIMEOUT, }) const dom = parseHTML(response.data).document; @@ -131,14 +134,19 @@ const getBrowserPromise = (async () => { })(); const uploadToSignedUrl = async ({ id, uploadSignedUrl }, contentType, contentObjUrl) => { - const stream = await axios.get(contentObjUrl, { responseType: 'stream' }); - return await axios.put(uploadSignedUrl, stream.data, { - headers: { - 'Content-Type': contentType, - }, - maxBodyLength: 1000000000, - maxContentLength: 100000000, - }) + try { + const stream = await axios.get(contentObjUrl, { responseType: 'stream' }); + return axios.put(uploadSignedUrl, stream.data, { + headers: { + 'Content-Type': contentType, + }, + maxBodyLength: 1000000000, + maxContentLength: 100000000, + }); + } catch (error) { + console.error('error uploading to signed url', error.message); + return null; + } }; const getUploadIdAndSignedUrl = async (userId, url, articleSavingRequestId) => { @@ -164,21 +172,39 @@ const getUploadIdAndSignedUrl = async (userId, url, articleSavingRequestId) => { } }); - const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data, + try { + const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data, { headers: { Cookie: `auth=${auth};`, 'Content-Type': 'application/json', }, + timeout: REQUEST_TIMEOUT, }); - return response.data.data.uploadFileRequest; + + if (response.data.data.uploadFileRequest.errorCodes && response.data.data.uploadFileRequest.errorCodes.length > 0) { + console.error('Error while getting upload id and signed url', response.data.data.uploadFileRequest.errorCodes[0]); + return null; + } + + return response.data.data.uploadFileRequest; + } catch (e) { + console.error('error getting upload id and signed url', e.message); + return null; + } }; const uploadPdf = async (url, userId, articleSavingRequestId) => { validateUrlString(url); const uploadResult = await getUploadIdAndSignedUrl(userId, url, articleSavingRequestId); - await uploadToSignedUrl(uploadResult, 'application/pdf', url); + if (!uploadResult) { + throw new Error('error while getting upload id and signed url'); + } + const uploaded = await uploadToSignedUrl(uploadResult, 'application/pdf', url); + if (!uploaded) { + throw new Error('error while uploading pdf'); + } return uploadResult.id; }; @@ -202,14 +228,26 @@ const sendCreateArticleMutation = async (userId, input) => { }); const auth = await signToken({ uid: userId }, process.env.JWT_SECRET); - const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data, + try { + const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data, { headers: { Cookie: `auth=${auth};`, 'Content-Type': 'application/json', }, + timeout: REQUEST_TIMEOUT, }); - return response.data.data.createArticle; + + if (response.data.data.createArticle.errorCodes && response.data.data.createArticle.errorCodes.length > 0) { + console.error('error while creating article', response.data.data.createArticle.errorCodes[0]); + return null; + } + + return response.data.data.createArticle; + } catch (error) { + console.error('error creating article', error.message); + return null; + } }; const sendSavePageMutation = async (userId, input) => { @@ -231,14 +269,26 @@ const sendSavePageMutation = async (userId, input) => { }); const auth = await signToken({ uid: userId }, process.env.JWT_SECRET); - const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data, + try { + const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data, { headers: { Cookie: `auth=${auth};`, 'Content-Type': 'application/json', }, + timeout: REQUEST_TIMEOUT, }); - return response.data.data.savePage; + + if (response.data.data.savePage.errorCodes && response.data.data.savePage.errorCodes.length > 0) { + console.error('error while saving page', response.data.data.savePage.errorCodes[0]); + return null; + } + + return response.data.data.savePage; + } catch (error) { + console.error('error saving page', error.message); + return null; + } }; const saveUploadedPdf = async (userId, url, uploadFileId, articleSavingRequestId) => { @@ -265,9 +315,10 @@ const sendImportStatusUpdate = async (userId, taskId, status) => { 'Authorization': auth, 'Content-Type': 'application/json', }, + timeout: REQUEST_TIMEOUT, }); } catch (e) { - console.error('Error while sending import status update', e); + console.error('error while sending import status update', e); } }; @@ -318,7 +369,7 @@ async function fetchContent(req, res) { console.info('error with handler: ', e); } - let context, page, finalUrl; + let context, page, finalUrl, statusCode = 200; try { if ((!content || !title) && contentType !== 'application/pdf') { const result = await retrievePage(url, logRecord, functionStartTime); @@ -332,7 +383,13 @@ async function fetchContent(req, res) { if (contentType === 'application/pdf') { const uploadedFileId = await uploadPdf(finalUrl, userId, articleSavingRequestId); - await saveUploadedPdf(userId, finalUrl, uploadedFileId, articleSavingRequestId); + const uploadedPdf = await saveUploadedPdf(userId, finalUrl, uploadedFileId, articleSavingRequestId); + if (!uploadedPdf) { + statusCode = 500; + logRecord.error = 'error while saving uploaded pdf'; + } else { + importStatus = 'imported'; + } } else { if (!content || !title) { const result = await retrieveHtml(page, logRecord); @@ -347,19 +404,39 @@ async function fetchContent(req, res) { } else { console.info('using prefetched content and title'); } - logRecord.fetchContentTime = Date.now() - functionStartTime; + } + } catch (e) { + logRecord.error = e.message; + console.error(`Error while retrieving page`, logRecord); + // fallback to scrapingbee for non pdf content + if (contentType !== 'application/pdf') { + const fetchStartTime = Date.now(); + const sbResult = await fetchContentWithScrapingBee(url); + content = sbResult.domContent; + title = sbResult.title; + logRecord.fetchContentTime = Date.now() - fetchStartTime; + } + } finally { + // close browser context if it was opened + if (context) { + await context.close(); + } + // save non pdf content + if (contentType !== 'application/pdf') { + // parse content if it is not empty let readabilityResult = null; if (content) { - const document = parseHTML(content).document; - + let document = parseHTML(content).document; // preParse content - const preParsedDom = (await preParseContent(url, document)) || document; - - readabilityResult = await getReadabilityResult(url, preParsedDom); + const preParsedDom = await preParseContent(url, document) + if (preParsedDom) { + document = preParsedDom + } + readabilityResult = await getReadabilityResult(url, document); } - + const apiResponse = await sendSavePageMutation(userId, { url, clientRequestId: articleSavingRequestId, @@ -369,61 +446,23 @@ async function fetchContent(req, res) { state, labels, }); - - logRecord.totalTime = Date.now() - functionStartTime; - logRecord.result = apiResponse.createArticle; - } - - importStatus = 'imported'; - } catch (e) { - logRecord.error = e.message; - console.error(`Error while retrieving page`, logRecord); - - // fallback to scrapingbee - const sbResult = await fetchContentWithScrapingBee(url); - const content = sbResult.domContent; - const title = sbResult.title; - logRecord.fetchContentTime = Date.now() - functionStartTime; - - let readabilityResult = null; - if (content) { - let document = parseHTML(content).document; - - // preParse content - const preParsedDom = await preParseContent(url, document) - if (preParsedDom) { - document = preParsedDom + if (!apiResponse) { + logRecord.error = 'error while saving page'; + statusCode = 500; + } else { + importStatus = readabilityResult ? 'imported' : 'failed'; } - - readabilityResult = await getReadabilityResult(url, document); } - const apiResponse = await sendSavePageMutation(userId, { - url, - clientRequestId: articleSavingRequestId, - title, - originalContent: content, - parseResult: readabilityResult, - state, - labels, - }); - logRecord.totalTime = Date.now() - functionStartTime; - logRecord.result = apiResponse.createArticle; - - importStatus = 'failed'; - } finally { - if (context) { - await context.close(); - } console.info(`parse-page`, logRecord); // send import status to update the metrics - if (taskId) { + if (taskId && importStatus) { await sendImportStatusUpdate(userId, taskId, importStatus); } - res.sendStatus(200); + res.sendStatus(statusCode); } }