From 3a79710dbf159c9fccbd1eddc06add9620ac3c04 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Tue, 5 Jul 2022 21:48:58 +0800 Subject: [PATCH] Always fall back to scrapingbee if there is an exception --- packages/content-fetch/fetch-content.js | 46 ++++++++++++++++++------- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/packages/content-fetch/fetch-content.js b/packages/content-fetch/fetch-content.js index 2c6ae8a02..ccee392ce 100644 --- a/packages/content-fetch/fetch-content.js +++ b/packages/content-fetch/fetch-content.js @@ -301,18 +301,18 @@ async function fetchContent(req, res) { } } - var context, page, finalUrl; - if ((!content || !title) && contentType !== 'application/pdf') { - const result = await retrievePage(url) - if (result && result.context) { context = result.context } - if (result && result.page) { page = result.page } - if (result && result.finalUrl) { finalUrl = result.finalUrl } - if (result && result.contentType) { contentType = result.contentType } - } else { - finalUrl = url - } - + let context, page, finalUrl; try { + if ((!content || !title) && contentType !== 'application/pdf') { + const result = await retrievePage(url) + if (result && result.context) { context = result.context } + if (result && result.page) { page = result.page } + if (result && result.finalUrl) { finalUrl = result.finalUrl } + if (result && result.contentType) { contentType = result.contentType } + } else { + finalUrl = url + } + if (contentType === 'application/pdf') { const uploadedFileId = await uploadPdf(finalUrl, userId, articleSavingRequestId); const l = await saveUploadedPdf(userId, finalUrl, uploadedFileId, articleSavingRequestId); @@ -355,7 +355,29 @@ async function fetchContent(req, res) { console.log('error', e) logRecord.error = e.message; console.log(`Error while retrieving page`, logRecord); - return res.sendStatus(503); + + // fallback to scrapingbee + const sbResult = await fetchContentWithScrapingBee(url); + const url = finalUrl || sbResult.url; + const content = sbResult.domContent; + logRecord.fetchContentTime = Date.now() - functionStartTime; + + const apiResponse = await sendCreateArticleMutation(userId, { + url, + articleSavingRequestId, + preparedDocument: { + document: content, + pageInfo: { + title: sbResult.title, + canonicalUrl: url, + }, + }, + skipParsing: !content, + }); + + logRecord.totalTime = Date.now() - functionStartTime; + logRecord.result = apiResponse.createArticle; + console.log(`parse-page`, logRecord); } finally { if (context) { await context.close();