From 1a48ad03e4c6198a7b669b1f5e8ad90b9566113e Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 12 Jun 2023 12:25:04 +0800 Subject: [PATCH 1/3] use a separate queue for thumbnailer --- packages/api/src/utils/createTask.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/packages/api/src/utils/createTask.ts b/packages/api/src/utils/createTask.ts index d6d9f12e5..c6cbe328c 100644 --- a/packages/api/src/utils/createTask.ts +++ b/packages/api/src/utils/createTask.ts @@ -56,7 +56,9 @@ const createHttpTaskWithToken = async ({ } // Construct the fully qualified queue name. - priority === 'low' && (queue = `${queue}-low`) + if (priority === 'low') { + queue = `${queue}-low` + } const parent = client.queuePath(project, location, queue) // Convert message to buffer. @@ -534,6 +536,7 @@ export const enqueueThumbnailTask = async ( payload, taskHandlerUrl: env.queue.thumbnailTaskHandlerUrl, requestHeaders: headers, + queue: 'omnivore-thumbnail-queue', }) if (!createdTasks || !createdTasks[0].name) { From a4449e6f931bc44b54d917efab89850ca4e50df5 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 12 Jun 2023 21:05:54 +0800 Subject: [PATCH 2/3] better handling error in puppeteer --- packages/puppeteer-parse/index.js | 181 ++++++++++++++++++------------ 1 file changed, 110 insertions(+), 71 deletions(-) diff --git a/packages/puppeteer-parse/index.js b/packages/puppeteer-parse/index.js index 7c5bda58a..14bbfde27 100644 --- a/packages/puppeteer-parse/index.js +++ b/packages/puppeteer-parse/index.js @@ -46,6 +46,8 @@ const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/pl const IMPORTER_METRICS_COLLECTOR_URL = process.env.IMPORTER_METRICS_COLLECTOR_URL; +const REQUEST_TIMEOUT = 30000; + const userAgentForUrl = (url) => { try { const u = new URL(url); @@ -69,7 +71,8 @@ const fetchContentWithScrapingBee = async (url) => { 'render_js': 'false', 'premium_proxy': 'true', 'country_code':'us' - } + }, + timeout: REQUEST_TIMEOUT, }) const dom = parseHTML(response.data).document; @@ -131,14 +134,19 @@ const getBrowserPromise = (async () => { })(); const uploadToSignedUrl = async ({ id, uploadSignedUrl }, contentType, contentObjUrl) => { - const stream = await axios.get(contentObjUrl, { responseType: 'stream' }); - return await axios.put(uploadSignedUrl, stream.data, { - headers: { - 'Content-Type': contentType, - }, - maxBodyLength: 1000000000, - maxContentLength: 100000000, - }) + try { + const stream = await axios.get(contentObjUrl, { responseType: 'stream' }); + return axios.put(uploadSignedUrl, stream.data, { + headers: { + 'Content-Type': contentType, + }, + maxBodyLength: 1000000000, + maxContentLength: 100000000, + }); + } catch (error) { + console.error('error uploading to signed url', error.message); + return null; + } }; const getUploadIdAndSignedUrl = async (userId, url, articleSavingRequestId) => { @@ -164,21 +172,39 @@ const getUploadIdAndSignedUrl = async (userId, url, articleSavingRequestId) => { } }); - const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data, + try { + const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data, { headers: { Cookie: `auth=${auth};`, 'Content-Type': 'application/json', }, + timeout: REQUEST_TIMEOUT, }); - return response.data.data.uploadFileRequest; + + if (response.data.data.uploadFileRequest.errorCodes && response.data.data.uploadFileRequest.errorCodes.length > 0) { + console.error('Error while getting upload id and signed url', response.data.data.uploadFileRequest.errorCodes[0]); + return null; + } + + return response.data.data.uploadFileRequest; + } catch (e) { + console.error('error getting upload id and signed url', e.message); + return null; + } }; const uploadPdf = async (url, userId, articleSavingRequestId) => { validateUrlString(url); const uploadResult = await getUploadIdAndSignedUrl(userId, url, articleSavingRequestId); - await uploadToSignedUrl(uploadResult, 'application/pdf', url); + if (!uploadResult) { + throw new Error('error while getting upload id and signed url'); + } + const uploaded = await uploadToSignedUrl(uploadResult, 'application/pdf', url); + if (!uploaded) { + throw new Error('error while uploading pdf'); + } return uploadResult.id; }; @@ -202,14 +228,26 @@ const sendCreateArticleMutation = async (userId, input) => { }); const auth = await signToken({ uid: userId }, process.env.JWT_SECRET); - const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data, + try { + const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data, { headers: { Cookie: `auth=${auth};`, 'Content-Type': 'application/json', }, + timeout: REQUEST_TIMEOUT, }); - return response.data.data.createArticle; + + if (response.data.data.createArticle.errorCodes && response.data.data.createArticle.errorCodes.length > 0) { + console.error('error while creating article', response.data.data.createArticle.errorCodes[0]); + return null; + } + + return response.data.data.createArticle; + } catch (error) { + console.error('error creating article', error.message); + return null; + } }; const sendSavePageMutation = async (userId, input) => { @@ -231,14 +269,26 @@ const sendSavePageMutation = async (userId, input) => { }); const auth = await signToken({ uid: userId }, process.env.JWT_SECRET); - const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data, + try { + const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data, { headers: { Cookie: `auth=${auth};`, 'Content-Type': 'application/json', }, + timeout: REQUEST_TIMEOUT, }); - return response.data.data.savePage; + + if (response.data.data.savePage.errorCodes && response.data.data.savePage.errorCodes.length > 0) { + console.error('error while saving page', response.data.data.savePage.errorCodes[0]); + return null; + } + + return response.data.data.savePage; + } catch (error) { + console.error('error saving page', error.message); + return null; + } }; const saveUploadedPdf = async (userId, url, uploadFileId, articleSavingRequestId) => { @@ -265,9 +315,10 @@ const sendImportStatusUpdate = async (userId, taskId, status) => { 'Authorization': auth, 'Content-Type': 'application/json', }, + timeout: REQUEST_TIMEOUT, }); } catch (e) { - console.error('Error while sending import status update', e); + console.error('error while sending import status update', e); } }; @@ -318,7 +369,7 @@ async function fetchContent(req, res) { console.info('error with handler: ', e); } - let context, page, finalUrl; + let context, page, finalUrl, statusCode = 200; try { if ((!content || !title) && contentType !== 'application/pdf') { const result = await retrievePage(url, logRecord, functionStartTime); @@ -332,7 +383,13 @@ async function fetchContent(req, res) { if (contentType === 'application/pdf') { const uploadedFileId = await uploadPdf(finalUrl, userId, articleSavingRequestId); - await saveUploadedPdf(userId, finalUrl, uploadedFileId, articleSavingRequestId); + const uploadedPdf = await saveUploadedPdf(userId, finalUrl, uploadedFileId, articleSavingRequestId); + if (!uploadedPdf) { + statusCode = 500; + logRecord.error = 'error while saving uploaded pdf'; + } else { + importStatus = 'imported'; + } } else { if (!content || !title) { const result = await retrieveHtml(page, logRecord); @@ -347,19 +404,39 @@ async function fetchContent(req, res) { } else { console.info('using prefetched content and title'); } - logRecord.fetchContentTime = Date.now() - functionStartTime; + } + } catch (e) { + logRecord.error = e.message; + console.error(`Error while retrieving page`, logRecord); + // fallback to scrapingbee for non pdf content + if (contentType !== 'application/pdf') { + const fetchStartTime = Date.now(); + const sbResult = await fetchContentWithScrapingBee(url); + content = sbResult.domContent; + title = sbResult.title; + logRecord.fetchContentTime = Date.now() - fetchStartTime; + } + } finally { + // close browser context if it was opened + if (context) { + await context.close(); + } + // save non pdf content + if (contentType !== 'application/pdf') { + // parse content if it is not empty let readabilityResult = null; if (content) { - const document = parseHTML(content).document; - + let document = parseHTML(content).document; // preParse content - const preParsedDom = (await preParseContent(url, document)) || document; - - readabilityResult = await getReadabilityResult(url, preParsedDom); + const preParsedDom = await preParseContent(url, document) + if (preParsedDom) { + document = preParsedDom + } + readabilityResult = await getReadabilityResult(url, document); } - + const apiResponse = await sendSavePageMutation(userId, { url, clientRequestId: articleSavingRequestId, @@ -369,61 +446,23 @@ async function fetchContent(req, res) { state, labels, }); - - logRecord.totalTime = Date.now() - functionStartTime; - logRecord.result = apiResponse.createArticle; - } - - importStatus = 'imported'; - } catch (e) { - logRecord.error = e.message; - console.error(`Error while retrieving page`, logRecord); - - // fallback to scrapingbee - const sbResult = await fetchContentWithScrapingBee(url); - const content = sbResult.domContent; - const title = sbResult.title; - logRecord.fetchContentTime = Date.now() - functionStartTime; - - let readabilityResult = null; - if (content) { - let document = parseHTML(content).document; - - // preParse content - const preParsedDom = await preParseContent(url, document) - if (preParsedDom) { - document = preParsedDom + if (!apiResponse) { + logRecord.error = 'error while saving page'; + statusCode = 500; + } else { + importStatus = readabilityResult ? 'imported' : 'failed'; } - - readabilityResult = await getReadabilityResult(url, document); } - const apiResponse = await sendSavePageMutation(userId, { - url, - clientRequestId: articleSavingRequestId, - title, - originalContent: content, - parseResult: readabilityResult, - state, - labels, - }); - logRecord.totalTime = Date.now() - functionStartTime; - logRecord.result = apiResponse.createArticle; - - importStatus = 'failed'; - } finally { - if (context) { - await context.close(); - } console.info(`parse-page`, logRecord); // send import status to update the metrics - if (taskId) { + if (taskId && importStatus) { await sendImportStatusUpdate(userId, taskId, importStatus); } - res.sendStatus(200); + res.sendStatus(statusCode); } } From 4a438f45f5d1f4bd848aeeffa220d96bc9d4001c Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 12 Jun 2023 21:31:35 +0800 Subject: [PATCH 3/3] add timeout and content length to the axios requests --- packages/import-handler/src/index.ts | 30 +++++++--- packages/puppeteer-parse/index.js | 2 +- packages/thumbnail-handler/src/index.ts | 78 +++++++++++++++++-------- 3 files changed, 77 insertions(+), 33 deletions(-) diff --git a/packages/import-handler/src/index.ts b/packages/import-handler/src/index.ts index 62a8fc0ef..799aaf1ac 100644 --- a/packages/import-handler/src/index.ts +++ b/packages/import-handler/src/index.ts @@ -223,15 +223,29 @@ const sendSavePageMutation = async (userId: string, input: unknown) => { }) const auth = (await signToken({ uid: userId }, JWT_SECRET)) as string - const response = await axios.post(`${REST_BACKEND_ENDPOINT}/graphql`, data, { - headers: { - Cookie: `auth=${auth};`, - 'Content-Type': 'application/json', - }, - }) + try { + const response = await axios.post( + `${REST_BACKEND_ENDPOINT}/graphql`, + data, + { + headers: { + Cookie: `auth=${auth};`, + 'Content-Type': 'application/json', + }, + timeout: 30000, // 30s + } + ) - /* eslint-disable @typescript-eslint/no-unsafe-member-access */ - return !!response.data.data.savePage + /* eslint-disable @typescript-eslint/no-unsafe-member-access */ + return !!response.data.data.savePage + } catch (error) { + if (axios.isAxiosError(error)) { + console.error('save page mutation error', error.message) + } else { + console.error(error) + } + return false + } } const contentHandler = async ( diff --git a/packages/puppeteer-parse/index.js b/packages/puppeteer-parse/index.js index 14bbfde27..fc5621d97 100644 --- a/packages/puppeteer-parse/index.js +++ b/packages/puppeteer-parse/index.js @@ -46,7 +46,7 @@ const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/pl const IMPORTER_METRICS_COLLECTOR_URL = process.env.IMPORTER_METRICS_COLLECTOR_URL; -const REQUEST_TIMEOUT = 30000; +const REQUEST_TIMEOUT = 30000; // 30 seconds const userAgentForUrl = (url) => { try { diff --git a/packages/thumbnail-handler/src/index.ts b/packages/thumbnail-handler/src/index.ts index 9cd79d7f6..da246b766 100644 --- a/packages/thumbnail-handler/src/index.ts +++ b/packages/thumbnail-handler/src/index.ts @@ -40,8 +40,12 @@ Sentry.GCPFunction.init({ }) const signToken = promisify(jwt.sign) +const REQUEST_TIMEOUT = 30000 // 30s -const articleQuery = async (userId: string, slug: string): Promise => { +const articleQuery = async ( + userId: string, + slug: string +): Promise => { const JWT_SECRET = process.env.JWT_SECRET const REST_BACKEND_ENDPOINT = process.env.REST_BACKEND_ENDPOINT @@ -71,18 +75,28 @@ const articleQuery = async (userId: string, slug: string): Promise => { }) const auth = (await signToken({ uid: userId }, JWT_SECRET)) as string - const response = await axios.post( - `${REST_BACKEND_ENDPOINT}/graphql`, - data, - { - headers: { - Cookie: `auth=${auth};`, - 'Content-Type': 'application/json', - }, - } - ) + try { + const response = await axios.post( + `${REST_BACKEND_ENDPOINT}/graphql`, + data, + { + headers: { + Cookie: `auth=${auth};`, + 'Content-Type': 'application/json', + }, + timeout: REQUEST_TIMEOUT, + } + ) - return response.data.data.article.article + return response.data.data.article.article + } catch (error) { + if (axios.isAxiosError(error)) { + console.error('article query error', error.message) + } else { + console.error(error) + } + return null + } } const updatePageMutation = async ( @@ -119,18 +133,28 @@ const updatePageMutation = async ( }) const auth = (await signToken({ uid: userId }, JWT_SECRET)) as string - const response = await axios.post( - `${REST_BACKEND_ENDPOINT}/graphql`, - data, - { - headers: { - Cookie: `auth=${auth};`, - 'Content-Type': 'application/json', - }, - } - ) + try { + const response = await axios.post( + `${REST_BACKEND_ENDPOINT}/graphql`, + data, + { + headers: { + Cookie: `auth=${auth};`, + 'Content-Type': 'application/json', + }, + timeout: REQUEST_TIMEOUT, + } + ) - return !!response.data.data.updatePage + return !!response.data.data.updatePage + } catch (error) { + if (axios.isAxiosError(error)) { + console.error('update page mutation error', error.message) + } else { + console.error(error) + } + return false + } } const isThumbnailRequest = (body: any): body is ThumbnailRequest => { @@ -142,6 +166,8 @@ const getImageSize = async (url: string): Promise<[number, number] | null> => { // get image file by url const response = await axios.get(url, { responseType: 'arraybuffer', + timeout: 5000, // 5s + maxContentLength: 10000000, // 10mb }) // eslint-disable-next-line @typescript-eslint/no-unsafe-argument @@ -265,7 +291,11 @@ export const thumbnailHandler = Sentry.GCPFunction.wrapHttpFunction( } const page = await articleQuery(uid, slug) - console.debug('find page', page.id) + if (!page) { + console.info('page not found') + return res.status(200).send('NOT_FOUND') + } + // update page with thumbnail if not already set if (page.image) { console.debug('thumbnail already set')