separate content-fetch in puppeteer packages from saving page content

This commit is contained in:
Hongbo Wu
2024-01-12 14:05:01 +08:00
parent c297a8dd4e
commit 51e586ed3d
7 changed files with 461 additions and 446 deletions

View File

@ -0,0 +1,205 @@
const axios = require('axios');
const jwt = require('jsonwebtoken');
const { promisify } = require('util');
// Promise-returning wrapper around the callback-style `jwt.sign`, so tokens can be awaited.
const signToken = promisify(jwt.sign);
// Endpoint that `sendImportStatusUpdate` POSTs `{ taskId, status }` to; read once at module load.
const IMPORTER_METRICS_COLLECTOR_URL = process.env.IMPORTER_METRICS_COLLECTOR_URL;
// Passed as the axios `timeout` option on every request made by this module.
const REQUEST_TIMEOUT = 30000; // 30 seconds
exports.uploadToSignedUrl = async ({ id, uploadSignedUrl }, contentType, contentObjUrl) => {
try {
const stream = await axios.get(contentObjUrl, { responseType: 'stream', timeout: REQUEST_TIMEOUT });
return axios.put(uploadSignedUrl, stream.data, {
headers: {
'Content-Type': contentType,
},
maxBodyLength: 1000000000,
maxContentLength: 100000000,
timeout: REQUEST_TIMEOUT,
});
} catch (error) {
console.error('error uploading to signed url', error.message);
return null;
}
};
exports.getUploadIdAndSignedUrl = async (userId, url, articleSavingRequestId) => {
const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
const data = JSON.stringify({
query: `mutation UploadFileRequest($input: UploadFileRequestInput!) {
uploadFileRequest(input:$input) {
... on UploadFileRequestError {
errorCodes
}
... on UploadFileRequestSuccess {
id
uploadSignedUrl
}
}
}`,
variables: {
input: {
url,
contentType: 'application/pdf',
clientRequestId: articleSavingRequestId,
}
}
});
try {
const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data,
{
headers: {
Cookie: `auth=${auth};`,
'Content-Type': 'application/json',
},
timeout: REQUEST_TIMEOUT,
});
if (response.data.data.uploadFileRequest.errorCodes && response.data.data.uploadFileRequest.errorCodes.length > 0) {
console.error('Error while getting upload id and signed url', response.data.data.uploadFileRequest.errorCodes[0]);
return null;
}
return response.data.data.uploadFileRequest;
} catch (e) {
console.error('error getting upload id and signed url', e.message);
return null;
}
};
exports.uploadPdf = async (url, userId, articleSavingRequestId) => {
validateUrlString(url);
const uploadResult = await getUploadIdAndSignedUrl(userId, url, articleSavingRequestId);
if (!uploadResult) {
throw new Error('error while getting upload id and signed url');
}
const uploaded = await uploadToSignedUrl(uploadResult, 'application/pdf', url);
if (!uploaded) {
throw new Error('error while uploading pdf');
}
return uploadResult.id;
};
exports.sendCreateArticleMutation = async (userId, input) => {
const data = JSON.stringify({
query: `mutation CreateArticle ($input: CreateArticleInput!){
createArticle(input:$input){
... on CreateArticleSuccess{
createdArticle{
id
}
}
... on CreateArticleError{
errorCodes
}
}
}`,
variables: {
input,
},
});
const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
try {
const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data,
{
headers: {
Cookie: `auth=${auth};`,
'Content-Type': 'application/json',
},
timeout: REQUEST_TIMEOUT,
});
if (response.data.data.createArticle.errorCodes && response.data.data.createArticle.errorCodes.length > 0) {
console.error('error while creating article', response.data.data.createArticle.errorCodes[0]);
return null;
}
return response.data.data.createArticle;
} catch (error) {
console.error('error creating article', error.message);
return null;
}
};
exports.sendSavePageMutation = async (userId, input) => {
const data = JSON.stringify({
query: `mutation SavePage ($input: SavePageInput!){
savePage(input:$input){
... on SaveSuccess{
url
clientRequestId
}
... on SaveError{
errorCodes
}
}
}`,
variables: {
input,
},
});
const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
try {
const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data,
{
headers: {
Cookie: `auth=${auth};`,
'Content-Type': 'application/json',
},
timeout: REQUEST_TIMEOUT,
});
if (response.data.data.savePage.errorCodes && response.data.data.savePage.errorCodes.length > 0) {
console.error('error while saving page', response.data.data.savePage.errorCodes[0]);
if (response.data.data.savePage.errorCodes[0] === 'UNAUTHORIZED') {
return { error: 'UNAUTHORIZED' };
}
return null;
}
return response.data.data.savePage;
} catch (error) {
console.error('error saving page', error.message);
return null;
}
};
exports.saveUploadedPdf = async (userId, url, uploadFileId, articleSavingRequestId) => {
return sendCreateArticleMutation(userId, {
url: encodeURI(url),
articleSavingRequestId,
uploadFileId: uploadFileId,
state,
labels,
source,
folder,
},
);
};
exports.sendImportStatusUpdate = async (userId, taskId, status) => {
try {
const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
await axios.post(
IMPORTER_METRICS_COLLECTOR_URL,
{
taskId,
status,
},
{
headers: {
'Authorization': auth,
'Content-Type': 'application/json',
},
timeout: REQUEST_TIMEOUT,
});
} catch (e) {
console.error('error while sending import status update', e);
}
};

View File

@ -1,8 +1,8 @@
require('dotenv').config(); require('dotenv').config();
const express = require('express'); const express = require('express');
const { contentFetchRequestHandler } = require('./request_handler');
const app = express(); const app = express();
const { fetchContent } = require("@omnivore/puppeteer-parse");
app.use(express.json()); app.use(express.json());
app.use(express.urlencoded({ extended: true })); app.use(express.urlencoded({ extended: true }));
@ -11,22 +11,19 @@ if (!process.env.VERIFICATION_TOKEN) {
throw new Error('VERIFICATION_TOKEN environment variable is not set'); throw new Error('VERIFICATION_TOKEN environment variable is not set');
} }
app.get('/', async (req, res) => {
if (req.query.token !== process.env.VERIFICATION_TOKEN) {
console.log('query does not include valid token')
res.sendStatus(403)
return
}
await fetchContent(req, res)
});
app.post('/', async (req, res) => { app.all('/', async (req, res) => {
if (req.query.token !== process.env.VERIFICATION_TOKEN) { if (req.method !== 'GET' && req.method !== 'POST') {
console.log('query does not include valid token') console.error('request method is not GET or POST')
res.sendStatus(403) return res.sendStatus(405)
return
} }
await fetchContent(req, res)
if (req.query.token !== process.env.VERIFICATION_TOKEN) {
console.error('query does not include valid token')
return res.sendStatus(403)
}
return contentFetchRequestHandler(req, res);
}); });
const PORT = parseInt(process.env.PORT) || 8080; const PORT = parseInt(process.env.PORT) || 8080;

View File

@ -5,7 +5,7 @@
/* eslint-disable @typescript-eslint/no-require-imports */ /* eslint-disable @typescript-eslint/no-require-imports */
require('dotenv').config(); require('dotenv').config();
const Sentry = require('@sentry/serverless'); const Sentry = require('@sentry/serverless');
const { fetchContent, preview } = require("@omnivore/puppeteer-parse"); const { contentFetchRequestHandler } = require('./request_handler');
Sentry.GCPFunction.init({ Sentry.GCPFunction.init({
dsn: process.env.SENTRY_DSN, dsn: process.env.SENTRY_DSN,
@ -19,7 +19,7 @@ Sentry.GCPFunction.init({
* @param {Object} req Cloud Function request context. * @param {Object} req Cloud Function request context.
* @param {Object} res Cloud Function response context. * @param {Object} res Cloud Function response context.
*/ */
exports.puppeteer = Sentry.GCPFunction.wrapHttpFunction(fetchContent); exports.puppeteer = Sentry.GCPFunction.wrapHttpFunction(contentFetchRequestHandler);
/** /**
* Cloud Function entry point, HTTP trigger. * Cloud Function entry point, HTTP trigger.
@ -30,4 +30,4 @@ exports.puppeteer = Sentry.GCPFunction.wrapHttpFunction(fetchContent);
* * url - URL address of the page to open * * url - URL address of the page to open
* @param {Object} res Cloud Function response context. * @param {Object} res Cloud Function response context.
*/ */
exports.preview = Sentry.GCPFunction.wrapHttpFunction(preview); // exports.preview = Sentry.GCPFunction.wrapHttpFunction(preview);

View File

@ -4,8 +4,10 @@
"description": "Service that fetches page content from a URL", "description": "Service that fetches page content from a URL",
"main": "index.js", "main": "index.js",
"dependencies": { "dependencies": {
"axios": "^0.27.2",
"dotenv": "^8.2.0", "dotenv": "^8.2.0",
"express": "^4.17.1", "express": "^4.17.1",
"jsonwebtoken": "^8.5.1",
"@google-cloud/functions-framework": "^3.0.0", "@google-cloud/functions-framework": "^3.0.0",
"@omnivore/puppeteer-parse": "^1.0.0", "@omnivore/puppeteer-parse": "^1.0.0",
"@sentry/serverless": "^7.77.0", "@sentry/serverless": "^7.77.0",
@ -18,7 +20,6 @@
"scripts": { "scripts": {
"start": "node app.js", "start": "node app.js",
"start_gcf": "npx functions-framework --port=9090 --target=puppeteer", "start_gcf": "npx functions-framework --port=9090 --target=puppeteer",
"start_preview": "npx functions-framework --target=preview",
"test": "mocha test/*.js" "test": "mocha test/*.js"
}, },
"volta": { "volta": {

View File

@ -0,0 +1,114 @@
const { fetchContent } = require("@omnivore/puppeteer-parse");
const { uploadPdf, sendSavePageMutation, sendCreateArticleMutation, sendImportStatusUpdate } = require('./api');
// Retry count at which a still-unresolved import is reported as 'failed'.
// Kept as a string (env value or the '1' fallback) because it is compared with
// `===` against the raw `x-cloudtasks-taskretrycount` request header.
const MAX_RETRY_COUNT = process.env.MAX_RETRY_COUNT || '1';
exports.contentFetchRequestHandler = async (req, res) => {
let functionStartTime = Date.now();
const userId = (req.query ? req.query.userId : undefined) || (req.body ? req.body.userId : undefined);
const articleSavingRequestId = (req.query ? req.query.saveRequestId : undefined) || (req.body ? req.body.saveRequestId : undefined);
const state = req.body.state
const labels = req.body.labels
const source = req.body.source || 'puppeteer-parse';
const taskId = req.body.taskId; // taskId is used to update import status
const url = (req.query ? req.query.url : undefined) || (req.body ? req.body.url : undefined);
const locale = (req.query ? req.query.locale : undefined) || (req.body ? req.body.locale : undefined);
const timezone = (req.query ? req.query.timezone : undefined) || (req.body ? req.body.timezone : undefined);
const rssFeedUrl = req.body.rssFeedUrl;
const savedAt = req.body.savedAt;
const publishedAt = req.body.publishedAt;
const folder = req.body.folder;
const users = req.body ? req.body.users : undefined; // users is used when saving article for multiple users
let logRecord = {
url,
userId,
articleSavingRequestId,
labels: {
source,
},
state,
labelsToAdd: labels,
taskId: taskId,
locale,
timezone,
rssFeedUrl,
savedAt,
publishedAt,
folder,
users,
};
console.log(`Article parsing request`, logRecord);
let importStatus, statusCode = 200;
try {
const { finalUrl, title, content, readabilityResult, contentType } = await fetchContent(url, locale, timezone);
if (contentType === 'application/pdf') {
const uploadFileId = await uploadPdf(finalUrl, userId, articleSavingRequestId);
const uploadedPdf = await sendCreateArticleMutation(userId, {
url: encodeURI(finalUrl),
articleSavingRequestId,
uploadFileId,
state,
labels,
source,
folder,
rssFeedUrl,
savedAt,
publishedAt,
});
if (!uploadedPdf) {
statusCode = 500;
logRecord.error = 'error while saving uploaded pdf';
} else {
importStatus = 'imported';
}
} else {
const apiResponse = await sendSavePageMutation(userId, {
url,
clientRequestId: articleSavingRequestId,
title,
originalContent: content,
parseResult: readabilityResult,
state,
labels,
rssFeedUrl,
savedAt,
publishedAt,
source,
folder,
});
if (!apiResponse) {
logRecord.error = 'error while saving page';
statusCode = 500;
} else if (apiResponse.error === 'UNAUTHORIZED') {
console.log('user is deleted, do not retry', logRecord);
return res.sendStatus(200);
} else {
importStatus = readabilityResult ? 'imported' : 'failed';
}
}
} catch (error) {
logRecord.error = error.message;
} finally {
logRecord.totalTime = Date.now() - functionStartTime;
console.log(`parse-page result`, logRecord);
// mark import failed on the last failed retry
const retryCount = req.headers['x-cloudtasks-taskretrycount'];
if (retryCount === MAX_RETRY_COUNT) {
console.log('max retry count reached');
importStatus = importStatus || 'failed';
}
// send import status to update the metrics
if (taskId && importStatus) {
await sendImportStatusUpdate(userId, taskId, importStatus);
}
res.sendStatus(statusCode);
}
}

View File

@ -7,12 +7,8 @@ const { encode } = require("urlsafe-base64");
const crypto = require("crypto"); const crypto = require("crypto");
const Url = require('url'); const Url = require('url');
const axios = require('axios');
const jwt = require('jsonwebtoken');
const { promisify } = require('util');
const signToken = promisify(jwt.sign);
const os = require('os'); const os = require('os');
const { Storage } = require('@google-cloud/storage'); // const { Storage } = require('@google-cloud/storage');
const { parseHTML } = require('linkedom'); const { parseHTML } = require('linkedom');
const { preHandleContent, preParseContent } = require("@omnivore/content-handler"); const { preHandleContent, preParseContent } = require("@omnivore/content-handler");
const { Readability } = require("@omnivore/readability"); const { Readability } = require("@omnivore/readability");
@ -29,9 +25,9 @@ puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
const createDOMPurify = require("dompurify"); const createDOMPurify = require("dompurify");
const storage = new Storage(); // const storage = new Storage();
const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : []; const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : [];
const previewBucket = process.env.PREVIEW_IMAGE_BUCKET ? storage.bucket(process.env.PREVIEW_IMAGE_BUCKET) : undefined; // const previewBucket = process.env.PREVIEW_IMAGE_BUCKET ? storage.bucket(process.env.PREVIEW_IMAGE_BUCKET) : undefined;
const filePath = `${os.tmpdir()}/previewImage.png`; const filePath = `${os.tmpdir()}/previewImage.png`;
@ -44,11 +40,6 @@ const NON_SCRIPT_HOSTS= ['medium.com', 'fastcompany.com', 'fortelabs.com'];
const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/plain', 'application/pdf']; const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/plain', 'application/pdf'];
const IMPORTER_METRICS_COLLECTOR_URL = process.env.IMPORTER_METRICS_COLLECTOR_URL;
const REQUEST_TIMEOUT = 30000; // 30 seconds
const MAX_RETRY_COUNT = process.env.MAX_RETRY_COUNT || '1';
const userAgentForUrl = (url) => { const userAgentForUrl = (url) => {
try { try {
const u = new URL(url); const u = new URL(url);
@ -140,249 +131,21 @@ const getBrowserPromise = (async () => {
}); });
})(); })();
const uploadToSignedUrl = async ({ id, uploadSignedUrl }, contentType, contentObjUrl) => { async function fetchContent(url, locale, timezone) {
try {
const stream = await axios.get(contentObjUrl, { responseType: 'stream', timeout: REQUEST_TIMEOUT });
return axios.put(uploadSignedUrl, stream.data, {
headers: {
'Content-Type': contentType,
},
maxBodyLength: 1000000000,
maxContentLength: 100000000,
timeout: REQUEST_TIMEOUT,
});
} catch (error) {
console.error('error uploading to signed url', error.message);
return null;
}
};
const getUploadIdAndSignedUrl = async (userId, url, articleSavingRequestId) => {
const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
const data = JSON.stringify({
query: `mutation UploadFileRequest($input: UploadFileRequestInput!) {
uploadFileRequest(input:$input) {
... on UploadFileRequestError {
errorCodes
}
... on UploadFileRequestSuccess {
id
uploadSignedUrl
}
}
}`,
variables: {
input: {
url,
contentType: 'application/pdf',
clientRequestId: articleSavingRequestId,
}
}
});
try {
const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data,
{
headers: {
Cookie: `auth=${auth};`,
'Content-Type': 'application/json',
},
timeout: REQUEST_TIMEOUT,
});
if (response.data.data.uploadFileRequest.errorCodes && response.data.data.uploadFileRequest.errorCodes.length > 0) {
console.error('Error while getting upload id and signed url', response.data.data.uploadFileRequest.errorCodes[0]);
return null;
}
return response.data.data.uploadFileRequest;
} catch (e) {
console.error('error getting upload id and signed url', e.message);
return null;
}
};
const uploadPdf = async (url, userId, articleSavingRequestId) => {
validateUrlString(url);
const uploadResult = await getUploadIdAndSignedUrl(userId, url, articleSavingRequestId);
if (!uploadResult) {
throw new Error('error while getting upload id and signed url');
}
const uploaded = await uploadToSignedUrl(uploadResult, 'application/pdf', url);
if (!uploaded) {
throw new Error('error while uploading pdf');
}
return uploadResult.id;
};
const sendCreateArticleMutation = async (userId, input) => {
const data = JSON.stringify({
query: `mutation CreateArticle ($input: CreateArticleInput!){
createArticle(input:$input){
... on CreateArticleSuccess{
createdArticle{
id
}
}
... on CreateArticleError{
errorCodes
}
}
}`,
variables: {
input,
},
});
const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
try {
const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data,
{
headers: {
Cookie: `auth=${auth};`,
'Content-Type': 'application/json',
},
timeout: REQUEST_TIMEOUT,
});
if (response.data.data.createArticle.errorCodes && response.data.data.createArticle.errorCodes.length > 0) {
console.error('error while creating article', response.data.data.createArticle.errorCodes[0]);
return null;
}
return response.data.data.createArticle;
} catch (error) {
console.error('error creating article', error.message);
return null;
}
};
const sendSavePageMutation = async (userId, input) => {
const data = JSON.stringify({
query: `mutation SavePage ($input: SavePageInput!){
savePage(input:$input){
... on SaveSuccess{
url
clientRequestId
}
... on SaveError{
errorCodes
}
}
}`,
variables: {
input,
},
});
const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
try {
const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data,
{
headers: {
Cookie: `auth=${auth};`,
'Content-Type': 'application/json',
},
timeout: REQUEST_TIMEOUT,
});
if (response.data.data.savePage.errorCodes && response.data.data.savePage.errorCodes.length > 0) {
console.error('error while saving page', response.data.data.savePage.errorCodes[0]);
if (response.data.data.savePage.errorCodes[0] === 'UNAUTHORIZED') {
return { error: 'UNAUTHORIZED' };
}
return null;
}
return response.data.data.savePage;
} catch (error) {
console.error('error saving page', error.message);
return null;
}
};
const saveUploadedPdf = async (userId, url, uploadFileId, articleSavingRequestId) => {
return sendCreateArticleMutation(userId, {
url: encodeURI(url),
articleSavingRequestId,
uploadFileId: uploadFileId,
state,
labels,
source,
folder,
},
);
};
const sendImportStatusUpdate = async (userId, taskId, status) => {
try {
const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
await axios.post(
IMPORTER_METRICS_COLLECTOR_URL,
{
taskId,
status,
},
{
headers: {
'Authorization': auth,
'Content-Type': 'application/json',
},
timeout: REQUEST_TIMEOUT,
});
} catch (e) {
console.error('error while sending import status update', e);
}
};
async function fetchContent(req, res) {
let functionStartTime = Date.now(); let functionStartTime = Date.now();
const userId = (req.query ? req.query.userId : undefined) || (req.body ? req.body.userId : undefined);
const articleSavingRequestId = (req.query ? req.query.saveRequestId : undefined) || (req.body ? req.body.saveRequestId : undefined);
const state = req.body.state
const labels = req.body.labels
const source = req.body.source || 'puppeteer-parse';
const taskId = req.body.taskId; // taskId is used to update import status
const urlStr = (req.query ? req.query.url : undefined) || (req.body ? req.body.url : undefined);
const locale = (req.query ? req.query.locale : undefined) || (req.body ? req.body.locale : undefined);
const timezone = (req.query ? req.query.timezone : undefined) || (req.body ? req.body.timezone : undefined);
const rssFeedUrl = req.body.rssFeedUrl;
const savedAt = req.body.savedAt;
const publishedAt = req.body.publishedAt;
const folder = req.body.folder;
let logRecord = { let logRecord = {
url: urlStr, url,
userId, functionStartTime,
articleSavingRequestId,
labels: {
source,
},
state,
labelsToAdd: labels,
taskId: taskId,
locale, locale,
timezone, timezone,
rssFeedUrl, }
savedAt, console.log(`content-fetch request`, logRecord);
publishedAt,
folder,
};
console.info(`Article parsing request`, logRecord); let context, page, finalUrl, title, content, contentType, readabilityResult = null;
let url, context, page, finalUrl, title, content, contentType, importStatus, statusCode = 200;
try { try {
url = getUrl(urlStr); url = getUrl(url);
if (!url) { if (!url) {
logRecord.urlIsInvalid = true; throw new Error('Valid URL to parse not specified');
logRecord.error = 'Valid URL to parse not specified';
statusCode = 400;
return;
} }
// pre handle url with custom handlers // pre handle url with custom handlers
@ -410,27 +173,7 @@ async function fetchContent(req, res) {
finalUrl = url finalUrl = url
} }
if (contentType === 'application/pdf') { if (contentType !== 'application/pdf') {
const uploadFileId = await uploadPdf(finalUrl, userId, articleSavingRequestId);
const uploadedPdf = await sendCreateArticleMutation(userId, {
url: encodeURI(finalUrl),
articleSavingRequestId,
uploadFileId,
state,
labels,
source,
folder,
rssFeedUrl,
savedAt,
publishedAt,
});
if (!uploadedPdf) {
statusCode = 500;
logRecord.error = 'error while saving uploaded pdf';
} else {
importStatus = 'imported';
}
} else {
if (!content || !title) { if (!content || !title) {
const result = await retrieveHtml(page, logRecord); const result = await retrieveHtml(page, logRecord);
if (result.isBlocked) { if (result.isBlocked) {
@ -444,12 +187,9 @@ async function fetchContent(req, res) {
} else { } else {
console.info('using prefetched content and title'); console.info('using prefetched content and title');
} }
logRecord.fetchContentTime = Date.now() - functionStartTime;
} }
} catch (e) { } catch (e) {
logRecord.error = e.message; console.error(`Error while retrieving page ${url}`, e);
console.error(`Error while retrieving page`, logRecord);
statusCode = 500;
// fallback to scrapingbee for non pdf content // fallback to scrapingbee for non pdf content
if (url && contentType !== 'application/pdf') { if (url && contentType !== 'application/pdf') {
@ -459,8 +199,8 @@ async function fetchContent(req, res) {
const sbResult = await fetchContentWithScrapingBee(url); const sbResult = await fetchContentWithScrapingBee(url);
content = sbResult.domContent; content = sbResult.domContent;
title = sbResult.title; title = sbResult.title;
logRecord.fetchContentTime = Date.now() - fetchStartTime; } else {
statusCode = 200; throw e;
} }
} finally { } finally {
// close browser context if it was opened // close browser context if it was opened
@ -470,7 +210,6 @@ async function fetchContent(req, res) {
// save non pdf content // save non pdf content
if (url && contentType !== 'application/pdf') { if (url && contentType !== 'application/pdf') {
// parse content if it is not empty // parse content if it is not empty
let readabilityResult = null;
if (content) { if (content) {
let document = parseHTML(content).document; let document = parseHTML(content).document;
// preParse content // preParse content
@ -480,48 +219,11 @@ async function fetchContent(req, res) {
} }
readabilityResult = await getReadabilityResult(url, document); readabilityResult = await getReadabilityResult(url, document);
} }
const apiResponse = await sendSavePageMutation(userId, {
url,
clientRequestId: articleSavingRequestId,
title,
originalContent: content,
parseResult: readabilityResult,
state,
labels,
rssFeedUrl,
savedAt,
publishedAt,
source,
folder,
});
if (!apiResponse) {
logRecord.error = 'error while saving page';
statusCode = 500;
} else if (apiResponse.error === 'UNAUTHORIZED') {
console.info('user is deleted, do not retry', logRecord);
return res.sendStatus(200);
} else {
importStatus = readabilityResult ? 'imported' : 'failed';
}
} }
logRecord.totalTime = Date.now() - functionStartTime; console.info(`content-fetch result`, logRecord);
console.info(`parse-page`, logRecord);
// mark import failed on the last failed retry return { finalUrl, title, content, readabilityResult, contentType };
const retryCount = req.headers['x-cloudtasks-taskretrycount'];
if (retryCount == MAX_RETRY_COUNT) {
console.info('max retry count reached');
importStatus = importStatus || 'failed';
}
// send import status to update the metrics
if (taskId && importStatus) {
await sendImportStatusUpdate(userId, taskId, importStatus);
}
res.sendStatus(statusCode);
} }
} }
@ -832,126 +534,126 @@ async function retrieveHtml(page, logRecord) {
return { domContent, title }; return { domContent, title };
} }
async function preview(req, res) { // async function preview(req, res) {
const functionStartTime = Date.now(); // const functionStartTime = Date.now();
// Grabbing execution and trace ids to attach logs to the appropriate function call // // Grabbing execution and trace ids to attach logs to the appropriate function call
const execution_id = req.get('function-execution-id'); // const execution_id = req.get('function-execution-id');
const traceId = (req.get('x-cloud-trace-context') || '').split('/')[0]; // const traceId = (req.get('x-cloud-trace-context') || '').split('/')[0];
const console = buildconsole('cloudfunctions.googleapis.com%2Fcloud-functions', { // const console = buildconsole('cloudfunctions.googleapis.com%2Fcloud-functions', {
trace: `projects/${process.env.GCLOUD_PROJECT}/traces/${traceId}`, // trace: `projects/${process.env.GCLOUD_PROJECT}/traces/${traceId}`,
labels: { // labels: {
execution_id: execution_id, // execution_id: execution_id,
}, // },
}); // });
if (!process.env.PREVIEW_IMAGE_BUCKET) { // if (!process.env.PREVIEW_IMAGE_BUCKET) {
console.error(`PREVIEW_IMAGE_BUCKET not set`) // console.error(`PREVIEW_IMAGE_BUCKET not set`)
return res.sendStatus(500); // return res.sendStatus(500);
} // }
const urlStr = (req.query ? req.query.url : undefined) || (req.body ? req.body.url : undefined); // const urlStr = (req.query ? req.query.url : undefined) || (req.body ? req.body.url : undefined);
const url = getUrl(urlStr); // const url = getUrl(urlStr);
console.log('preview request url', url); // console.log('preview request url', url);
const logRecord = { // const logRecord = {
url, // url,
query: req.query, // query: req.query,
origin: req.get('Origin'), // origin: req.get('Origin'),
labels: { // labels: {
source: 'publicImagePreview', // source: 'publicImagePreview',
}, // },
}; // };
console.info(`Public preview image generation request`, logRecord); // console.info(`Public preview image generation request`, logRecord);
if (!url) { // if (!url) {
logRecord.urlIsInvalid = true; // logRecord.urlIsInvalid = true;
console.error(`Valid URL to parse is not specified`, logRecord); // console.error(`Valid URL to parse is not specified`, logRecord);
return res.sendStatus(400); // return res.sendStatus(400);
} // }
const { origin } = new URL(url); // const { origin } = new URL(url);
if (!ALLOWED_ORIGINS.some(o => o === origin)) { // if (!ALLOWED_ORIGINS.some(o => o === origin)) {
logRecord.forbiddenOrigin = true; // logRecord.forbiddenOrigin = true;
console.error(`This origin is not allowed: ${origin}`, logRecord); // console.error(`This origin is not allowed: ${origin}`, logRecord);
return res.sendStatus(400); // return res.sendStatus(400);
} // }
const browser = await getBrowserPromise; // const browser = await getBrowserPromise;
logRecord.timing = { ...logRecord.timing, browserOpened: Date.now() - functionStartTime }; // logRecord.timing = { ...logRecord.timing, browserOpened: Date.now() - functionStartTime };
const page = await browser.newPage(); // const page = await browser.newPage();
const pageLoadingStart = Date.now(); // const pageLoadingStart = Date.now();
const modifiedUrl = new URL(url); // const modifiedUrl = new URL(url);
modifiedUrl.searchParams.append('fontSize', '24'); // modifiedUrl.searchParams.append('fontSize', '24');
modifiedUrl.searchParams.append('adjustAspectRatio', '1.91'); // modifiedUrl.searchParams.append('adjustAspectRatio', '1.91');
try { // try {
await page.goto(modifiedUrl.toString()); // await page.goto(modifiedUrl.toString());
logRecord.timing = { ...logRecord.timing, pageLoaded: Date.now() - pageLoadingStart }; // logRecord.timing = { ...logRecord.timing, pageLoaded: Date.now() - pageLoadingStart };
} catch (error) { // } catch (error) {
console.log('error going to page: ', modifiedUrl) // console.log('error going to page: ', modifiedUrl)
console.log(error) // console.log(error)
throw error // throw error
} // }
// We lookup the destination path from our own page content and avoid trusting any passed query params // // We lookup the destination path from our own page content and avoid trusting any passed query params
// selector - CSS selector of the element to get screenshot of // // selector - CSS selector of the element to get screenshot of
const selector = decodeURIComponent( // const selector = decodeURIComponent(
await page.$eval( // await page.$eval(
"head > meta[name='omnivore:preview_image_selector']", // "head > meta[name='omnivore:preview_image_selector']",
element => element.content, // element => element.content,
), // ),
); // );
if (!selector) { // if (!selector) {
logRecord.selectorIsInvalid = true; // logRecord.selectorIsInvalid = true;
console.error(`Valid element selector is not specified`, logRecord); // console.error(`Valid element selector is not specified`, logRecord);
await page.close(); // await page.close();
return res.sendStatus(400); // return res.sendStatus(400);
} // }
logRecord.selector = selector; // logRecord.selector = selector;
// destination - destination pathname for the image to save with // // destination - destination pathname for the image to save with
const destination = decodeURIComponent( // const destination = decodeURIComponent(
await page.$eval( // await page.$eval(
"head > meta[name='omnivore:preview_image_destination']", // "head > meta[name='omnivore:preview_image_destination']",
element => element.content, // element => element.content,
), // ),
); // );
if (!destination) { // if (!destination) {
logRecord.destinationIsInvalid = true; // logRecord.destinationIsInvalid = true;
console.error(`Valid file destination is not specified`, logRecord); // console.error(`Valid file destination is not specified`, logRecord);
await page.close(); // await page.close();
return res.sendStatus(400); // return res.sendStatus(400);
} // }
logRecord.destination = destination; // logRecord.destination = destination;
const screenshotTakingStart = Date.now(); // const screenshotTakingStart = Date.now();
try { // try {
await page.waitForSelector(selector, { timeout: 3000 }); // wait for the selector to load // await page.waitForSelector(selector, { timeout: 3000 }); // wait for the selector to load
} catch (error) { // } catch (error) {
logRecord.elementNotFound = true; // logRecord.elementNotFound = true;
console.error(`Element is not presented on the page`, logRecord); // console.error(`Element is not presented on the page`, logRecord);
await page.close(); // await page.close();
return res.sendStatus(400); // return res.sendStatus(400);
} // }
const element = await page.$(selector); // const element = await page.$(selector);
await element.screenshot({ path: filePath }); // take screenshot of the element in puppeteer // await element.screenshot({ path: filePath }); // take screenshot of the element in puppeteer
logRecord.timing = { ...logRecord.timing, screenshotTaken: Date.now() - screenshotTakingStart }; // logRecord.timing = { ...logRecord.timing, screenshotTaken: Date.now() - screenshotTakingStart };
await page.close(); // await page.close();
try { // try {
const [file] = await previewBucket.upload(filePath, { // const [file] = await previewBucket.upload(filePath, {
destination, // destination,
metadata: logRecord, // metadata: logRecord,
}); // });
logRecord.file = file.metadata; // logRecord.file = file.metadata;
} catch (e) { // } catch (e) {
console.log('error uploading to bucket, this is non-fatal', e) // console.log('error uploading to bucket, this is non-fatal', e)
} // }
console.info(`preview-image`, logRecord); // console.info(`preview-image`, logRecord);
return res.redirect(`${process.env.PREVIEW_IMAGE_CDN_ORIGIN}/${destination}`); // return res.redirect(`${process.env.PREVIEW_IMAGE_CDN_ORIGIN}/${destination}`);
} // }
const DOM_PURIFY_CONFIG = { const DOM_PURIFY_CONFIG = {
ADD_TAGS: ['iframe'], ADD_TAGS: ['iframe'],
@ -1048,6 +750,6 @@ async function getReadabilityResult(url, document) {
module.exports = { module.exports = {
fetchContent, fetchContent,
preview, // preview,
}; };

View File

@ -4,19 +4,15 @@
"description": "Accepts URL of the article and parses its content", "description": "Accepts URL of the article and parses its content",
"main": "index.js", "main": "index.js",
"dependencies": { "dependencies": {
"@google-cloud/storage": "^7.0.1",
"@omnivore/content-handler": "1.0.0", "@omnivore/content-handler": "1.0.0",
"@omnivore/readability": "1.0.0", "@omnivore/readability": "1.0.0",
"axios": "^0.27.2",
"crypto": "^1.0.1", "crypto": "^1.0.1",
"dompurify": "^2.4.1", "dompurify": "^2.4.1",
"jsonwebtoken": "^8.5.1",
"linkedom": "^0.14.9", "linkedom": "^0.14.9",
"puppeteer-core": "^20.9.0", "puppeteer-core": "^20.9.0",
"puppeteer-extra": "^3.3.4", "puppeteer-extra": "^3.3.4",
"puppeteer-extra-plugin-adblocker": "^2.13.5", "puppeteer-extra-plugin-adblocker": "^2.13.5",
"puppeteer-extra-plugin-stealth": "^2.11.1", "puppeteer-extra-plugin-stealth": "^2.11.1",
"underscore": "^1.13.4",
"urlsafe-base64": "^1.0.0" "urlsafe-base64": "^1.0.0"
}, },
"devDependencies": { "devDependencies": {