separate content-fetch in puppeteer packages from saving page content
This commit is contained in:
205
packages/content-fetch/api.js
Normal file
205
packages/content-fetch/api.js
Normal file
@ -0,0 +1,205 @@
|
|||||||
|
const axios = require('axios');
|
||||||
|
const jwt = require('jsonwebtoken');
|
||||||
|
const { promisify } = require('util');
|
||||||
|
const signToken = promisify(jwt.sign);
|
||||||
|
|
||||||
|
const IMPORTER_METRICS_COLLECTOR_URL = process.env.IMPORTER_METRICS_COLLECTOR_URL;
|
||||||
|
const REQUEST_TIMEOUT = 30000; // 30 seconds
|
||||||
|
|
||||||
|
exports.uploadToSignedUrl = async ({ id, uploadSignedUrl }, contentType, contentObjUrl) => {
|
||||||
|
try {
|
||||||
|
const stream = await axios.get(contentObjUrl, { responseType: 'stream', timeout: REQUEST_TIMEOUT });
|
||||||
|
return axios.put(uploadSignedUrl, stream.data, {
|
||||||
|
headers: {
|
||||||
|
'Content-Type': contentType,
|
||||||
|
},
|
||||||
|
maxBodyLength: 1000000000,
|
||||||
|
maxContentLength: 100000000,
|
||||||
|
timeout: REQUEST_TIMEOUT,
|
||||||
|
});
|
||||||
|
} catch (error) {
|
||||||
|
console.error('error uploading to signed url', error.message);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
exports.getUploadIdAndSignedUrl = async (userId, url, articleSavingRequestId) => {
|
||||||
|
const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
|
||||||
|
const data = JSON.stringify({
|
||||||
|
query: `mutation UploadFileRequest($input: UploadFileRequestInput!) {
|
||||||
|
uploadFileRequest(input:$input) {
|
||||||
|
... on UploadFileRequestError {
|
||||||
|
errorCodes
|
||||||
|
}
|
||||||
|
... on UploadFileRequestSuccess {
|
||||||
|
id
|
||||||
|
uploadSignedUrl
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}`,
|
||||||
|
variables: {
|
||||||
|
input: {
|
||||||
|
url,
|
||||||
|
contentType: 'application/pdf',
|
||||||
|
clientRequestId: articleSavingRequestId,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
try {
|
||||||
|
const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data,
|
||||||
|
{
|
||||||
|
headers: {
|
||||||
|
Cookie: `auth=${auth};`,
|
||||||
|
'Content-Type': 'application/json',
|
||||||
|
},
|
||||||
|
timeout: REQUEST_TIMEOUT,
|
||||||
|
});
|
||||||
|
|
||||||
|
if (response.data.data.uploadFileRequest.errorCodes && response.data.data.uploadFileRequest.errorCodes.length > 0) {
|
||||||
|
console.error('Error while getting upload id and signed url', response.data.data.uploadFileRequest.errorCodes[0]);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
return response.data.data.uploadFileRequest;
|
||||||
|
} catch (e) {
|
||||||
|
console.error('error getting upload id and signed url', e.message);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
exports.uploadPdf = async (url, userId, articleSavingRequestId) => {
|
||||||
|
validateUrlString(url);
|
||||||
|
|
||||||
|
const uploadResult = await getUploadIdAndSignedUrl(userId, url, articleSavingRequestId);
|
||||||
|
if (!uploadResult) {
|
||||||
|
throw new Error('error while getting upload id and signed url');
|
||||||
|
}
|
||||||
|
const uploaded = await uploadToSignedUrl(uploadResult, 'application/pdf', url);
|
||||||
|
if (!uploaded) {
|
||||||
|
throw new Error('error while uploading pdf');
|
||||||
|
}
|
||||||
|
return uploadResult.id;
|
||||||
|
};
|
||||||
|
|
||||||
|
exports.sendCreateArticleMutation = async (userId, input) => {
|
||||||
|
const data = JSON.stringify({
|
||||||
|
query: `mutation CreateArticle ($input: CreateArticleInput!){
|
||||||
|
createArticle(input:$input){
|
||||||
|
... on CreateArticleSuccess{
|
||||||
|
createdArticle{
|
||||||
|
id
|
||||||
|
}
|
||||||
|
}
|
||||||
|
... on CreateArticleError{
|
||||||
|
errorCodes
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}`,
|
||||||
|
variables: {
|
||||||
|
input,
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
|
||||||
|
try {
|
||||||
|
const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data,
|
||||||
|
{
|
||||||
|
headers: {
|
||||||
|
Cookie: `auth=${auth};`,
|
||||||
|
'Content-Type': 'application/json',
|
||||||
|
},
|
||||||
|
timeout: REQUEST_TIMEOUT,
|
||||||
|
});
|
||||||
|
|
||||||
|
if (response.data.data.createArticle.errorCodes && response.data.data.createArticle.errorCodes.length > 0) {
|
||||||
|
console.error('error while creating article', response.data.data.createArticle.errorCodes[0]);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
return response.data.data.createArticle;
|
||||||
|
} catch (error) {
|
||||||
|
console.error('error creating article', error.message);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
exports.sendSavePageMutation = async (userId, input) => {
|
||||||
|
const data = JSON.stringify({
|
||||||
|
query: `mutation SavePage ($input: SavePageInput!){
|
||||||
|
savePage(input:$input){
|
||||||
|
... on SaveSuccess{
|
||||||
|
url
|
||||||
|
clientRequestId
|
||||||
|
}
|
||||||
|
... on SaveError{
|
||||||
|
errorCodes
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}`,
|
||||||
|
variables: {
|
||||||
|
input,
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
|
||||||
|
try {
|
||||||
|
const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data,
|
||||||
|
{
|
||||||
|
headers: {
|
||||||
|
Cookie: `auth=${auth};`,
|
||||||
|
'Content-Type': 'application/json',
|
||||||
|
},
|
||||||
|
timeout: REQUEST_TIMEOUT,
|
||||||
|
});
|
||||||
|
|
||||||
|
if (response.data.data.savePage.errorCodes && response.data.data.savePage.errorCodes.length > 0) {
|
||||||
|
console.error('error while saving page', response.data.data.savePage.errorCodes[0]);
|
||||||
|
if (response.data.data.savePage.errorCodes[0] === 'UNAUTHORIZED') {
|
||||||
|
return { error: 'UNAUTHORIZED' };
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
return response.data.data.savePage;
|
||||||
|
} catch (error) {
|
||||||
|
console.error('error saving page', error.message);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
exports.saveUploadedPdf = async (userId, url, uploadFileId, articleSavingRequestId) => {
|
||||||
|
return sendCreateArticleMutation(userId, {
|
||||||
|
url: encodeURI(url),
|
||||||
|
articleSavingRequestId,
|
||||||
|
uploadFileId: uploadFileId,
|
||||||
|
state,
|
||||||
|
labels,
|
||||||
|
source,
|
||||||
|
folder,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
};
|
||||||
|
|
||||||
|
exports.sendImportStatusUpdate = async (userId, taskId, status) => {
|
||||||
|
try {
|
||||||
|
const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
|
||||||
|
|
||||||
|
await axios.post(
|
||||||
|
IMPORTER_METRICS_COLLECTOR_URL,
|
||||||
|
{
|
||||||
|
taskId,
|
||||||
|
status,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
headers: {
|
||||||
|
'Authorization': auth,
|
||||||
|
'Content-Type': 'application/json',
|
||||||
|
},
|
||||||
|
timeout: REQUEST_TIMEOUT,
|
||||||
|
});
|
||||||
|
} catch (e) {
|
||||||
|
console.error('error while sending import status update', e);
|
||||||
|
}
|
||||||
|
};
|
||||||
@ -1,8 +1,8 @@
|
|||||||
require('dotenv').config();
|
require('dotenv').config();
|
||||||
const express = require('express');
|
const express = require('express');
|
||||||
|
const { contentFetchRequestHandler } = require('./request_handler');
|
||||||
|
|
||||||
const app = express();
|
const app = express();
|
||||||
const { fetchContent } = require("@omnivore/puppeteer-parse");
|
|
||||||
|
|
||||||
app.use(express.json());
|
app.use(express.json());
|
||||||
app.use(express.urlencoded({ extended: true }));
|
app.use(express.urlencoded({ extended: true }));
|
||||||
@ -11,22 +11,19 @@ if (!process.env.VERIFICATION_TOKEN) {
|
|||||||
throw new Error('VERIFICATION_TOKEN environment variable is not set');
|
throw new Error('VERIFICATION_TOKEN environment variable is not set');
|
||||||
}
|
}
|
||||||
|
|
||||||
app.get('/', async (req, res) => {
|
|
||||||
if (req.query.token !== process.env.VERIFICATION_TOKEN) {
|
|
||||||
console.log('query does not include valid token')
|
|
||||||
res.sendStatus(403)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
await fetchContent(req, res)
|
|
||||||
});
|
|
||||||
|
|
||||||
app.post('/', async (req, res) => {
|
app.all('/', async (req, res) => {
|
||||||
if (req.query.token !== process.env.VERIFICATION_TOKEN) {
|
if (req.method !== 'GET' && req.method !== 'POST') {
|
||||||
console.log('query does not include valid token')
|
console.error('request method is not GET or POST')
|
||||||
res.sendStatus(403)
|
return res.sendStatus(405)
|
||||||
return
|
|
||||||
}
|
}
|
||||||
await fetchContent(req, res)
|
|
||||||
|
if (req.query.token !== process.env.VERIFICATION_TOKEN) {
|
||||||
|
console.error('query does not include valid token')
|
||||||
|
return res.sendStatus(403)
|
||||||
|
}
|
||||||
|
|
||||||
|
return contentFetchRequestHandler(req, res);
|
||||||
});
|
});
|
||||||
|
|
||||||
const PORT = parseInt(process.env.PORT) || 8080;
|
const PORT = parseInt(process.env.PORT) || 8080;
|
||||||
|
|||||||
@ -5,7 +5,7 @@
|
|||||||
/* eslint-disable @typescript-eslint/no-require-imports */
|
/* eslint-disable @typescript-eslint/no-require-imports */
|
||||||
require('dotenv').config();
|
require('dotenv').config();
|
||||||
const Sentry = require('@sentry/serverless');
|
const Sentry = require('@sentry/serverless');
|
||||||
const { fetchContent, preview } = require("@omnivore/puppeteer-parse");
|
const { contentFetchRequestHandler } = require('./request_handler');
|
||||||
|
|
||||||
Sentry.GCPFunction.init({
|
Sentry.GCPFunction.init({
|
||||||
dsn: process.env.SENTRY_DSN,
|
dsn: process.env.SENTRY_DSN,
|
||||||
@ -19,7 +19,7 @@ Sentry.GCPFunction.init({
|
|||||||
* @param {Object} req Cloud Function request context.
|
* @param {Object} req Cloud Function request context.
|
||||||
* @param {Object} res Cloud Function response context.
|
* @param {Object} res Cloud Function response context.
|
||||||
*/
|
*/
|
||||||
exports.puppeteer = Sentry.GCPFunction.wrapHttpFunction(fetchContent);
|
exports.puppeteer = Sentry.GCPFunction.wrapHttpFunction(contentFetchRequestHandler);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Cloud Function entry point, HTTP trigger.
|
* Cloud Function entry point, HTTP trigger.
|
||||||
@ -30,4 +30,4 @@ exports.puppeteer = Sentry.GCPFunction.wrapHttpFunction(fetchContent);
|
|||||||
* * url - URL address of the page to open
|
* * url - URL address of the page to open
|
||||||
* @param {Object} res Cloud Function response context.
|
* @param {Object} res Cloud Function response context.
|
||||||
*/
|
*/
|
||||||
exports.preview = Sentry.GCPFunction.wrapHttpFunction(preview);
|
// exports.preview = Sentry.GCPFunction.wrapHttpFunction(preview);
|
||||||
|
|||||||
@ -4,8 +4,10 @@
|
|||||||
"description": "Service that fetches page content from a URL",
|
"description": "Service that fetches page content from a URL",
|
||||||
"main": "index.js",
|
"main": "index.js",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
|
"axios": "^0.27.2",
|
||||||
"dotenv": "^8.2.0",
|
"dotenv": "^8.2.0",
|
||||||
"express": "^4.17.1",
|
"express": "^4.17.1",
|
||||||
|
"jsonwebtoken": "^8.5.1",
|
||||||
"@google-cloud/functions-framework": "^3.0.0",
|
"@google-cloud/functions-framework": "^3.0.0",
|
||||||
"@omnivore/puppeteer-parse": "^1.0.0",
|
"@omnivore/puppeteer-parse": "^1.0.0",
|
||||||
"@sentry/serverless": "^7.77.0",
|
"@sentry/serverless": "^7.77.0",
|
||||||
@ -18,7 +20,6 @@
|
|||||||
"scripts": {
|
"scripts": {
|
||||||
"start": "node app.js",
|
"start": "node app.js",
|
||||||
"start_gcf": "npx functions-framework --port=9090 --target=puppeteer",
|
"start_gcf": "npx functions-framework --port=9090 --target=puppeteer",
|
||||||
"start_preview": "npx functions-framework --target=preview",
|
|
||||||
"test": "mocha test/*.js"
|
"test": "mocha test/*.js"
|
||||||
},
|
},
|
||||||
"volta": {
|
"volta": {
|
||||||
|
|||||||
114
packages/content-fetch/request_handler.js
Normal file
114
packages/content-fetch/request_handler.js
Normal file
@ -0,0 +1,114 @@
|
|||||||
|
const { fetchContent } = require("@omnivore/puppeteer-parse");
|
||||||
|
const { uploadPdf, sendSavePageMutation, sendCreateArticleMutation, sendImportStatusUpdate } = require('./api');
|
||||||
|
|
||||||
|
const MAX_RETRY_COUNT = process.env.MAX_RETRY_COUNT || '1';
|
||||||
|
|
||||||
|
exports.contentFetchRequestHandler = async (req, res) => {
|
||||||
|
let functionStartTime = Date.now();
|
||||||
|
|
||||||
|
const userId = (req.query ? req.query.userId : undefined) || (req.body ? req.body.userId : undefined);
|
||||||
|
const articleSavingRequestId = (req.query ? req.query.saveRequestId : undefined) || (req.body ? req.body.saveRequestId : undefined);
|
||||||
|
const state = req.body.state
|
||||||
|
const labels = req.body.labels
|
||||||
|
const source = req.body.source || 'puppeteer-parse';
|
||||||
|
const taskId = req.body.taskId; // taskId is used to update import status
|
||||||
|
const url = (req.query ? req.query.url : undefined) || (req.body ? req.body.url : undefined);
|
||||||
|
const locale = (req.query ? req.query.locale : undefined) || (req.body ? req.body.locale : undefined);
|
||||||
|
const timezone = (req.query ? req.query.timezone : undefined) || (req.body ? req.body.timezone : undefined);
|
||||||
|
const rssFeedUrl = req.body.rssFeedUrl;
|
||||||
|
const savedAt = req.body.savedAt;
|
||||||
|
const publishedAt = req.body.publishedAt;
|
||||||
|
const folder = req.body.folder;
|
||||||
|
const users = req.body ? req.body.users : undefined; // users is used when saving article for multiple users
|
||||||
|
|
||||||
|
let logRecord = {
|
||||||
|
url,
|
||||||
|
userId,
|
||||||
|
articleSavingRequestId,
|
||||||
|
labels: {
|
||||||
|
source,
|
||||||
|
},
|
||||||
|
state,
|
||||||
|
labelsToAdd: labels,
|
||||||
|
taskId: taskId,
|
||||||
|
locale,
|
||||||
|
timezone,
|
||||||
|
rssFeedUrl,
|
||||||
|
savedAt,
|
||||||
|
publishedAt,
|
||||||
|
folder,
|
||||||
|
users,
|
||||||
|
};
|
||||||
|
|
||||||
|
console.log(`Article parsing request`, logRecord);
|
||||||
|
|
||||||
|
let importStatus, statusCode = 200;
|
||||||
|
|
||||||
|
try {
|
||||||
|
const { finalUrl, title, content, readabilityResult, contentType } = await fetchContent(url, locale, timezone);
|
||||||
|
if (contentType === 'application/pdf') {
|
||||||
|
const uploadFileId = await uploadPdf(finalUrl, userId, articleSavingRequestId);
|
||||||
|
const uploadedPdf = await sendCreateArticleMutation(userId, {
|
||||||
|
url: encodeURI(finalUrl),
|
||||||
|
articleSavingRequestId,
|
||||||
|
uploadFileId,
|
||||||
|
state,
|
||||||
|
labels,
|
||||||
|
source,
|
||||||
|
folder,
|
||||||
|
rssFeedUrl,
|
||||||
|
savedAt,
|
||||||
|
publishedAt,
|
||||||
|
});
|
||||||
|
if (!uploadedPdf) {
|
||||||
|
statusCode = 500;
|
||||||
|
logRecord.error = 'error while saving uploaded pdf';
|
||||||
|
} else {
|
||||||
|
importStatus = 'imported';
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
const apiResponse = await sendSavePageMutation(userId, {
|
||||||
|
url,
|
||||||
|
clientRequestId: articleSavingRequestId,
|
||||||
|
title,
|
||||||
|
originalContent: content,
|
||||||
|
parseResult: readabilityResult,
|
||||||
|
state,
|
||||||
|
labels,
|
||||||
|
rssFeedUrl,
|
||||||
|
savedAt,
|
||||||
|
publishedAt,
|
||||||
|
source,
|
||||||
|
folder,
|
||||||
|
});
|
||||||
|
if (!apiResponse) {
|
||||||
|
logRecord.error = 'error while saving page';
|
||||||
|
statusCode = 500;
|
||||||
|
} else if (apiResponse.error === 'UNAUTHORIZED') {
|
||||||
|
console.log('user is deleted, do not retry', logRecord);
|
||||||
|
return res.sendStatus(200);
|
||||||
|
} else {
|
||||||
|
importStatus = readabilityResult ? 'imported' : 'failed';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
logRecord.error = error.message;
|
||||||
|
} finally {
|
||||||
|
logRecord.totalTime = Date.now() - functionStartTime;
|
||||||
|
console.log(`parse-page result`, logRecord);
|
||||||
|
|
||||||
|
// mark import failed on the last failed retry
|
||||||
|
const retryCount = req.headers['x-cloudtasks-taskretrycount'];
|
||||||
|
if (retryCount === MAX_RETRY_COUNT) {
|
||||||
|
console.log('max retry count reached');
|
||||||
|
importStatus = importStatus || 'failed';
|
||||||
|
}
|
||||||
|
|
||||||
|
// send import status to update the metrics
|
||||||
|
if (taskId && importStatus) {
|
||||||
|
await sendImportStatusUpdate(userId, taskId, importStatus);
|
||||||
|
}
|
||||||
|
|
||||||
|
res.sendStatus(statusCode);
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -7,12 +7,8 @@ const { encode } = require("urlsafe-base64");
|
|||||||
const crypto = require("crypto");
|
const crypto = require("crypto");
|
||||||
|
|
||||||
const Url = require('url');
|
const Url = require('url');
|
||||||
const axios = require('axios');
|
|
||||||
const jwt = require('jsonwebtoken');
|
|
||||||
const { promisify } = require('util');
|
|
||||||
const signToken = promisify(jwt.sign);
|
|
||||||
const os = require('os');
|
const os = require('os');
|
||||||
const { Storage } = require('@google-cloud/storage');
|
// const { Storage } = require('@google-cloud/storage');
|
||||||
const { parseHTML } = require('linkedom');
|
const { parseHTML } = require('linkedom');
|
||||||
const { preHandleContent, preParseContent } = require("@omnivore/content-handler");
|
const { preHandleContent, preParseContent } = require("@omnivore/content-handler");
|
||||||
const { Readability } = require("@omnivore/readability");
|
const { Readability } = require("@omnivore/readability");
|
||||||
@ -29,9 +25,9 @@ puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
|
|||||||
|
|
||||||
const createDOMPurify = require("dompurify");
|
const createDOMPurify = require("dompurify");
|
||||||
|
|
||||||
const storage = new Storage();
|
// const storage = new Storage();
|
||||||
const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : [];
|
const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : [];
|
||||||
const previewBucket = process.env.PREVIEW_IMAGE_BUCKET ? storage.bucket(process.env.PREVIEW_IMAGE_BUCKET) : undefined;
|
// const previewBucket = process.env.PREVIEW_IMAGE_BUCKET ? storage.bucket(process.env.PREVIEW_IMAGE_BUCKET) : undefined;
|
||||||
|
|
||||||
const filePath = `${os.tmpdir()}/previewImage.png`;
|
const filePath = `${os.tmpdir()}/previewImage.png`;
|
||||||
|
|
||||||
@ -44,11 +40,6 @@ const NON_SCRIPT_HOSTS= ['medium.com', 'fastcompany.com', 'fortelabs.com'];
|
|||||||
|
|
||||||
const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/plain', 'application/pdf'];
|
const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/plain', 'application/pdf'];
|
||||||
|
|
||||||
const IMPORTER_METRICS_COLLECTOR_URL = process.env.IMPORTER_METRICS_COLLECTOR_URL;
|
|
||||||
|
|
||||||
const REQUEST_TIMEOUT = 30000; // 30 seconds
|
|
||||||
const MAX_RETRY_COUNT = process.env.MAX_RETRY_COUNT || '1';
|
|
||||||
|
|
||||||
const userAgentForUrl = (url) => {
|
const userAgentForUrl = (url) => {
|
||||||
try {
|
try {
|
||||||
const u = new URL(url);
|
const u = new URL(url);
|
||||||
@ -140,249 +131,21 @@ const getBrowserPromise = (async () => {
|
|||||||
});
|
});
|
||||||
})();
|
})();
|
||||||
|
|
||||||
const uploadToSignedUrl = async ({ id, uploadSignedUrl }, contentType, contentObjUrl) => {
|
async function fetchContent(url, locale, timezone) {
|
||||||
try {
|
|
||||||
const stream = await axios.get(contentObjUrl, { responseType: 'stream', timeout: REQUEST_TIMEOUT });
|
|
||||||
return axios.put(uploadSignedUrl, stream.data, {
|
|
||||||
headers: {
|
|
||||||
'Content-Type': contentType,
|
|
||||||
},
|
|
||||||
maxBodyLength: 1000000000,
|
|
||||||
maxContentLength: 100000000,
|
|
||||||
timeout: REQUEST_TIMEOUT,
|
|
||||||
});
|
|
||||||
} catch (error) {
|
|
||||||
console.error('error uploading to signed url', error.message);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
const getUploadIdAndSignedUrl = async (userId, url, articleSavingRequestId) => {
|
|
||||||
const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
|
|
||||||
const data = JSON.stringify({
|
|
||||||
query: `mutation UploadFileRequest($input: UploadFileRequestInput!) {
|
|
||||||
uploadFileRequest(input:$input) {
|
|
||||||
... on UploadFileRequestError {
|
|
||||||
errorCodes
|
|
||||||
}
|
|
||||||
... on UploadFileRequestSuccess {
|
|
||||||
id
|
|
||||||
uploadSignedUrl
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}`,
|
|
||||||
variables: {
|
|
||||||
input: {
|
|
||||||
url,
|
|
||||||
contentType: 'application/pdf',
|
|
||||||
clientRequestId: articleSavingRequestId,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
try {
|
|
||||||
const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data,
|
|
||||||
{
|
|
||||||
headers: {
|
|
||||||
Cookie: `auth=${auth};`,
|
|
||||||
'Content-Type': 'application/json',
|
|
||||||
},
|
|
||||||
timeout: REQUEST_TIMEOUT,
|
|
||||||
});
|
|
||||||
|
|
||||||
if (response.data.data.uploadFileRequest.errorCodes && response.data.data.uploadFileRequest.errorCodes.length > 0) {
|
|
||||||
console.error('Error while getting upload id and signed url', response.data.data.uploadFileRequest.errorCodes[0]);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
return response.data.data.uploadFileRequest;
|
|
||||||
} catch (e) {
|
|
||||||
console.error('error getting upload id and signed url', e.message);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
const uploadPdf = async (url, userId, articleSavingRequestId) => {
|
|
||||||
validateUrlString(url);
|
|
||||||
|
|
||||||
const uploadResult = await getUploadIdAndSignedUrl(userId, url, articleSavingRequestId);
|
|
||||||
if (!uploadResult) {
|
|
||||||
throw new Error('error while getting upload id and signed url');
|
|
||||||
}
|
|
||||||
const uploaded = await uploadToSignedUrl(uploadResult, 'application/pdf', url);
|
|
||||||
if (!uploaded) {
|
|
||||||
throw new Error('error while uploading pdf');
|
|
||||||
}
|
|
||||||
return uploadResult.id;
|
|
||||||
};
|
|
||||||
|
|
||||||
const sendCreateArticleMutation = async (userId, input) => {
|
|
||||||
const data = JSON.stringify({
|
|
||||||
query: `mutation CreateArticle ($input: CreateArticleInput!){
|
|
||||||
createArticle(input:$input){
|
|
||||||
... on CreateArticleSuccess{
|
|
||||||
createdArticle{
|
|
||||||
id
|
|
||||||
}
|
|
||||||
}
|
|
||||||
... on CreateArticleError{
|
|
||||||
errorCodes
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}`,
|
|
||||||
variables: {
|
|
||||||
input,
|
|
||||||
},
|
|
||||||
});
|
|
||||||
|
|
||||||
const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
|
|
||||||
try {
|
|
||||||
const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data,
|
|
||||||
{
|
|
||||||
headers: {
|
|
||||||
Cookie: `auth=${auth};`,
|
|
||||||
'Content-Type': 'application/json',
|
|
||||||
},
|
|
||||||
timeout: REQUEST_TIMEOUT,
|
|
||||||
});
|
|
||||||
|
|
||||||
if (response.data.data.createArticle.errorCodes && response.data.data.createArticle.errorCodes.length > 0) {
|
|
||||||
console.error('error while creating article', response.data.data.createArticle.errorCodes[0]);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
return response.data.data.createArticle;
|
|
||||||
} catch (error) {
|
|
||||||
console.error('error creating article', error.message);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
const sendSavePageMutation = async (userId, input) => {
|
|
||||||
const data = JSON.stringify({
|
|
||||||
query: `mutation SavePage ($input: SavePageInput!){
|
|
||||||
savePage(input:$input){
|
|
||||||
... on SaveSuccess{
|
|
||||||
url
|
|
||||||
clientRequestId
|
|
||||||
}
|
|
||||||
... on SaveError{
|
|
||||||
errorCodes
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}`,
|
|
||||||
variables: {
|
|
||||||
input,
|
|
||||||
},
|
|
||||||
});
|
|
||||||
|
|
||||||
const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
|
|
||||||
try {
|
|
||||||
const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data,
|
|
||||||
{
|
|
||||||
headers: {
|
|
||||||
Cookie: `auth=${auth};`,
|
|
||||||
'Content-Type': 'application/json',
|
|
||||||
},
|
|
||||||
timeout: REQUEST_TIMEOUT,
|
|
||||||
});
|
|
||||||
|
|
||||||
if (response.data.data.savePage.errorCodes && response.data.data.savePage.errorCodes.length > 0) {
|
|
||||||
console.error('error while saving page', response.data.data.savePage.errorCodes[0]);
|
|
||||||
if (response.data.data.savePage.errorCodes[0] === 'UNAUTHORIZED') {
|
|
||||||
return { error: 'UNAUTHORIZED' };
|
|
||||||
}
|
|
||||||
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
return response.data.data.savePage;
|
|
||||||
} catch (error) {
|
|
||||||
console.error('error saving page', error.message);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
const saveUploadedPdf = async (userId, url, uploadFileId, articleSavingRequestId) => {
|
|
||||||
return sendCreateArticleMutation(userId, {
|
|
||||||
url: encodeURI(url),
|
|
||||||
articleSavingRequestId,
|
|
||||||
uploadFileId: uploadFileId,
|
|
||||||
state,
|
|
||||||
labels,
|
|
||||||
source,
|
|
||||||
folder,
|
|
||||||
},
|
|
||||||
);
|
|
||||||
};
|
|
||||||
|
|
||||||
const sendImportStatusUpdate = async (userId, taskId, status) => {
|
|
||||||
try {
|
|
||||||
const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
|
|
||||||
|
|
||||||
await axios.post(
|
|
||||||
IMPORTER_METRICS_COLLECTOR_URL,
|
|
||||||
{
|
|
||||||
taskId,
|
|
||||||
status,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
headers: {
|
|
||||||
'Authorization': auth,
|
|
||||||
'Content-Type': 'application/json',
|
|
||||||
},
|
|
||||||
timeout: REQUEST_TIMEOUT,
|
|
||||||
});
|
|
||||||
} catch (e) {
|
|
||||||
console.error('error while sending import status update', e);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
async function fetchContent(req, res) {
|
|
||||||
let functionStartTime = Date.now();
|
let functionStartTime = Date.now();
|
||||||
|
|
||||||
const userId = (req.query ? req.query.userId : undefined) || (req.body ? req.body.userId : undefined);
|
|
||||||
const articleSavingRequestId = (req.query ? req.query.saveRequestId : undefined) || (req.body ? req.body.saveRequestId : undefined);
|
|
||||||
const state = req.body.state
|
|
||||||
const labels = req.body.labels
|
|
||||||
const source = req.body.source || 'puppeteer-parse';
|
|
||||||
const taskId = req.body.taskId; // taskId is used to update import status
|
|
||||||
const urlStr = (req.query ? req.query.url : undefined) || (req.body ? req.body.url : undefined);
|
|
||||||
const locale = (req.query ? req.query.locale : undefined) || (req.body ? req.body.locale : undefined);
|
|
||||||
const timezone = (req.query ? req.query.timezone : undefined) || (req.body ? req.body.timezone : undefined);
|
|
||||||
const rssFeedUrl = req.body.rssFeedUrl;
|
|
||||||
const savedAt = req.body.savedAt;
|
|
||||||
const publishedAt = req.body.publishedAt;
|
|
||||||
const folder = req.body.folder;
|
|
||||||
|
|
||||||
let logRecord = {
|
let logRecord = {
|
||||||
url: urlStr,
|
url,
|
||||||
userId,
|
functionStartTime,
|
||||||
articleSavingRequestId,
|
|
||||||
labels: {
|
|
||||||
source,
|
|
||||||
},
|
|
||||||
state,
|
|
||||||
labelsToAdd: labels,
|
|
||||||
taskId: taskId,
|
|
||||||
locale,
|
locale,
|
||||||
timezone,
|
timezone,
|
||||||
rssFeedUrl,
|
}
|
||||||
savedAt,
|
console.log(`content-fetch request`, logRecord);
|
||||||
publishedAt,
|
|
||||||
folder,
|
|
||||||
};
|
|
||||||
|
|
||||||
console.info(`Article parsing request`, logRecord);
|
let context, page, finalUrl, title, content, contentType, readabilityResult = null;
|
||||||
|
|
||||||
let url, context, page, finalUrl, title, content, contentType, importStatus, statusCode = 200;
|
|
||||||
try {
|
try {
|
||||||
url = getUrl(urlStr);
|
url = getUrl(url);
|
||||||
if (!url) {
|
if (!url) {
|
||||||
logRecord.urlIsInvalid = true;
|
throw new Error('Valid URL to parse not specified');
|
||||||
logRecord.error = 'Valid URL to parse not specified';
|
|
||||||
statusCode = 400;
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// pre handle url with custom handlers
|
// pre handle url with custom handlers
|
||||||
@ -410,27 +173,7 @@ async function fetchContent(req, res) {
|
|||||||
finalUrl = url
|
finalUrl = url
|
||||||
}
|
}
|
||||||
|
|
||||||
if (contentType === 'application/pdf') {
|
if (contentType !== 'application/pdf') {
|
||||||
const uploadFileId = await uploadPdf(finalUrl, userId, articleSavingRequestId);
|
|
||||||
const uploadedPdf = await sendCreateArticleMutation(userId, {
|
|
||||||
url: encodeURI(finalUrl),
|
|
||||||
articleSavingRequestId,
|
|
||||||
uploadFileId,
|
|
||||||
state,
|
|
||||||
labels,
|
|
||||||
source,
|
|
||||||
folder,
|
|
||||||
rssFeedUrl,
|
|
||||||
savedAt,
|
|
||||||
publishedAt,
|
|
||||||
});
|
|
||||||
if (!uploadedPdf) {
|
|
||||||
statusCode = 500;
|
|
||||||
logRecord.error = 'error while saving uploaded pdf';
|
|
||||||
} else {
|
|
||||||
importStatus = 'imported';
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (!content || !title) {
|
if (!content || !title) {
|
||||||
const result = await retrieveHtml(page, logRecord);
|
const result = await retrieveHtml(page, logRecord);
|
||||||
if (result.isBlocked) {
|
if (result.isBlocked) {
|
||||||
@ -444,12 +187,9 @@ async function fetchContent(req, res) {
|
|||||||
} else {
|
} else {
|
||||||
console.info('using prefetched content and title');
|
console.info('using prefetched content and title');
|
||||||
}
|
}
|
||||||
logRecord.fetchContentTime = Date.now() - functionStartTime;
|
|
||||||
}
|
}
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
logRecord.error = e.message;
|
console.error(`Error while retrieving page ${url}`, e);
|
||||||
console.error(`Error while retrieving page`, logRecord);
|
|
||||||
statusCode = 500;
|
|
||||||
|
|
||||||
// fallback to scrapingbee for non pdf content
|
// fallback to scrapingbee for non pdf content
|
||||||
if (url && contentType !== 'application/pdf') {
|
if (url && contentType !== 'application/pdf') {
|
||||||
@ -459,8 +199,8 @@ async function fetchContent(req, res) {
|
|||||||
const sbResult = await fetchContentWithScrapingBee(url);
|
const sbResult = await fetchContentWithScrapingBee(url);
|
||||||
content = sbResult.domContent;
|
content = sbResult.domContent;
|
||||||
title = sbResult.title;
|
title = sbResult.title;
|
||||||
logRecord.fetchContentTime = Date.now() - fetchStartTime;
|
} else {
|
||||||
statusCode = 200;
|
throw e;
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
// close browser context if it was opened
|
// close browser context if it was opened
|
||||||
@ -470,7 +210,6 @@ async function fetchContent(req, res) {
|
|||||||
// save non pdf content
|
// save non pdf content
|
||||||
if (url && contentType !== 'application/pdf') {
|
if (url && contentType !== 'application/pdf') {
|
||||||
// parse content if it is not empty
|
// parse content if it is not empty
|
||||||
let readabilityResult = null;
|
|
||||||
if (content) {
|
if (content) {
|
||||||
let document = parseHTML(content).document;
|
let document = parseHTML(content).document;
|
||||||
// preParse content
|
// preParse content
|
||||||
@ -480,48 +219,11 @@ async function fetchContent(req, res) {
|
|||||||
}
|
}
|
||||||
readabilityResult = await getReadabilityResult(url, document);
|
readabilityResult = await getReadabilityResult(url, document);
|
||||||
}
|
}
|
||||||
|
|
||||||
const apiResponse = await sendSavePageMutation(userId, {
|
|
||||||
url,
|
|
||||||
clientRequestId: articleSavingRequestId,
|
|
||||||
title,
|
|
||||||
originalContent: content,
|
|
||||||
parseResult: readabilityResult,
|
|
||||||
state,
|
|
||||||
labels,
|
|
||||||
rssFeedUrl,
|
|
||||||
savedAt,
|
|
||||||
publishedAt,
|
|
||||||
source,
|
|
||||||
folder,
|
|
||||||
});
|
|
||||||
if (!apiResponse) {
|
|
||||||
logRecord.error = 'error while saving page';
|
|
||||||
statusCode = 500;
|
|
||||||
} else if (apiResponse.error === 'UNAUTHORIZED') {
|
|
||||||
console.info('user is deleted, do not retry', logRecord);
|
|
||||||
return res.sendStatus(200);
|
|
||||||
} else {
|
|
||||||
importStatus = readabilityResult ? 'imported' : 'failed';
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
logRecord.totalTime = Date.now() - functionStartTime;
|
console.info(`content-fetch result`, logRecord);
|
||||||
console.info(`parse-page`, logRecord);
|
|
||||||
|
|
||||||
// mark import failed on the last failed retry
|
return { finalUrl, title, content, readabilityResult, contentType };
|
||||||
const retryCount = req.headers['x-cloudtasks-taskretrycount'];
|
|
||||||
if (retryCount == MAX_RETRY_COUNT) {
|
|
||||||
console.info('max retry count reached');
|
|
||||||
importStatus = importStatus || 'failed';
|
|
||||||
}
|
|
||||||
|
|
||||||
// send import status to update the metrics
|
|
||||||
if (taskId && importStatus) {
|
|
||||||
await sendImportStatusUpdate(userId, taskId, importStatus);
|
|
||||||
}
|
|
||||||
|
|
||||||
res.sendStatus(statusCode);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -832,126 +534,126 @@ async function retrieveHtml(page, logRecord) {
|
|||||||
return { domContent, title };
|
return { domContent, title };
|
||||||
}
|
}
|
||||||
|
|
||||||
async function preview(req, res) {
|
// async function preview(req, res) {
|
||||||
const functionStartTime = Date.now();
|
// const functionStartTime = Date.now();
|
||||||
// Grabbing execution and trace ids to attach logs to the appropriate function call
|
// // Grabbing execution and trace ids to attach logs to the appropriate function call
|
||||||
const execution_id = req.get('function-execution-id');
|
// const execution_id = req.get('function-execution-id');
|
||||||
const traceId = (req.get('x-cloud-trace-context') || '').split('/')[0];
|
// const traceId = (req.get('x-cloud-trace-context') || '').split('/')[0];
|
||||||
const console = buildconsole('cloudfunctions.googleapis.com%2Fcloud-functions', {
|
// const console = buildconsole('cloudfunctions.googleapis.com%2Fcloud-functions', {
|
||||||
trace: `projects/${process.env.GCLOUD_PROJECT}/traces/${traceId}`,
|
// trace: `projects/${process.env.GCLOUD_PROJECT}/traces/${traceId}`,
|
||||||
labels: {
|
// labels: {
|
||||||
execution_id: execution_id,
|
// execution_id: execution_id,
|
||||||
},
|
// },
|
||||||
});
|
// });
|
||||||
|
|
||||||
if (!process.env.PREVIEW_IMAGE_BUCKET) {
|
// if (!process.env.PREVIEW_IMAGE_BUCKET) {
|
||||||
console.error(`PREVIEW_IMAGE_BUCKET not set`)
|
// console.error(`PREVIEW_IMAGE_BUCKET not set`)
|
||||||
return res.sendStatus(500);
|
// return res.sendStatus(500);
|
||||||
}
|
// }
|
||||||
|
|
||||||
const urlStr = (req.query ? req.query.url : undefined) || (req.body ? req.body.url : undefined);
|
// const urlStr = (req.query ? req.query.url : undefined) || (req.body ? req.body.url : undefined);
|
||||||
const url = getUrl(urlStr);
|
// const url = getUrl(urlStr);
|
||||||
console.log('preview request url', url);
|
// console.log('preview request url', url);
|
||||||
|
|
||||||
const logRecord = {
|
// const logRecord = {
|
||||||
url,
|
// url,
|
||||||
query: req.query,
|
// query: req.query,
|
||||||
origin: req.get('Origin'),
|
// origin: req.get('Origin'),
|
||||||
labels: {
|
// labels: {
|
||||||
source: 'publicImagePreview',
|
// source: 'publicImagePreview',
|
||||||
},
|
// },
|
||||||
};
|
// };
|
||||||
|
|
||||||
console.info(`Public preview image generation request`, logRecord);
|
// console.info(`Public preview image generation request`, logRecord);
|
||||||
|
|
||||||
if (!url) {
|
// if (!url) {
|
||||||
logRecord.urlIsInvalid = true;
|
// logRecord.urlIsInvalid = true;
|
||||||
console.error(`Valid URL to parse is not specified`, logRecord);
|
// console.error(`Valid URL to parse is not specified`, logRecord);
|
||||||
return res.sendStatus(400);
|
// return res.sendStatus(400);
|
||||||
}
|
// }
|
||||||
const { origin } = new URL(url);
|
// const { origin } = new URL(url);
|
||||||
if (!ALLOWED_ORIGINS.some(o => o === origin)) {
|
// if (!ALLOWED_ORIGINS.some(o => o === origin)) {
|
||||||
logRecord.forbiddenOrigin = true;
|
// logRecord.forbiddenOrigin = true;
|
||||||
console.error(`This origin is not allowed: ${origin}`, logRecord);
|
// console.error(`This origin is not allowed: ${origin}`, logRecord);
|
||||||
return res.sendStatus(400);
|
// return res.sendStatus(400);
|
||||||
}
|
// }
|
||||||
|
|
||||||
const browser = await getBrowserPromise;
|
// const browser = await getBrowserPromise;
|
||||||
logRecord.timing = { ...logRecord.timing, browserOpened: Date.now() - functionStartTime };
|
// logRecord.timing = { ...logRecord.timing, browserOpened: Date.now() - functionStartTime };
|
||||||
|
|
||||||
const page = await browser.newPage();
|
// const page = await browser.newPage();
|
||||||
const pageLoadingStart = Date.now();
|
// const pageLoadingStart = Date.now();
|
||||||
const modifiedUrl = new URL(url);
|
// const modifiedUrl = new URL(url);
|
||||||
modifiedUrl.searchParams.append('fontSize', '24');
|
// modifiedUrl.searchParams.append('fontSize', '24');
|
||||||
modifiedUrl.searchParams.append('adjustAspectRatio', '1.91');
|
// modifiedUrl.searchParams.append('adjustAspectRatio', '1.91');
|
||||||
try {
|
// try {
|
||||||
await page.goto(modifiedUrl.toString());
|
// await page.goto(modifiedUrl.toString());
|
||||||
logRecord.timing = { ...logRecord.timing, pageLoaded: Date.now() - pageLoadingStart };
|
// logRecord.timing = { ...logRecord.timing, pageLoaded: Date.now() - pageLoadingStart };
|
||||||
} catch (error) {
|
// } catch (error) {
|
||||||
console.log('error going to page: ', modifiedUrl)
|
// console.log('error going to page: ', modifiedUrl)
|
||||||
console.log(error)
|
// console.log(error)
|
||||||
throw error
|
// throw error
|
||||||
}
|
// }
|
||||||
|
|
||||||
// We lookup the destination path from our own page content and avoid trusting any passed query params
|
// // We lookup the destination path from our own page content and avoid trusting any passed query params
|
||||||
// selector - CSS selector of the element to get screenshot of
|
// // selector - CSS selector of the element to get screenshot of
|
||||||
const selector = decodeURIComponent(
|
// const selector = decodeURIComponent(
|
||||||
await page.$eval(
|
// await page.$eval(
|
||||||
"head > meta[name='omnivore:preview_image_selector']",
|
// "head > meta[name='omnivore:preview_image_selector']",
|
||||||
element => element.content,
|
// element => element.content,
|
||||||
),
|
// ),
|
||||||
);
|
// );
|
||||||
if (!selector) {
|
// if (!selector) {
|
||||||
logRecord.selectorIsInvalid = true;
|
// logRecord.selectorIsInvalid = true;
|
||||||
console.error(`Valid element selector is not specified`, logRecord);
|
// console.error(`Valid element selector is not specified`, logRecord);
|
||||||
await page.close();
|
// await page.close();
|
||||||
return res.sendStatus(400);
|
// return res.sendStatus(400);
|
||||||
}
|
// }
|
||||||
logRecord.selector = selector;
|
// logRecord.selector = selector;
|
||||||
|
|
||||||
// destination - destination pathname for the image to save with
|
// // destination - destination pathname for the image to save with
|
||||||
const destination = decodeURIComponent(
|
// const destination = decodeURIComponent(
|
||||||
await page.$eval(
|
// await page.$eval(
|
||||||
"head > meta[name='omnivore:preview_image_destination']",
|
// "head > meta[name='omnivore:preview_image_destination']",
|
||||||
element => element.content,
|
// element => element.content,
|
||||||
),
|
// ),
|
||||||
);
|
// );
|
||||||
if (!destination) {
|
// if (!destination) {
|
||||||
logRecord.destinationIsInvalid = true;
|
// logRecord.destinationIsInvalid = true;
|
||||||
console.error(`Valid file destination is not specified`, logRecord);
|
// console.error(`Valid file destination is not specified`, logRecord);
|
||||||
await page.close();
|
// await page.close();
|
||||||
return res.sendStatus(400);
|
// return res.sendStatus(400);
|
||||||
}
|
// }
|
||||||
logRecord.destination = destination;
|
// logRecord.destination = destination;
|
||||||
|
|
||||||
const screenshotTakingStart = Date.now();
|
// const screenshotTakingStart = Date.now();
|
||||||
try {
|
// try {
|
||||||
await page.waitForSelector(selector, { timeout: 3000 }); // wait for the selector to load
|
// await page.waitForSelector(selector, { timeout: 3000 }); // wait for the selector to load
|
||||||
} catch (error) {
|
// } catch (error) {
|
||||||
logRecord.elementNotFound = true;
|
// logRecord.elementNotFound = true;
|
||||||
console.error(`Element is not presented on the page`, logRecord);
|
// console.error(`Element is not presented on the page`, logRecord);
|
||||||
await page.close();
|
// await page.close();
|
||||||
return res.sendStatus(400);
|
// return res.sendStatus(400);
|
||||||
}
|
// }
|
||||||
const element = await page.$(selector);
|
// const element = await page.$(selector);
|
||||||
await element.screenshot({ path: filePath }); // take screenshot of the element in puppeteer
|
// await element.screenshot({ path: filePath }); // take screenshot of the element in puppeteer
|
||||||
logRecord.timing = { ...logRecord.timing, screenshotTaken: Date.now() - screenshotTakingStart };
|
// logRecord.timing = { ...logRecord.timing, screenshotTaken: Date.now() - screenshotTakingStart };
|
||||||
|
|
||||||
await page.close();
|
// await page.close();
|
||||||
|
|
||||||
try {
|
// try {
|
||||||
const [file] = await previewBucket.upload(filePath, {
|
// const [file] = await previewBucket.upload(filePath, {
|
||||||
destination,
|
// destination,
|
||||||
metadata: logRecord,
|
// metadata: logRecord,
|
||||||
});
|
// });
|
||||||
logRecord.file = file.metadata;
|
// logRecord.file = file.metadata;
|
||||||
} catch (e) {
|
// } catch (e) {
|
||||||
console.log('error uploading to bucket, this is non-fatal', e)
|
// console.log('error uploading to bucket, this is non-fatal', e)
|
||||||
}
|
// }
|
||||||
|
|
||||||
console.info(`preview-image`, logRecord);
|
// console.info(`preview-image`, logRecord);
|
||||||
return res.redirect(`${process.env.PREVIEW_IMAGE_CDN_ORIGIN}/${destination}`);
|
// return res.redirect(`${process.env.PREVIEW_IMAGE_CDN_ORIGIN}/${destination}`);
|
||||||
}
|
// }
|
||||||
|
|
||||||
const DOM_PURIFY_CONFIG = {
|
const DOM_PURIFY_CONFIG = {
|
||||||
ADD_TAGS: ['iframe'],
|
ADD_TAGS: ['iframe'],
|
||||||
@ -1048,6 +750,6 @@ async function getReadabilityResult(url, document) {
|
|||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
fetchContent,
|
fetchContent,
|
||||||
preview,
|
// preview,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@ -4,19 +4,15 @@
|
|||||||
"description": "Accepts URL of the article and parses its content",
|
"description": "Accepts URL of the article and parses its content",
|
||||||
"main": "index.js",
|
"main": "index.js",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@google-cloud/storage": "^7.0.1",
|
|
||||||
"@omnivore/content-handler": "1.0.0",
|
"@omnivore/content-handler": "1.0.0",
|
||||||
"@omnivore/readability": "1.0.0",
|
"@omnivore/readability": "1.0.0",
|
||||||
"axios": "^0.27.2",
|
|
||||||
"crypto": "^1.0.1",
|
"crypto": "^1.0.1",
|
||||||
"dompurify": "^2.4.1",
|
"dompurify": "^2.4.1",
|
||||||
"jsonwebtoken": "^8.5.1",
|
|
||||||
"linkedom": "^0.14.9",
|
"linkedom": "^0.14.9",
|
||||||
"puppeteer-core": "^20.9.0",
|
"puppeteer-core": "^20.9.0",
|
||||||
"puppeteer-extra": "^3.3.4",
|
"puppeteer-extra": "^3.3.4",
|
||||||
"puppeteer-extra-plugin-adblocker": "^2.13.5",
|
"puppeteer-extra-plugin-adblocker": "^2.13.5",
|
||||||
"puppeteer-extra-plugin-stealth": "^2.11.1",
|
"puppeteer-extra-plugin-stealth": "^2.11.1",
|
||||||
"underscore": "^1.13.4",
|
|
||||||
"urlsafe-base64": "^1.0.0"
|
"urlsafe-base64": "^1.0.0"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
|
|||||||
Reference in New Issue
Block a user