765 lines
25 KiB
JavaScript
765 lines
25 KiB
JavaScript
/* eslint-disable no-undef */
|
|
/* eslint-disable no-empty */
|
|
/* eslint-disable @typescript-eslint/explicit-function-return-type */
|
|
/* eslint-disable @typescript-eslint/no-var-requires */
|
|
/* eslint-disable @typescript-eslint/no-require-imports */
|
|
require('dotenv').config();
|
|
const Url = require('url');
|
|
const chromium = require('chrome-aws-lambda');
|
|
const puppeteer = require('puppeteer-core');
|
|
const axios = require('axios');
|
|
const jwt = require('jsonwebtoken');
|
|
const { promisify } = require('util');
|
|
const signToken = promisify(jwt.sign);
|
|
const { config, format, loggers, transports } = require('winston');
|
|
const { LoggingWinston } = require('@google-cloud/logging-winston');
|
|
const { DateTime } = require('luxon');
|
|
const os = require('os');
|
|
const Sentry = require('@sentry/serverless');
|
|
const { Storage } = require('@google-cloud/storage');
|
|
const { appleNewsHandler } = require('./apple-news-handler');
|
|
const { twitterHandler } = require('./twitter-handler');
|
|
const { youtubeHandler } = require('./youtube-handler');
|
|
const { tDotCoHandler } = require('./t-dot-co-handler');
|
|
const { pdfHandler } = require('./pdf-handler');
|
|
const { mediumHandler } = require('./medium-handler');
|
|
|
|
// Google Cloud Storage client and the bucket that generated preview images are uploaded to.
const storage = new Storage();
const previewBucket = storage.bucket(process.env.PREVIEW_IMAGE_BUCKET);

// Comma-separated list of origins the preview function may render.
// Entries are trimmed and empty ones dropped so that values such as
// "https://a.example, https://b.example," still match `new URL(url).origin`.
// A missing ALLOWED_ORIGINS variable still fails at load time, as before.
const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS
  .split(',')
  .map((origin) => origin.trim())
  .filter((origin) => origin.length > 0);

// Error reporting only; tracing is switched off by the zero sample rate.
Sentry.GCPFunction.init({
  dsn: process.env.SENTRY_DSN,
  tracesSampleRate: 0,
});
|
|
|
|
// User-agent strings. The three desktop variants currently hold the same value.
const MOBILE_USER_AGENT =
  'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)';
const DESKTOP_USER_AGENT =
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36';
const BOT_DESKTOP_USER_AGENT =
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36';
const NON_BOT_DESKTOP_USER_AGENT =
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36';

// Hosts that userAgentForUrl maps to NON_BOT_DESKTOP_USER_AGENT.
const NON_BOT_HOSTS = ['bloomberg.com', 'forbes.com'];

// Temporary file the preview screenshot is written to before it is uploaded.
const filePath = `${os.tmpdir()}/previewImage.png`;

// Document content types that the response interception in retrievePage lets through.
const ALLOWED_CONTENT_TYPES = [
  'text/html',
  'application/octet-stream',
  'text/plain',
  'application/pdf',
];
|
|
|
|
|
|
// Console colour scheme, keyed by syslog level, handed to winston's colorize
// format by localConfig.
const colors = {
  emerg: 'inverse underline magenta',
  alert: 'underline magenta',
  // Any error that is forcing a shutdown of the service or application to prevent data loss.
  crit: 'inverse underline red',
  // Any error which is fatal to the operation, but not the service or application.
  error: 'underline red',
  // Anything that can potentially cause application oddities.
  warning: 'underline yellow',
  // Normal but significant condition.
  notice: 'underline cyan',
  // Generally useful information to log.
  info: 'underline green',
  debug: 'underline gray',
};
|
|
|
|
// Base options for the Cloud Logging transport. buildLoggerTransport spreads
// these first, so `logName` and any per-request options override them.
const googleConfigs = {
  level: 'info',
  logName: 'logger',
  levels: config.syslog.levels,
  // Monitored-resource descriptor attached to every log entry.
  resource: {
    type: 'cloud_function',
    labels: {
      function_name: process.env.FUNCTION_TARGET,
      project_id: process.env.GCP_PROJECT,
    },
  },
};
|
|
|
|
/**
 * Builds the options for the console transport used when running locally.
 *
 * Each line is rendered as `[<id>@<HH:mm:ss>] <message>`; any extra metadata
 * on the log record is appended as pretty-printed JSON on the following lines.
 *
 * @param {string} id Logger id shown at the start of every line.
 * @returns {Object} Options object for `transports.Console`.
 */
function localConfig(id) {
  // Stamps the record with the current local time before it is printed.
  const stampWithLocalTime = format((info) => {
    info.timestamp = DateTime.local().toLocaleString(DateTime.TIME_24_WITH_SECONDS);
    return info;
  });

  const renderLine = format.printf((info) => {
    // `level` is pulled out only so that it does not end up in `meta`.
    // eslint-disable-next-line @typescript-eslint/no-unused-vars
    const { timestamp, message, level, ...meta } = info;
    const metaDump = Object.keys(meta).length ? '\n' + JSON.stringify(meta, null, 4) : '';

    return `[${id}@${timestamp}] ${message}${metaDump}`;
  });

  return {
    level: 'debug',
    format: format.combine(
      format.colorize({ all: true, colors }),
      stampWithLocalTime(),
      renderLine,
    ),
  };
}
|
|
|
|
/**
 * Picks the log transport for the current environment.
 *
 * @param {string} id Logger id; used as the console prefix locally and as the
 *   `logName` for Cloud Logging.
 * @param {Object} [options] Extra Cloud Logging options; they override both
 *   `googleConfigs` and the `logName` derived from `id`. Ignored locally.
 * @returns {Object} A console transport when IS_LOCAL is set, otherwise a
 *   LoggingWinston transport.
 */
function buildLoggerTransport(id, options) {
  if (process.env.IS_LOCAL) {
    return new transports.Console(localConfig(id));
  }

  const cloudOptions = { ...googleConfigs, logName: id, ...options };
  return new LoggingWinston(cloudOptions);
}
|
|
|
|
/**
 * Returns a winston logger for `id` whose transport carries `options`.
 *
 * winston's logger container caches instances by id and ignores the options
 * passed to `get` once an instance exists. Both HTTP handlers call this with
 * the same id but per-request `trace` / `execution_id` values, so a cached
 * logger would keep tagging entries with the first request's identifiers.
 * Any existing logger for `id` is therefore closed before a new one is built.
 *
 * @param {string} id Logger id (also used as the log name).
 * @param {Object} [options] Per-call transport options, e.g. trace and labels.
 * @returns {Object} The winston logger registered under `id`.
 */
function buildLogger(id, options) {
  if (loggers.has(id)) {
    loggers.close(id);
  }

  return loggers.get(id, {
    levels: config.syslog.levels,
    transports: [buildLoggerTransport(id, options)],
  });
}
|
|
|
|
/**
 * Chooses the user agent to present when fetching `url`.
 *
 * Hosts listed in NON_BOT_HOSTS (and their subdomains) get
 * NON_BOT_DESKTOP_USER_AGENT; everything else, including unparsable URLs,
 * gets DESKTOP_USER_AGENT.
 *
 * @param {string} url Absolute URL that is about to be loaded.
 * @returns {string} User-agent string.
 */
const userAgentForUrl = (url) => {
  try {
    const { hostname } = new URL(url);
    // Match on a label boundary: a bare `endsWith(host)` would also accept
    // unrelated domains such as "notbloomberg.com".
    const isNonBotHost = NON_BOT_HOSTS.some(
      (host) => hostname === host || hostname.endsWith(`.${host}`),
    );
    if (isNonBotHost) {
      return NON_BOT_DESKTOP_USER_AGENT;
    }
  } catch (e) {
    console.log('error getting user agent for url', url, e);
  }
  return DESKTOP_USER_AGENT;
};
|
|
|
|
// Launch Puppeteer once, at module load. Despite its name this is the pending
// launch promise itself, not a function: callers `await getBrowserPromise`.
const getBrowserPromise = (async () => {
  // CHROMIUM_PATH wins when set; otherwise use the binary located by chrome-aws-lambda.
  const executablePath = process.env.CHROMIUM_PATH || (await chromium.executablePath);
  // Show the browser window when running locally.
  const headless = process.env.IS_LOCAL ? false : chromium.headless;

  return puppeteer.launch({
    args: chromium.args,
    defaultViewport: { height: 1080, width: 1920 },
    executablePath,
    headless,
    timeout: 0,
  });
})();
|
|
|
|
// Per-invocation state shared between the HTTP handlers and the page helpers.
// Both are reassigned at the start of every request.
let logRecord;
let functionStartTime;
|
|
|
|
/**
 * Streams the object at `contentObjUrl` into a signed upload URL.
 *
 * @param {Object} uploadRequest Result of the upload-file request; only
 *   `uploadSignedUrl` is used here.
 * @param {string} contentType Value sent as the Content-Type header of the PUT.
 * @param {string} contentObjUrl URL the content is downloaded from.
 * @returns {Promise<Object>} The axios response of the PUT request.
 */
const uploadToSignedUrl = async ({ id, uploadSignedUrl }, contentType, contentObjUrl) => {
  // Download as a stream so the payload is piped rather than buffered in memory.
  const { data: sourceStream } = await axios.get(contentObjUrl, { responseType: 'stream' });

  const putOptions = {
    headers: {
      'Content-Type': contentType,
    },
    maxBodyLength: 1000000000,
    maxContentLength: 100000000,
  };
  return axios.put(uploadSignedUrl, sourceStream, putOptions);
};
|
|
|
|
/**
 * Asks the backend for an upload slot for a PDF found at `url`.
 *
 * Authenticates as `userId` with a JWT signed with JWT_SECRET, sent in the
 * `auth` cookie, and posts the UploadFileRequest mutation to
 * `${REST_BACKEND_ENDPOINT}/graphql`.
 *
 * @param {string} userId Id of the user the file is uploaded for.
 * @param {string} url Source URL of the PDF.
 * @returns {Promise<Object>} The `uploadFileRequest` payload: `{ id, uploadSignedUrl }`
 *   on success or `{ errorCodes }` on failure.
 */
const getUploadIdAndSignedUrl = async (userId, url) => {
  const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);

  const query = `mutation UploadFileRequest($input: UploadFileRequestInput!) {
    uploadFileRequest(input:$input) {
      ... on UploadFileRequestError {
        errorCodes
      }
      ... on UploadFileRequestSuccess {
        id
        uploadSignedUrl
      }
    }
  }`;
  const variables = {
    input: {
      url,
      contentType: 'application/pdf',
    },
  };
  const payload = JSON.stringify({ query, variables });

  const requestOptions = {
    headers: {
      Cookie: `auth=${auth};`,
      'Content-Type': 'application/json',
    },
  };
  const { data: body } = await axios.post(
    `${process.env.REST_BACKEND_ENDPOINT}/graphql`,
    payload,
    requestOptions,
  );
  return body.data.uploadFileRequest;
};
|
|
|
|
/**
 * Copies the PDF at `url` into backend storage.
 *
 * @param {string} url Source URL of the PDF; validated before any request is made.
 * @param {string} userId Id of the user the file is uploaded for.
 * @returns {Promise<string>} The `id` field of the upload-file request result.
 * @throws {Error} When `url` fails validateUrlString.
 */
const uploadPdf = async (url, userId) => {
  validateUrlString(url);

  const uploadSlot = await getUploadIdAndSignedUrl(userId, url);
  await uploadToSignedUrl(uploadSlot, 'application/pdf', url);

  return uploadSlot.id;
};
|
|
|
|
/**
 * Sends the CreateArticle mutation to the backend on behalf of `userId`.
 *
 * `input` is forwarded with `source: 'puppeteer-parse'` added to it. The
 * request is authenticated with a JWT signed with JWT_SECRET, sent in the
 * `auth` cookie.
 *
 * @param {string} userId Id of the user saving the article.
 * @param {Object} input CreateArticleInput fields.
 * @returns {Promise<Object>} The `createArticle` payload: `{ createdArticle: { id } }`
 *   on success or `{ errorCodes }` on failure.
 */
const sendCreateArticleMutation = async (userId, input) => {
  const query = `mutation CreateArticle ($input: CreateArticleInput!){
    createArticle(input:$input){
      ... on CreateArticleSuccess{
        createdArticle{
          id
        }
      }
      ... on CreateArticleError{
        errorCodes
      }
    }
  }`;
  const variables = {
    input: { ...input, source: 'puppeteer-parse' },
  };
  const payload = JSON.stringify({ query, variables });

  const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
  const requestOptions = {
    headers: {
      Cookie: `auth=${auth};`,
      'Content-Type': 'application/json',
    },
  };
  const { data: body } = await axios.post(
    `${process.env.REST_BACKEND_ENDPOINT}/graphql`,
    payload,
    requestOptions,
  );
  return body.data.createArticle;
};
|
|
|
|
/**
 * Creates the article record for a PDF that has already been uploaded.
 *
 * @param {string} userId Id of the user saving the article.
 * @param {string} url URL of the PDF; passed through `encodeURI` before sending.
 * @param {string} uploadFileId Id returned by the upload-file request.
 * @param {string} articleSavingRequestId Id of the save request being fulfilled.
 * @returns {Promise<Object>} The `createArticle` payload from the backend.
 */
const saveUploadedPdf = async (userId, url, uploadFileId, articleSavingRequestId) =>
  sendCreateArticleMutation(userId, {
    url: encodeURI(url),
    articleSavingRequestId,
    uploadFileId,
  });
|
|
|
|
// Site-specific handlers, keyed by a short name used in log output. The
// puppeteer entry point walks these in insertion order, first for
// `shouldResolve`/`resolve` and then for `shouldPrehandle`/`prehandle`.
const handlers = {
  pdf: pdfHandler,
  'apple-news': appleNewsHandler,
  twitter: twitterHandler,
  youtube: youtubeHandler,
  't-dot-co': tDotCoHandler,
  medium: mediumHandler,
};
|
|
|
|
/**
|
|
* Cloud Function entry point, HTTP trigger.
|
|
* Loads the requested URL via Puppeteer, captures page content and sends it to backend
|
|
*
|
|
* @param {Object} req Cloud Function request context.
|
|
* @param {Object} res Cloud Function response context.
|
|
*/
|
|
exports.puppeteer = Sentry.GCPFunction.wrapHttpFunction(async (req, res) => {
|
|
functionStartTime = Date.now();
|
|
// Grabbing execution and trace ids to attach logs to the appropriate function call
|
|
const execution_id = req.get('function-execution-id');
|
|
const traceId = (req.get('x-cloud-trace-context') || '').split('/')[0];
|
|
const logger = buildLogger('cloudfunctions.googleapis.com%2Fcloud-functions', {
|
|
trace: `projects/${process.env.GCLOUD_PROJECT}/traces/${traceId}`,
|
|
labels: {
|
|
execution_id: execution_id,
|
|
},
|
|
});
|
|
|
|
let url = getUrl(req);
|
|
const userId = req.body.userId || req.query.userId;
|
|
const articleSavingRequestId = req.body.saveRequestId || req.query.saveRequestId;
|
|
|
|
logRecord = {
|
|
url,
|
|
userId,
|
|
articleSavingRequestId,
|
|
labels: {
|
|
source: 'parseContent',
|
|
},
|
|
};
|
|
|
|
logger.info(`Article parsing request`, logRecord);
|
|
|
|
if (!url) {
|
|
logRecord.urlIsInvalid = true;
|
|
logger.error(`Valid URL to parse not specified`, logRecord);
|
|
return res.sendStatus(400);
|
|
}
|
|
|
|
if (!userId || !articleSavingRequestId) {
|
|
Object.assign(logRecord, { invalidParams: true, body: req.body, query: req.query });
|
|
logger.error(`Invalid parameters`, logRecord);
|
|
return res.sendStatus(400);
|
|
}
|
|
|
|
// Before we run the regular handlers we check to see if we need tp
|
|
// pre-resolve the URL. TODO: This should probably happen recursively,
|
|
// so URLs can be pre-resolved, handled, pre-resolved, handled, etc.
|
|
for (const [key, handler] of Object.entries(handlers)) {
|
|
if (handler.shouldResolve && handler.shouldResolve(url)) {
|
|
try {
|
|
url = await handler.resolve(url);
|
|
validateUrlString(url);
|
|
} catch (err) {
|
|
console.log('error resolving url with handler', key, err);
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Before we fetch the page we check the handlers, to see if they want
|
|
// to perform a prefetch action that can modify our requests.
|
|
// enumerate the handlers and see if any of them want to handle the request
|
|
const handler = Object.keys(handlers).find(key => {
|
|
try {
|
|
return handlers[key].shouldPrehandle(url)
|
|
} catch (e) {
|
|
console.log('error with handler: ', key, e);
|
|
}
|
|
return false;
|
|
});
|
|
|
|
var title = undefined;
|
|
var content = undefined;
|
|
var contentType = undefined;
|
|
|
|
if (handler) {
|
|
try {
|
|
// The only handler we have now can modify the URL, but in the
|
|
// future maybe we let it modify content. In that case
|
|
// we might exit the request early.
|
|
console.log('pre-handling url with handler: ', handler);
|
|
|
|
const result = await handlers[handler].prehandle(url);
|
|
if (result && result.url) {
|
|
url = result.url
|
|
validateUrlString(url);
|
|
}
|
|
if (result && result.title) { title = result.title }
|
|
if (result && result.content) { content = result.content }
|
|
if (result && result.contentType) { contentType = result.contentType }
|
|
} catch (e) {
|
|
console.log('error with handler: ', handler, e);
|
|
}
|
|
}
|
|
|
|
var context, page, finalUrl;
|
|
if ((!content || !title) && contentType !== 'application/pdf') {
|
|
const result = await retrievePage(url)
|
|
if (result && result.context) { context = result.context }
|
|
if (result && result.page) { page = result.page }
|
|
if (result && result.finalUrl) { finalUrl = result.finalUrl }
|
|
if (result && result.contentType) { contentType = result.contentType }
|
|
console.log('context, page, finalUrl, contentType', context, page, finalUrl, contentType);
|
|
} else {
|
|
finalUrl = url
|
|
}
|
|
|
|
try {
|
|
if (contentType === 'application/pdf') {
|
|
const uploadedFileId = await uploadPdf(finalUrl, userId);
|
|
const l = await saveUploadedPdf(userId, finalUrl, uploadedFileId, articleSavingRequestId);
|
|
} else {
|
|
if (!content || !title) {
|
|
const result = await retrieveHtml(page);
|
|
title = result.title;
|
|
content = result.domContent;
|
|
} else {
|
|
console.log('using prefetched content and title');
|
|
console.log(content);
|
|
}
|
|
|
|
const apiResponse = await sendCreateArticleMutation(userId, {
|
|
url: finalUrl,
|
|
articleSavingRequestId,
|
|
preparedDocument: {
|
|
document: content,
|
|
pageInfo: {
|
|
title,
|
|
canonicalUrl: finalUrl,
|
|
},
|
|
},
|
|
skipParsing: !content,
|
|
});
|
|
|
|
logRecord.result = apiResponse.createArticle;
|
|
logger.info(`parse-page`, logRecord);
|
|
}
|
|
} catch (e) {
|
|
console.log('error', e)
|
|
logRecord.error = e.message;
|
|
logger.error(`Error while retrieving page`, logRecord);
|
|
return res.sendStatus(503);
|
|
} finally {
|
|
if (context) {
|
|
await context.close();
|
|
}
|
|
}
|
|
|
|
return res.sendStatus(200);
|
|
});
|
|
|
|
/**
|
|
* Cloud Function entry point, HTTP trigger.
|
|
* Loads the requested URL via Puppeteer and captures a screenshot of the provided element
|
|
*
|
|
* @param {Object} req Cloud Function request context.
|
|
* Inlcudes:
|
|
* * url - URL address of the page to open
|
|
* @param {Object} res Cloud Function response context.
|
|
*/
|
|
exports.preview = Sentry.GCPFunction.wrapHttpFunction(async (req, res) => {
|
|
functionStartTime = Date.now();
|
|
// Grabbing execution and trace ids to attach logs to the appropriate function call
|
|
const execution_id = req.get('function-execution-id');
|
|
const traceId = (req.get('x-cloud-trace-context') || '').split('/')[0];
|
|
const logger = buildLogger('cloudfunctions.googleapis.com%2Fcloud-functions', {
|
|
trace: `projects/${process.env.GCLOUD_PROJECT}/traces/${traceId}`,
|
|
labels: {
|
|
execution_id: execution_id,
|
|
},
|
|
});
|
|
|
|
const url = getUrl(req);
|
|
console.log('preview request url', url);
|
|
|
|
logRecord = {
|
|
url,
|
|
query: req.query,
|
|
origin: req.get('Origin'),
|
|
labels: {
|
|
source: 'publicImagePreview',
|
|
},
|
|
};
|
|
|
|
logger.info(`Public preview image generation request`, logRecord);
|
|
|
|
if (!url) {
|
|
logRecord.urlIsInvalid = true;
|
|
logger.error(`Valid URL to parse is not specified`, logRecord);
|
|
return res.sendStatus(400);
|
|
}
|
|
const { origin } = new URL(url);
|
|
if (!ALLOWED_ORIGINS.some(o => o === origin)) {
|
|
logRecord.forbiddenOrigin = true;
|
|
logger.error(`This origin is not allowed: ${origin}`, logRecord);
|
|
return res.sendStatus(400);
|
|
}
|
|
|
|
const browser = await getBrowserPromise;
|
|
logRecord.timing = { ...logRecord.timing, browserOpened: Date.now() - functionStartTime };
|
|
|
|
const page = await browser.newPage();
|
|
const pageLoadingStart = Date.now();
|
|
const modifiedUrl = new URL(url);
|
|
modifiedUrl.searchParams.append('fontSize', 24);
|
|
modifiedUrl.searchParams.append('adjustAspectRatio', 1.91);
|
|
try {
|
|
await page.goto(modifiedUrl);
|
|
logRecord.timing = { ...logRecord.timing, pageLoaded: Date.now() - pageLoadingStart };
|
|
} catch (error) {
|
|
console.log('error going to page: ', modifiedUrl)
|
|
console.log(error)
|
|
throw error
|
|
}
|
|
|
|
// We lookup the destination path from our own page content and avoid trusting any passed query params
|
|
// selector - CSS selector of the element to get screenshot of
|
|
const selector = decodeURIComponent(
|
|
await page.$eval(
|
|
"head > meta[name='omnivore:preview_image_selector']",
|
|
element => element.content,
|
|
),
|
|
);
|
|
if (!selector) {
|
|
logRecord.selectorIsInvalid = true;
|
|
logger.error(`Valid element selector is not specified`, logRecord);
|
|
await page.close();
|
|
return res.sendStatus(400);
|
|
}
|
|
logRecord.selector = selector;
|
|
|
|
// destination - destination pathname for the image to save with
|
|
const destination = decodeURIComponent(
|
|
await page.$eval(
|
|
"head > meta[name='omnivore:preview_image_destination']",
|
|
element => element.content,
|
|
),
|
|
);
|
|
if (!destination) {
|
|
logRecord.destinationIsInvalid = true;
|
|
logger.error(`Valid file destination is not specified`, logRecord);
|
|
await page.close();
|
|
return res.sendStatus(400);
|
|
}
|
|
logRecord.destination = destination;
|
|
|
|
const screenshotTakingStart = Date.now();
|
|
try {
|
|
await page.waitForSelector(selector, { timeout: 3000 }); // wait for the selector to load
|
|
} catch (error) {
|
|
logRecord.elementNotFound = true;
|
|
logger.error(`Element is not presented on the page`, logRecord);
|
|
await page.close();
|
|
return res.sendStatus(400);
|
|
}
|
|
const element = await page.$(selector);
|
|
await element.screenshot({ path: filePath }); // take screenshot of the element in puppeteer
|
|
logRecord.timing = { ...logRecord.timing, screenshotTaken: Date.now() - screenshotTakingStart };
|
|
|
|
await page.close();
|
|
|
|
try {
|
|
const [file] = await previewBucket.upload(filePath, {
|
|
destination,
|
|
metadata: logRecord,
|
|
});
|
|
logRecord.file = file.metadata;
|
|
} catch (e) {
|
|
console.log('error uploading to bucket, this is non-fatal', e)
|
|
}
|
|
|
|
logger.info(`preview-image`, logRecord);
|
|
return res.redirect(`${process.env.PREVIEW_IMAGE_CDN_ORIGIN}/${destination}`);
|
|
});
|
|
|
|
/**
 * Rejects URLs this service must never fetch.
 *
 * Only http/https URLs are accepted, and hosts that point back at the local
 * machine or at private / link-local networks are refused, so a caller cannot
 * steer the headless browser at internal services. The check runs on the
 * hostname as normalised by the WHATWG URL parser; it does not resolve DNS.
 *
 * @param {string} url Absolute URL to check.
 * @throws {TypeError} When `url` cannot be parsed.
 * @throws {Error} When the protocol or host is not allowed.
 */
function validateUrlString(url) {
  const u = new URL(url);
  // Make sure the URL is http or https
  if (u.protocol !== 'http:' && u.protocol !== 'https:') {
    throw new Error('Invalid URL protocol check failed');
  }

  const hostname = u.hostname.toLowerCase();

  // Make sure the domain is not localhost (by name, unspecified address or loopback range)
  const isLocalhost =
    hostname === 'localhost' ||
    hostname.endsWith('.localhost') ||
    hostname === '0.0.0.0' ||
    hostname === '[::]' ||
    hostname === '[::1]' ||
    /^127\./.test(hostname);
  if (isLocalhost) {
    throw new Error('Invalid URL is localhost');
  }

  // Make sure the domain is not a private IP:
  //   10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 (RFC 1918),
  //   169.254.0.0/16 (link-local, where cloud metadata endpoints live),
  //   fc00::/7 and fe80::/10 (IPv6 unique-local and link-local).
  const privateIpv4 = /^(?:10\.|169\.254\.|172\.(?:1[6-9]|2\d|3[01])\.|192\.168\.)/;
  const privateIpv6 = /^\[(?:f[cd][0-9a-f]{2}|fe[89ab][0-9a-f]):/;
  if (privateIpv4.test(hostname) || privateIpv6.test(hostname)) {
    throw new Error('Invalid URL is private ip');
  }
}
|
|
|
|
/**
 * Extracts the target URL from a request.
 *
 * Looks at `req.query.url`, then `req.body.url`, and finally treats
 * `req.body` as a JSON string with a `url` field. Every path runs the URL
 * through validateUrlString before returning it.
 *
 * @param {Object} req Cloud Function request context.
 * @returns {string|undefined} The normalised URL, or undefined when the
 *   request carries no usable URL.
 * @throws {Error} When a URL given via query or parsed body fails validation.
 */
function getUrl(req) {
  const directUrl = req.query.url || req.body.url;
  if (directUrl) {
    validateUrlString(directUrl);

    return Url.parse(directUrl).href;
  }

  try {
    // Fallback for bodies that arrive as an unparsed JSON string. The URL is
    // validated here as well, so this path cannot be used to skip the checks.
    const embeddedUrl = JSON.parse(req.body).url;
    validateUrlString(embeddedUrl);

    return Url.parse(embeddedUrl).href;
  } catch (e) {
    // Best effort: no body, a non-JSON body, or a rejected URL all mean
    // "no URL", which callers answer with a 400.
    return undefined;
  }
}
|
|
|
|
/**
 * Opens `url` in a fresh incognito browser context and waits for it to load.
 *
 * On success the caller owns the returned `context` and must close it. If the
 * function throws, the context it created is closed here first, because the
 * caller never receives a handle to it.
 *
 * @param {string} url Absolute URL to load; validated before any browser work.
 * @returns {Promise<Object>} `{ context, page, response, finalUrl, contentType }`,
 *   or `{ context, page, finalUrl, contentType: 'application/pdf' }` when the
 *   navigation failed after a PDF response had been seen.
 * @throws {Error} When validation or navigation fails.
 */
async function retrievePage(url) {
  validateUrlString(url);

  const browser = await getBrowserPromise;
  logRecord.timing = { ...logRecord.timing, browserOpened: Date.now() - functionStartTime };

  const context = await browser.createIncognitoBrowserContext();
  try {
    const page = await context.newPage();
    await page.setUserAgent(userAgentForUrl(url));

    const client = await page.target().createCDPSession();

    // intercept request when response headers was received
    await client.send('Network.setRequestInterception', {
      patterns: [
        {
          urlPattern: '*',
          resourceType: 'Document',
          interceptionStage: 'HeadersReceived',
        },
      ],
    });

    const path = require('path');
    const download_path = path.resolve('./download_dir/');

    // NOTE(review): `page._client` is a private Puppeteer member — confirm it
    // still exists in the pinned puppeteer-core version before upgrading.
    await page._client.send('Page.setDownloadBehavior', {
      behavior: 'allow',
      userDataDir: './',
      downloadPath: download_path,
    });

    client.on('Network.requestIntercepted', async e => {
      const headers = e.responseHeaders || {};

      const [contentType] = (headers['content-type'] || headers['Content-Type'] || '')
        .toLowerCase()
        .split(';');
      const obj = { interceptionId: e.interceptionId };

      if (e.responseStatusCode >= 200 && e.responseStatusCode < 300) {
        // We only check content-type on success responses
        // as it doesn't matter what the content type is for things
        // like redirects
        if (contentType && !ALLOWED_CONTENT_TYPES.includes(contentType)) {
          obj['errorReason'] = 'BlockedByClient';
        }
      }

      try {
        await client.send('Network.continueInterceptedRequest', obj);
        // Deliberately ignored: failures here are treated as best-effort.
        // eslint-disable-next-line no-empty
      } catch {}
    });

    /*
     * Disallow MathJax from running in Puppeteer and modifying the document,
     * we shall instead run it in our frontend application to transform any
     * mathjax content when present.
     */
    await page.setRequestInterception(true);
    let requestCount = 0;
    page.on('request', request => {
      // Cap the number of requests a single page load may issue.
      if (requestCount++ > 100) {
        request.abort();
        return;
      }
      if (
        request.resourceType() === 'script' &&
        request.url().toLowerCase().indexOf('mathjax') > -1
      ) {
        request.abort();
      } else {
        request.continue();
      }
    });

    // Puppeteer fails during download of PDf files,
    // so record the failure and use those items
    let lastPdfUrl = undefined;
    page.on('response', response => {
      // Compare the media type only, so a header with parameters such as
      // "application/pdf; charset=binary" is still recognised.
      const [mediaType] = (response.headers()['content-type'] || '').toLowerCase().split(';');
      if (mediaType.trim() === 'application/pdf') {
        lastPdfUrl = response.url();
      }
    });

    try {
      const response = await page.goto(url, { waitUntil: ['networkidle2'] });
      const finalUrl = response.url();
      const contentType = response.headers()['content-type'];

      logRecord.finalUrl = response.url();
      logRecord.contentType = response.headers()['content-type'];

      return { context, page, response, finalUrl: finalUrl, contentType: contentType };
    } catch (error) {
      if (lastPdfUrl) {
        return { context, page, finalUrl: lastPdfUrl, contentType: 'application/pdf' };
      }
      throw error;
    }
  } catch (error) {
    // Nothing is returned on this path, so release the context here; a failure
    // while closing must not mask the original error.
    try {
      await context.close();
    } catch (closeError) {
      console.log('error closing browser context', closeError);
    }
    throw error;
  }
}
|
|
|
|
/**
 * Captures the title and cleaned-up HTML of an already loaded page.
 *
 * Steps: read the title, scroll to the bottom (at most 5 seconds), collect
 * the HTML of Instagram iframes, then rewrite the DOM in the page: blurred
 * images are removed, near-empty elements with a CSS background image become
 * <img> elements, and collected iframes are inlined.
 *
 * Failures are recorded on `logRecord` and do not throw; whatever was
 * gathered up to that point is returned.
 *
 * @param {Object} page Puppeteer page that has finished navigating.
 * @returns {Promise<{domContent: string, title: (string|undefined)}>}
 *   `domContent` is '' when capturing failed.
 */
async function retrieveHtml(page) {
  let domContent = '';
  let title;
  try {
    title = await page.title();
    logRecord.title = title;

    const pageScrollingStart = Date.now();
    // Scroll errors are recorded but never abort the capture.
    const scrollToBottom = async () => {
      try {
        await page.evaluate(`(async () => {
          /* credit: https://github.com/puppeteer/puppeteer/issues/305 */
          return new Promise((resolve, reject) => {
            let scrollHeight = document.body.scrollHeight;
            let totalHeight = 0;
            let distance = 500;
            let timer = setInterval(() => {
              window.scrollBy(0, distance);
              totalHeight += distance;
              if(totalHeight >= scrollHeight){
                clearInterval(timer);
                resolve(true);
              }
            }, 10);
          });
        })()`);
      } catch (e) {
        logRecord.scrollError = true;
      }
    };
    /* scroll with a 5 second timeout */
    await Promise.race([
      scrollToBottom(),
      page.waitForTimeout(5000), //5 second timeout
    ]);
    logRecord.timing = { ...logRecord.timing, pageScrolled: Date.now() - pageScrollingStart };

    const iframes = {};
    const urls = [];
    const framesPromises = [];
    // No `g` flag: a global regex keeps `lastIndex` between `.test()` calls,
    // which made every second matching frame fail the check.
    const allowedUrls = /instagram\.com/i;

    for (const frame of page.mainFrame().childFrames()) {
      if (frame.url() && allowedUrls.test(frame.url())) {
        urls.push(frame.url());
        framesPromises.push(frame.evaluate(el => el.innerHTML, await frame.$('body')));
      }
    }

    (await Promise.all(framesPromises)).forEach((frame, index) => (iframes[urls[index]] = frame));

    const domContentCapturingStart = Date.now();
    // get document body with all hidden elements removed
    // NOTE: this callback runs inside the page, so it must stay self-contained.
    domContent = await page.evaluate(iframes => {
      const BI_SRC_REGEXP = /url\("(.+?)"\)/i;

      Array.from(document.body.getElementsByTagName('*')).forEach(el => {
        const style = window.getComputedStyle(el);

        // Removing blurred images since they are mostly the copies of lazy loaded ones
        if (['img', 'image'].includes(el.tagName.toLowerCase())) {
          const filter = style.getPropertyValue('filter');
          if (filter && filter.startsWith('blur')) {
            el.parentNode && el.parentNode.removeChild(el);
          }
        }

        // convert all nodes with background image to img nodes
        if (!['', 'none'].includes(style.getPropertyValue('background-image'))) {
          const filter = style.getPropertyValue('filter');
          // avoiding image nodes with a blur effect creation
          if (filter && filter.startsWith('blur')) {
            el && el.parentNode && el.parentNode.removeChild(el);
          } else {
            const matchedSRC = BI_SRC_REGEXP.exec(style.getPropertyValue('background-image'));

            if (matchedSRC && matchedSRC[1] && !el.src) {
              // Replacing element only if there is no content inside, b/c might remove important div with content.
              // Article example: http://www.josiahzayner.com/2017/01/genetic-designer-part-i.html
              // DIV with class "content-inner" has `url("https://resources.blogblog.com/blogblog/data/1kt/travel/bg_container.png")` background image.
              if (el.innerHTML.length < 25) {
                const img = document.createElement('img');
                img.src = matchedSRC[1];
                // Swap the element for the image; removing it would drop the
                // image that was just built.
                el && el.parentNode && el.parentNode.replaceChild(img, el);
              }
            }
          }
        }

        if (el.tagName === 'IFRAME') {
          if (iframes[el.src]) {
            const newNode = document.createElement('div');
            newNode.className = 'omnivore-instagram-embed';
            newNode.innerHTML = iframes[el.src];
            el && el.parentNode && el.parentNode.replaceChild(newNode, el);
          }
        }
      });
      return document.documentElement.innerHTML;
    }, iframes);
    logRecord.puppeteerSuccess = true;
    logRecord.timing = {
      ...logRecord.timing,
      // The key's spelling is kept as is, since log consumers may depend on it.
      contenCaptured: Date.now() - domContentCapturingStart,
    };

    // [END puppeteer-block]
  } catch (e) {
    if (e.message.startsWith('net::ERR_BLOCKED_BY_CLIENT at ')) {
      logRecord.blockedByClient = true;
    } else {
      logRecord.puppeteerSuccess = false;
      logRecord.puppeteerError = {
        message: e.message,
        stack: e.stack,
      };
    }
  }
  return { domContent, title };
}
|