convert content-fetch to typescript

This commit is contained in:
Hongbo Wu
2024-01-18 11:03:57 +08:00
parent cd3402b98a
commit d9feb740cb
28 changed files with 591 additions and 596 deletions

View File

@ -7,45 +7,6 @@ declare module '*.graphql' {
export = schema export = schema
} }
declare module 'knex-stringcase' {
import { Knex } from 'knex'
type StringCase =
| 'camelcase'
| 'capitalcase'
| 'constcase'
| 'cramcase'
| 'decapitalcase'
| 'dotcase'
| 'enumcase'
| 'lowercase'
| 'pascalcase'
| 'pathcase'
| 'sentencecase'
| 'snakecase'
| 'spacecase'
| 'spinalcase'
| 'titlecase'
| 'trimcase'
| 'uppercase'
interface KnexStringCaseConfig extends Knex.Config {
appStringcase?: StringCase | StringCase[]
dbStringcase?: StringCase | StringCase[]
/* eslint-disable @typescript-eslint/no-explicit-any */
beforePostProcessResponse?(
result: any[] | object,
queryContext: object
): any[] | object
beforeWrapIdentifier?(value: string, queryContext: object): string
/* eslint-enable @typescript-eslint/no-explicit-any */
ignoreStringcase?(obj: object): boolean
}
function knexStringcase(config: KnexStringCaseConfig): Knex.Config
export = knexStringcase
}
declare module 'voca/slugify' { declare module 'voca/slugify' {
function slugify(subject?: string): string function slugify(subject?: string): string

View File

@ -1,4 +1,5 @@
node_modules node_modules
build
.env* .env*
Dockerfile Dockerfile
.dockerignore .dockerignore

View File

@ -0,0 +1,2 @@
node_modules/
build/

View File

@ -0,0 +1,6 @@
{
"extends": "../../.eslintrc",
"parserOptions": {
"project": "tsconfig.json"
}
}

View File

@ -1,21 +0,0 @@
# This file specifies files that are *not* uploaded to Google Cloud Platform
# using gcloud. It follows the same syntax as .gitignore, with the addition of
# "#!include" directives (which insert the entries of the given .gitignore-style
# file at that point).
#
# For more information, run:
# $ gcloud topic gcloudignore
#
.gcloudignore
# If you would like to upload your .git directory, .gitignore file or files
# from your .gitignore file, remove the corresponding line
# below:
.git
.gitignore
node_modules
.env*
.secrets*
Dockerfile*
previewImage.*
*.sa.json

View File

@ -32,6 +32,7 @@ ADD /packages/content-handler ./packages/content-handler
ADD /packages/puppeteer-parse ./packages/puppeteer-parse ADD /packages/puppeteer-parse ./packages/puppeteer-parse
ADD /packages/readabilityjs ./packages/readabilityjs ADD /packages/readabilityjs ./packages/readabilityjs
RUN yarn workspace @omnivore/content-handler build RUN yarn workspace @omnivore/content-handler build
RUN yarn workspace @omnivore/puppeteer-parse build
# After building, fetch the production dependencies # After building, fetch the production dependencies
RUN rm -rf /app/packages/content-fetch/node_modules RUN rm -rf /app/packages/content-fetch/node_modules

View File

@ -37,6 +37,7 @@ ADD /packages/puppeteer-parse ./packages/puppeteer-parse
ADD /packages/content-fetch ./packages/content-fetch ADD /packages/content-fetch ./packages/content-fetch
ADD /packages/readabilityjs ./packages/readabilityjs ADD /packages/readabilityjs ./packages/readabilityjs
RUN yarn workspace @omnivore/content-handler build RUN yarn workspace @omnivore/content-handler build
RUN yarn workspace @omnivore/puppeteer-parse build
# After building, fetch the production dependencies # After building, fetch the production dependencies
RUN rm -rf /app/packages/content-fetch/node_modules RUN rm -rf /app/packages/content-fetch/node_modules

View File

@ -1,205 +0,0 @@
const axios = require('axios');
const jwt = require('jsonwebtoken');
const { promisify } = require('util');
const signToken = promisify(jwt.sign);
const IMPORTER_METRICS_COLLECTOR_URL = process.env.IMPORTER_METRICS_COLLECTOR_URL;
const REQUEST_TIMEOUT = 30000; // 30 seconds
exports.uploadToSignedUrl = async ({ id, uploadSignedUrl }, contentType, contentObjUrl) => {
try {
const stream = await axios.get(contentObjUrl, { responseType: 'stream', timeout: REQUEST_TIMEOUT });
return axios.put(uploadSignedUrl, stream.data, {
headers: {
'Content-Type': contentType,
},
maxBodyLength: 1000000000,
maxContentLength: 100000000,
timeout: REQUEST_TIMEOUT,
});
} catch (error) {
console.error('error uploading to signed url', error.message);
return null;
}
};
exports.getUploadIdAndSignedUrl = async (userId, url, articleSavingRequestId) => {
const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
const data = JSON.stringify({
query: `mutation UploadFileRequest($input: UploadFileRequestInput!) {
uploadFileRequest(input:$input) {
... on UploadFileRequestError {
errorCodes
}
... on UploadFileRequestSuccess {
id
uploadSignedUrl
}
}
}`,
variables: {
input: {
url,
contentType: 'application/pdf',
clientRequestId: articleSavingRequestId,
}
}
});
try {
const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data,
{
headers: {
Cookie: `auth=${auth};`,
'Content-Type': 'application/json',
},
timeout: REQUEST_TIMEOUT,
});
if (response.data.data.uploadFileRequest.errorCodes && response.data.data.uploadFileRequest.errorCodes.length > 0) {
console.error('Error while getting upload id and signed url', response.data.data.uploadFileRequest.errorCodes[0]);
return null;
}
return response.data.data.uploadFileRequest;
} catch (e) {
console.error('error getting upload id and signed url', e.message);
return null;
}
};
exports.uploadPdf = async (url, userId, articleSavingRequestId) => {
validateUrlString(url);
const uploadResult = await getUploadIdAndSignedUrl(userId, url, articleSavingRequestId);
if (!uploadResult) {
throw new Error('error while getting upload id and signed url');
}
const uploaded = await uploadToSignedUrl(uploadResult, 'application/pdf', url);
if (!uploaded) {
throw new Error('error while uploading pdf');
}
return uploadResult.id;
};
exports.sendCreateArticleMutation = async (userId, input) => {
const data = JSON.stringify({
query: `mutation CreateArticle ($input: CreateArticleInput!){
createArticle(input:$input){
... on CreateArticleSuccess{
createdArticle{
id
}
}
... on CreateArticleError{
errorCodes
}
}
}`,
variables: {
input,
},
});
const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
try {
const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data,
{
headers: {
Cookie: `auth=${auth};`,
'Content-Type': 'application/json',
},
timeout: REQUEST_TIMEOUT,
});
if (response.data.data.createArticle.errorCodes && response.data.data.createArticle.errorCodes.length > 0) {
console.error('error while creating article', response.data.data.createArticle.errorCodes[0]);
return null;
}
return response.data.data.createArticle;
} catch (error) {
console.error('error creating article', error.message);
return null;
}
};
exports.sendSavePageMutation = async (userId, input) => {
const data = JSON.stringify({
query: `mutation SavePage ($input: SavePageInput!){
savePage(input:$input){
... on SaveSuccess{
url
clientRequestId
}
... on SaveError{
errorCodes
}
}
}`,
variables: {
input,
},
});
const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
try {
const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data,
{
headers: {
Cookie: `auth=${auth};`,
'Content-Type': 'application/json',
},
timeout: REQUEST_TIMEOUT,
});
if (response.data.data.savePage.errorCodes && response.data.data.savePage.errorCodes.length > 0) {
console.error('error while saving page', response.data.data.savePage.errorCodes[0]);
if (response.data.data.savePage.errorCodes[0] === 'UNAUTHORIZED') {
return { error: 'UNAUTHORIZED' };
}
return null;
}
return response.data.data.savePage;
} catch (error) {
console.error('error saving page', error.message);
return null;
}
};
exports.saveUploadedPdf = async (userId, url, uploadFileId, articleSavingRequestId) => {
return sendCreateArticleMutation(userId, {
url: encodeURI(url),
articleSavingRequestId,
uploadFileId: uploadFileId,
state,
labels,
source,
folder,
},
);
};
exports.sendImportStatusUpdate = async (userId, taskId, status) => {
try {
const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
await axios.post(
IMPORTER_METRICS_COLLECTOR_URL,
{
taskId,
status,
},
{
headers: {
'Authorization': auth,
'Content-Type': 'application/json',
},
timeout: REQUEST_TIMEOUT,
});
} catch (e) {
console.error('error while sending import status update', e);
}
};

View File

@ -1,35 +0,0 @@
require('dotenv').config();
const express = require('express');
const { contentFetchRequestHandler } = require('./request_handler');
const app = express();
app.use(express.json());
app.use(express.urlencoded({ extended: true }));
if (!process.env.VERIFICATION_TOKEN) {
throw new Error('VERIFICATION_TOKEN environment variable is not set');
}
app.all('/', async (req, res) => {
if (req.method !== 'GET' && req.method !== 'POST') {
console.error('request method is not GET or POST')
return res.sendStatus(405)
}
if (req.query.token !== process.env.VERIFICATION_TOKEN) {
console.error('query does not include valid token')
return res.sendStatus(403)
}
return contentFetchRequestHandler(req, res);
});
const PORT = parseInt(process.env.PORT) || 8080;
app.listen(PORT, () => {
console.log(`App listening on port ${PORT}`);
console.log('Press Ctrl+C to quit.');
});
module.exports = app;

View File

@ -1,75 +0,0 @@
const { interfaces } = require('mocha');
const { uploadPdf, sendSavePageMutation, sendCreateArticleMutation, sendImportStatusUpdate } = require('./api');
interface Item {
url: string;
userId: string;
contentType: string;
articleSavingRequestId: string;
state: string;
labels: string[];
source: string;
folder: string;
rssFeedUrl: string;
savedAt: string;
publishedAt: string;
readabilityResult: string;
}
exports.saveItem = async (item: Item) => {
const { url, userId, contentType, articleSavingRequestId, state, labels, source, folder, rssFeedUrl, savedAt, publishedAt, readabilityResult } = item;
try {
if (contentType === 'application/pdf') {
const uploadFileId = await uploadPdf(url, userId, articleSavingRequestId);
const uploadedPdf = await sendCreateArticleMutation(userId, {
url: encodeURI(url),
articleSavingRequestId,
uploadFileId,
state,
labels,
source,
folder,
rssFeedUrl,
savedAt,
publishedAt,
});
if (!uploadedPdf) {
console.error('error while saving uploaded pdf', url);
return false;
}
} else {
const apiResponse = await sendSavePageMutation(userId, {
url,
clientRequestId: articleSavingRequestId,h
title,
originalContent: content,
parseResult: readabilityResult,
state,
labels,
rssFeedUrl,
savedAt,
publishedAt,
source,
folder,
});
if (!apiResponse) {
console.error('error while saving page', url);
return false;
} else if (apiResponse.error === 'UNAUTHORIZED') {
console.log('user is deleted, do not retry', userId);
return true;
} else {
importStatus = readabilityResult ? 'imported' : 'failed';
}
}
} catch (error) {
logRecord.error = error.message;
} finally {
// mark import failed on the last failed retry
const retryCount = req.headers['x-cloudtasks-taskretrycount'];
if (retryCount === MAX_RETRY_COUNT) {
console.log('max retry count reached');
importStatus = importStatus || 'failed';
}
}
}

View File

@ -1,66 +0,0 @@
const { config, format, loggers, transports } = require('winston');
const { LoggingWinston } = require('@google-cloud/logging-winston');
const { DateTime } = require('luxon');
const colors = {
emerg: 'inverse underline magenta',
alert: 'underline magenta',
crit: 'inverse underline red', // Any error that is forcing a shutdown of the service or application to prevent data loss.
error: 'underline red', // Any error which is fatal to the operation, but not the service or application
warning: 'underline yellow', // Anything that can potentially cause application oddities
notice: 'underline cyan', // Normal but significant condition
info: 'underline green', // Generally useful information to log
debug: 'underline gray',
};
const googleConfigs = {
level: 'info',
logName: 'logger',
levels: config.syslog.levels,
resource: {
labels: {
function_name: process.env.FUNCTION_TARGET,
project_id: process.env.GCP_PROJECT,
},
type: 'cloud_function',
},
};
function localConfig(id) {
return {
level: 'debug',
format: format.combine(
format.colorize({ all: true, colors }),
format(info =>
Object.assign(info, {
timestamp: DateTime.local().toLocaleString(DateTime.TIME_24_WITH_SECONDS),
}),
)(),
format.printf(info => {
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const { timestamp, message, level, ...meta } = info;
return `[${id}@${info.timestamp}] ${info.message}${
Object.keys(meta).length ? '\n' + JSON.stringify(meta, null, 4) : ''
}`;
}),
),
};
}
function buildLoggerTransport(id, options) {
return process.env.IS_LOCAL
? new transports.Console(localConfig(id))
: new LoggingWinston({ ...googleConfigs, ...{ logName: id }, ...options });
}
function buildLogger(id, options) {
return loggers.get(id, {
levels: config.syslog.levels,
transports: [buildLoggerTransport(id, options)],
});
}
module.exports = {
buildLogger,
}

View File

@ -0,0 +1,5 @@
{
"extension": ["ts"],
"spec": "test/**/*.test.ts",
"require": "test/babel-register.js"
}

View File

@ -2,7 +2,10 @@
"name": "@omnivore/content-fetch", "name": "@omnivore/content-fetch",
"version": "1.0.0", "version": "1.0.0",
"description": "Service that fetches page content from a URL", "description": "Service that fetches page content from a URL",
"main": "index.js", "main": "build/src/index.js",
"files": [
"build/src"
],
"dependencies": { "dependencies": {
"axios": "^0.27.2", "axios": "^0.27.2",
"dotenv": "^8.2.0", "dotenv": "^8.2.0",
@ -18,9 +21,12 @@
"mocha": "^10.0.0" "mocha": "^10.0.0"
}, },
"scripts": { "scripts": {
"start": "node app.js", "test": "yarn mocha -r ts-node/register --config mocha-config.json",
"start_gcf": "npx functions-framework --port=9090 --target=puppeteer", "test:typecheck": "tsc --noEmit",
"test": "mocha test/*.js" "lint": "eslint src --ext ts,js,tsx,jsx",
"build": "tsc",
"start": "node build/src/app.js",
"start_gcf": "functions-framework --port=9090 --target=puppeteer"
}, },
"volta": { "volta": {
"extends": "../../package.json" "extends": "../../package.json"

View File

@ -1,114 +0,0 @@
const { fetchContent } = require("@omnivore/puppeteer-parse");
const { uploadPdf, sendSavePageMutation, sendCreateArticleMutation, sendImportStatusUpdate } = require('./api');
const MAX_RETRY_COUNT = process.env.MAX_RETRY_COUNT || '1';
exports.contentFetchRequestHandler = async (req, res) => {
let functionStartTime = Date.now();
const userId = (req.query ? req.query.userId : undefined) || (req.body ? req.body.userId : undefined);
const articleSavingRequestId = (req.query ? req.query.saveRequestId : undefined) || (req.body ? req.body.saveRequestId : undefined);
const state = req.body.state
const labels = req.body.labels
const source = req.body.source || 'puppeteer-parse';
const taskId = req.body.taskId; // taskId is used to update import status
const url = (req.query ? req.query.url : undefined) || (req.body ? req.body.url : undefined);
const locale = (req.query ? req.query.locale : undefined) || (req.body ? req.body.locale : undefined);
const timezone = (req.query ? req.query.timezone : undefined) || (req.body ? req.body.timezone : undefined);
const rssFeedUrl = req.body.rssFeedUrl;
const savedAt = req.body.savedAt;
const publishedAt = req.body.publishedAt;
const folder = req.body.folder;
const users = req.body ? req.body.users : undefined; // users is used when saving article for multiple users
let logRecord = {
url,
userId,
articleSavingRequestId,
labels: {
source,
},
state,
labelsToAdd: labels,
taskId: taskId,
locale,
timezone,
rssFeedUrl,
savedAt,
publishedAt,
folder,
users,
};
console.log(`Article parsing request`, logRecord);
let importStatus, statusCode = 200;
try {
const { finalUrl, title, content, readabilityResult, contentType } = await fetchContent(url, locale, timezone);
if (contentType === 'application/pdf') {
const uploadFileId = await uploadPdf(finalUrl, userId, articleSavingRequestId);
const uploadedPdf = await sendCreateArticleMutation(userId, {
url: encodeURI(finalUrl),
articleSavingRequestId,
uploadFileId,
state,
labels,
source,
folder,
rssFeedUrl,
savedAt,
publishedAt,
});
if (!uploadedPdf) {
statusCode = 500;
logRecord.error = 'error while saving uploaded pdf';
} else {
importStatus = 'imported';
}
} else {
const apiResponse = await sendSavePageMutation(userId, {
url,
clientRequestId: articleSavingRequestId,
title,
originalContent: content,
parseResult: readabilityResult,
state,
labels,
rssFeedUrl,
savedAt,
publishedAt,
source,
folder,
});
if (!apiResponse) {
logRecord.error = 'error while saving page';
statusCode = 500;
} else if (apiResponse.error === 'UNAUTHORIZED') {
console.log('user is deleted, do not retry', logRecord);
return res.sendStatus(200);
} else {
importStatus = readabilityResult ? 'imported' : 'failed';
}
}
} catch (error) {
logRecord.error = error.message;
} finally {
logRecord.totalTime = Date.now() - functionStartTime;
console.log(`parse-page result`, logRecord);
// mark import failed on the last failed retry
const retryCount = req.headers['x-cloudtasks-taskretrycount'];
if (retryCount === MAX_RETRY_COUNT) {
console.log('max retry count reached');
importStatus = importStatus || 'failed';
}
// send import status to update the metrics
if (taskId && importStatus) {
await sendImportStatusUpdate(userId, taskId, importStatus);
}
res.sendStatus(statusCode);
}
}

View File

@ -0,0 +1,311 @@
import axios from 'axios'
import jwt from 'jsonwebtoken'
import { promisify } from 'util'
// Promisified jwt.sign so auth tokens can be awaited below.
const signToken = promisify(jwt.sign)
// Required service configuration, read once at module load.
const IMPORTER_METRICS_COLLECTOR_URL =
process.env.IMPORTER_METRICS_COLLECTOR_URL
const JWT_SECRET = process.env.JWT_SECRET
const REST_BACKEND_ENDPOINT = process.env.REST_BACKEND_ENDPOINT
// Fail fast at import time rather than on the first request.
if (!IMPORTER_METRICS_COLLECTOR_URL || !JWT_SECRET || !REST_BACKEND_ENDPOINT) {
throw new Error('Missing environment variables')
}
// Shared timeout for all outbound HTTP calls in this module.
const REQUEST_TIMEOUT = 30000 // 30 seconds
/**
 * Streams the object at `contentObjUrl` into a pre-signed upload URL.
 *
 * @param uploadSignedUrl - destination signed URL returned by the backend
 * @param contentType - Content-Type header to set on the upload
 * @param contentObjUrl - source URL to stream from
 * @returns the axios PUT response, or null if either request failed
 */
export const uploadToSignedUrl = async (
  uploadSignedUrl: string,
  contentType: string,
  contentObjUrl: string
) => {
  try {
    // Stream the source so large files are not buffered fully in memory.
    const stream = await axios.get(contentObjUrl, {
      responseType: 'stream',
      timeout: REQUEST_TIMEOUT,
    })
    // Await the PUT so a failed upload is caught below and reported as
    // null, instead of escaping the try/catch as an unhandled rejection
    // (the original returned the un-awaited promise).
    return await axios.put(uploadSignedUrl, stream.data, {
      headers: {
        'Content-Type': contentType,
      },
      maxBodyLength: 1000000000,
      maxContentLength: 100000000,
      timeout: REQUEST_TIMEOUT,
    })
  } catch (error) {
    console.error('error uploading to signed url', error)
    return null
  }
}
// GraphQL envelope for the UploadFileRequest mutation. Only `id` and
// `uploadSignedUrl` are selected by the query in getUploadIdAndSignedUrl;
// `errorCodes` is present on the error branch of the union.
// NOTE(review): `uploadFileId`/`createdPageId` are not in the selection set
// below — confirm they are actually returned before relying on them.
interface UploadFileResponse {
data: {
uploadFileRequest: {
id: string
uploadSignedUrl: string
uploadFileId: string
createdPageId: string
errorCodes?: string[]
}
}
}
/**
 * Asks the backend for an upload slot for a PDF.
 *
 * Signs a short-lived JWT for `userId`, sends the UploadFileRequest
 * GraphQL mutation, and returns `{ id, uploadSignedUrl, ... }` on
 * success or null on any error (logged, never thrown).
 */
export const getUploadIdAndSignedUrl = async (
userId: string,
url: string,
articleSavingRequestId: string
) => {
// Auth cookie value expected by the GraphQL backend.
const auth = await signToken({ uid: userId }, JWT_SECRET)
const data = JSON.stringify({
query: `mutation UploadFileRequest($input: UploadFileRequestInput!) {
      uploadFileRequest(input:$input) {
        ... on UploadFileRequestError {
          errorCodes
        }
        ... on UploadFileRequestSuccess {
          id
          uploadSignedUrl
        }
      }
    }`,
variables: {
input: {
url,
contentType: 'application/pdf',
clientRequestId: articleSavingRequestId,
},
},
})
try {
const response = await axios.post<UploadFileResponse>(
`${REST_BACKEND_ENDPOINT}/graphql`,
data,
{
headers: {
Cookie: `auth=${auth as string};`,
'Content-Type': 'application/json',
},
timeout: REQUEST_TIMEOUT,
}
)
// A populated errorCodes array means the error branch of the union fired.
if (
response.data.data.uploadFileRequest.errorCodes &&
response.data.data.uploadFileRequest.errorCodes?.length > 0
) {
console.error(
'Error while getting upload id and signed url',
response.data.data.uploadFileRequest.errorCodes[0]
)
return null
}
return response.data.data.uploadFileRequest
} catch (e) {
console.error('error getting upload id and signed url', e)
return null
}
}
// GraphQL envelope for the CreateArticle mutation; `errorCodes` is only
// populated on the CreateArticleError branch of the union.
interface CreateArticleResponse {
data: {
createArticle: {
createdArticle: {
id: string
}
errorCodes: string[]
}
}
}
/**
 * End-to-end PDF upload: requests an upload slot from the backend, then
 * streams the PDF at `url` into the returned signed URL.
 *
 * @returns the upload file id on success
 * @throws Error when either the slot request or the upload itself fails
 */
export const uploadPdf = async (
  url: string,
  userId: string,
  articleSavingRequestId: string
) => {
  const slot = await getUploadIdAndSignedUrl(
    userId,
    url,
    articleSavingRequestId
  )
  if (!slot) {
    throw new Error('error while getting upload id and signed url')
  }

  const putResult = await uploadToSignedUrl(
    slot.uploadSignedUrl,
    'application/pdf',
    url
  )
  if (!putResult) {
    throw new Error('error while uploading pdf')
  }

  return slot.id
}
/**
 * Sends the CreateArticle GraphQL mutation on behalf of `userId`.
 *
 * @param input - CreateArticleInput payload, passed through as-is
 * @returns the createArticle result object, or null on any error
 */
export const sendCreateArticleMutation = async (
userId: string,
input: unknown
) => {
const data = JSON.stringify({
query: `mutation CreateArticle ($input: CreateArticleInput!){
          createArticle(input:$input){
            ... on CreateArticleSuccess{
              createdArticle{
                id
              }
            }
            ... on CreateArticleError{
              errorCodes
            }
          }
    }`,
variables: {
input,
},
})
// Cookie-based auth, same scheme as the other mutations in this module.
const auth = await signToken({ uid: userId }, JWT_SECRET)
try {
const response = await axios.post<CreateArticleResponse>(
`${REST_BACKEND_ENDPOINT}/graphql`,
data,
{
headers: {
Cookie: `auth=${auth as string};`,
'Content-Type': 'application/json',
},
timeout: REQUEST_TIMEOUT,
}
)
// errorCodes only present on the error branch of the union.
if (
response.data.data.createArticle.errorCodes &&
response.data.data.createArticle.errorCodes.length > 0
) {
console.error(
'error while creating article',
response.data.data.createArticle.errorCodes[0]
)
return null
}
return response.data.data.createArticle
} catch (error) {
console.error('error creating article', error)
return null
}
}
// GraphQL envelope for the SavePage mutation; `errorCodes` appears only on
// the SaveError branch of the union.
interface SavePageResponse {
data: {
savePage: {
url: string
clientRequestId: string
errorCodes?: string[]
}
}
}
/**
 * Sends the SavePage GraphQL mutation on behalf of `userId`.
 *
 * @param input - SavePageInput payload, passed through as-is
 * @returns the savePage result on success, `{ error: 'UNAUTHORIZED' }` when
 *   the backend rejects the user (so callers can stop retrying for deleted
 *   users), or null on any other error
 */
export const sendSavePageMutation = async (userId: string, input: unknown) => {
const data = JSON.stringify({
query: `mutation SavePage ($input: SavePageInput!){
          savePage(input:$input){
            ... on SaveSuccess{
              url
              clientRequestId
            }
            ... on SaveError{
              errorCodes
            }
          }
    }`,
variables: {
input,
},
})
const auth = await signToken({ uid: userId }, JWT_SECRET)
try {
const response = await axios.post<SavePageResponse>(
`${REST_BACKEND_ENDPOINT}/graphql`,
data,
{
headers: {
Cookie: `auth=${auth as string};`,
'Content-Type': 'application/json',
},
timeout: REQUEST_TIMEOUT,
}
)
if (
response.data.data.savePage.errorCodes &&
response.data.data.savePage.errorCodes.length > 0
) {
console.error(
'error while saving page',
response.data.data.savePage.errorCodes[0]
)
// UNAUTHORIZED is surfaced as a sentinel, not null, so the request
// handler can distinguish "deleted user" from a transient failure.
if (response.data.data.savePage.errorCodes[0] === 'UNAUTHORIZED') {
return { error: 'UNAUTHORIZED' }
}
return null
}
return response.data.data.savePage
} catch (error) {
console.error('error saving page', error)
return null
}
}
/**
 * Records an already-uploaded PDF as an article via CreateArticle.
 * Thin convenience wrapper: assembles the mutation input from the
 * individual arguments and delegates to sendCreateArticleMutation.
 */
export const saveUploadedPdf = async (
  userId: string,
  url: string,
  uploadFileId: string,
  articleSavingRequestId: string,
  state: string,
  labels: string[],
  source: string,
  folder: string
) => {
  const input = {
    url: encodeURI(url),
    articleSavingRequestId,
    uploadFileId,
    state,
    labels,
    source,
    folder,
  }
  return sendCreateArticleMutation(userId, input)
}
/**
 * Reports an import task's status to the importer metrics collector.
 * Best-effort: failures are logged and swallowed so metrics reporting
 * never breaks the main save flow.
 */
export const sendImportStatusUpdate = async (
  userId: string,
  taskId: string,
  status: string
) => {
  try {
    const token = await signToken({ uid: userId }, JWT_SECRET)
    const payload = { taskId, status }
    await axios.post(IMPORTER_METRICS_COLLECTOR_URL, payload, {
      headers: {
        Authorization: token as string,
        'Content-Type': 'application/json',
      },
      timeout: REQUEST_TIMEOUT,
    })
  } catch (e) {
    console.error('error while sending import status update', e)
  }
}

View File

@ -0,0 +1,34 @@
import 'dotenv/config'
import express from 'express'
import { contentFetchRequestHandler } from './request_handler'

// NOTE: the previous version logged the full process.env here, which dumps
// VERIFICATION_TOKEN, JWT_SECRET and every other secret into the logs.
// Removed — never log the raw environment.

const app = express()
app.use(express.json())
app.use(express.urlencoded({ extended: true }))

// Fail fast at startup if the shared-secret token is missing.
if (!process.env.VERIFICATION_TOKEN) {
  throw new Error('VERIFICATION_TOKEN environment variable is not set')
}

// Single catch-all route: only GET/POST requests carrying the valid
// ?token= shared secret reach the content-fetch handler.
app.all('/', (req, res, next) => {
  if (req.method !== 'GET' && req.method !== 'POST') {
    console.error('request method is not GET or POST')
    return res.sendStatus(405)
  }
  if (req.query.token !== process.env.VERIFICATION_TOKEN) {
    console.error('query does not include valid token')
    return res.sendStatus(403)
  }
  return contentFetchRequestHandler(req, res, next)
})

// Explicit radix: PORT is a decimal port number.
const PORT = process.env.PORT ? parseInt(process.env.PORT, 10) : 8080
app.listen(PORT, () => {
  console.log(`App listening on port ${PORT}`)
  console.log('Press Ctrl+C to quit.')
})

View File

@ -1,16 +1,12 @@
/* eslint-disable no-undef */ import { HttpFunction } from '@google-cloud/functions-framework'
/* eslint-disable no-empty */ import * as Sentry from '@sentry/serverless'
/* eslint-disable @typescript-eslint/explicit-function-return-type */ import 'dotenv/config'
/* eslint-disable @typescript-eslint/no-var-requires */ import { contentFetchRequestHandler } from './request_handler'
/* eslint-disable @typescript-eslint/no-require-imports */
require('dotenv').config();
const Sentry = require('@sentry/serverless');
const { contentFetchRequestHandler } = require('./request_handler');
Sentry.GCPFunction.init({ Sentry.GCPFunction.init({
dsn: process.env.SENTRY_DSN, dsn: process.env.SENTRY_DSN,
tracesSampleRate: 0, tracesSampleRate: 0,
}); })
/** /**
* Cloud Function entry point, HTTP trigger. * Cloud Function entry point, HTTP trigger.
@ -19,7 +15,9 @@ Sentry.GCPFunction.init({
* @param {Object} req Cloud Function request context. * @param {Object} req Cloud Function request context.
* @param {Object} res Cloud Function response context. * @param {Object} res Cloud Function response context.
*/ */
exports.puppeteer = Sentry.GCPFunction.wrapHttpFunction(contentFetchRequestHandler); export const puppeteer = Sentry.GCPFunction.wrapHttpFunction(
contentFetchRequestHandler as HttpFunction
)
/** /**
* Cloud Function entry point, HTTP trigger. * Cloud Function entry point, HTTP trigger.

View File

@ -0,0 +1,177 @@
import { fetchContent } from '@omnivore/puppeteer-parse'
import { RequestHandler } from 'express'
import {
sendCreateArticleMutation,
sendImportStatusUpdate,
sendSavePageMutation,
uploadPdf,
} from './api'
// JSON body accepted by the content-fetch endpoint.
// NOTE(review): the previous JS handler also accepted userId/url/saveRequestId
// etc. as query-string parameters; this TS version reads only the body —
// confirm all callers POST a JSON body.
interface RequestBody {
url: string
userId: string
saveRequestId: string
state?: string
labels?: string[]
source?: string
taskId?: string
locale?: string
timezone?: string
rssFeedUrl?: string
savedAt?: string
publishedAt?: string
folder?: string
users?: string[]
}
// Structured log entry emitted at request start and again (with error /
// totalTime filled in) from the handler's finally block.
interface LogRecord {
url: string
userId: string
articleSavingRequestId: string
labels: {
source: string
}
state?: string
labelsToAdd?: string[]
taskId?: string
locale?: string
timezone?: string
rssFeedUrl?: string
savedAt?: string
publishedAt?: string
folder?: string
users?: string[]
error?: string
totalTime?: number
}
// Max Cloud Tasks retries before an import is marked failed. Kept as a
// string because it is compared against the raw retry-count header below.
const MAX_RETRY_COUNT = process.env.MAX_RETRY_COUNT || '1'
/**
 * HTTP entry point: fetches the page at body.url with puppeteer-parse and
 * saves it for the user — PDFs via upload + CreateArticle, everything else
 * via SavePage. Always responds with a bare status code; errors are
 * captured into logRecord rather than propagated.
 */
export const contentFetchRequestHandler: RequestHandler = async (req, res) => {
const functionStartTime = Date.now()
// assumes callers POST a JSON RequestBody — TODO confirm no caller still
// relies on the old query-string parameters.
const body = <RequestBody>req.body
const userId = body.userId
const articleSavingRequestId = body.saveRequestId
const state = body.state
const labels = body.labels
const source = body.source || 'puppeteer-parse'
const taskId = body.taskId // taskId is used to update import status
const url = body.url
const locale = body.locale
const timezone = body.timezone
const rssFeedUrl = body.rssFeedUrl
const savedAt = body.savedAt
const publishedAt = body.publishedAt
const folder = body.folder
const users = body ? body.users : undefined // users is used when saving article for multiple users
const logRecord: LogRecord = {
url,
userId,
articleSavingRequestId,
labels: {
source,
},
state,
labelsToAdd: labels,
taskId: taskId,
locale,
timezone,
rssFeedUrl,
savedAt,
publishedAt,
folder,
users,
}
console.log(`Article parsing request`, logRecord)
// importStatus stays undefined unless a definitive outcome is reached;
// statusCode is downgraded to 500 on save failures.
let importStatus,
statusCode = 200
try {
const fetchResult = await fetchContent(url, locale, timezone)
const finalUrl = fetchResult.finalUrl
const title = fetchResult.title
const content = fetchResult.content
const readabilityResult = fetchResult.readabilityResult as unknown
if (fetchResult.contentType === 'application/pdf') {
// PDF path: upload the file, then record it as an article.
const uploadFileId = await uploadPdf(
finalUrl,
userId,
articleSavingRequestId
)
const uploadedPdf = await sendCreateArticleMutation(userId, {
url: encodeURI(finalUrl),
articleSavingRequestId,
uploadFileId,
state,
labels,
source,
folder,
rssFeedUrl,
savedAt,
publishedAt,
})
if (!uploadedPdf) {
statusCode = 500
logRecord.error = 'error while saving uploaded pdf'
} else {
importStatus = 'imported'
}
} else {
// HTML path: save the parsed page content directly.
const apiResponse = await sendSavePageMutation(userId, {
url,
clientRequestId: articleSavingRequestId,
title,
originalContent: content,
parseResult: readabilityResult,
state,
labels,
rssFeedUrl,
savedAt,
publishedAt,
source,
folder,
})
if (!apiResponse) {
logRecord.error = 'error while saving page'
statusCode = 500
} else if (
'error' in apiResponse &&
apiResponse.error === 'UNAUTHORIZED'
) {
// Deleted user: return 200 so Cloud Tasks does not retry.
console.log('user is deleted, do not retry', logRecord)
return res.sendStatus(200)
} else {
importStatus = readabilityResult ? 'imported' : 'failed'
}
}
} catch (error) {
console.error(error)
if (error instanceof Error) {
logRecord.error = error.message
} else {
logRecord.error = 'unknown error'
}
} finally {
logRecord.totalTime = Date.now() - functionStartTime
console.log(`parse-page result`, logRecord)
// mark import failed on the last failed retry
// NOTE(review): string comparison against the raw header — works only
// while MAX_RETRY_COUNT is a plain decimal string like '1'; confirm.
const retryCount = req.headers['x-cloudtasks-taskretrycount']
if (retryCount === MAX_RETRY_COUNT) {
console.log('max retry count reached')
importStatus = importStatus || 'failed'
}
// send import status to update the metrics
if (taskId && importStatus) {
await sendImportStatusUpdate(userId, taskId, importStatus)
}
res.sendStatus(statusCode)
}
}

View File

@ -0,0 +1,3 @@
// Hook @babel/register so mocha can load TypeScript (and JSX) test files
// directly, without a separate compile step.
require('@babel/register').default({
  extensions: ['.ts', '.tsx', '.js', '.jsx'],
})

View File

@ -1,9 +0,0 @@
const chai = require("chai");
const expect = chai.expect;
describe('Stub test', () => {
it('should pass', () => {
expect(true).to.be.true
})
})

View File

@ -0,0 +1,8 @@
import 'mocha'
import { expect } from 'chai'
// Placeholder suite: keeps the mocha + TypeScript test pipeline exercised
// until real tests exist for this package.
describe('stub test', () => {
it('should pass', () => {
expect(true).to.be.true
})
})

View File

@ -0,0 +1,8 @@
{
"extends": "./../../tsconfig.json",
"compilerOptions": {
"outDir": "build",
"rootDir": ".",
},
"include": ["src"]
}

View File

@ -1,2 +0,0 @@
node_modules
/lib

View File

@ -1,10 +1,9 @@
{ {
"extends": "@tsconfig/node14/tsconfig.json", "extends": "./../../tsconfig.json",
"compilerOptions": { "compilerOptions": {
"rootDir": ".", "rootDir": ".",
"declaration": true, "declaration": true,
"outDir": "build", "outDir": "build"
"lib": ["dom"]
}, },
"include": ["src"] "include": ["src"]
} }

View File

@ -3,6 +3,7 @@
"version": "1.0.0", "version": "1.0.0",
"description": "Accepts URL of the article and parses its content", "description": "Accepts URL of the article and parses its content",
"main": "build/src/index.js", "main": "build/src/index.js",
"types": "build/src/index.d.ts",
"files": [ "files": [
"build/src" "build/src"
], ],

View File

@ -140,8 +140,8 @@ const getBrowserPromise = (async () => {
export const fetchContent = async ( export const fetchContent = async (
url: string, url: string,
locale: string, locale?: string,
timezone: string timezone?: string
) => { ) => {
const functionStartTime = Date.now() const functionStartTime = Date.now()
const logRecord = { const logRecord = {
@ -154,8 +154,8 @@ export const fetchContent = async (
let context: BrowserContext | undefined, let context: BrowserContext | undefined,
page: Page | undefined, page: Page | undefined,
finalUrl: string | undefined, finalUrl = '',
title: string | undefined, title = '',
content: string | undefined, content: string | undefined,
contentType: string | undefined, contentType: string | undefined,
readabilityResult: Readability.ParseResult | null | undefined readabilityResult: Readability.ParseResult | null | undefined
@ -217,7 +217,7 @@ export const fetchContent = async (
const sbResult = await fetchContentWithScrapingBee(url) const sbResult = await fetchContentWithScrapingBee(url)
title = sbResult.title title = sbResult.title
content = sbResult.domContent content = sbResult.domContent
} else { } else if (result.title && result.domContent) {
title = result.title title = result.title
content = result.domContent content = result.domContent
} }
@ -312,8 +312,8 @@ async function retrievePage(
url: string, url: string,
logRecord: Record<string, any>, logRecord: Record<string, any>,
functionStartTime: number, functionStartTime: number,
locale: string, locale?: string,
timezone: string timezone?: string
) { ) {
validateUrlString(url) validateUrlString(url)
@ -466,8 +466,7 @@ async function retrievePage(
} }
async function retrieveHtml(page: Page, logRecord: Record<string, any>) { async function retrieveHtml(page: Page, logRecord: Record<string, any>) {
let domContent = '', let domContent, title
title
try { try {
title = await page.title() title = await page.title()
logRecord.title = title logRecord.title = title

View File

@ -2,7 +2,9 @@
"extends": "./../../tsconfig.json", "extends": "./../../tsconfig.json",
"compilerOptions": { "compilerOptions": {
"outDir": "build", "outDir": "build",
"rootDir": "." "rootDir": ".",
// Generate d.ts files
"declaration": true
}, },
"include": ["src"] "include": ["src"]
} }

View File

@ -1,11 +1,10 @@
{ {
"extends": "@tsconfig/node14/tsconfig.json", "extends": "./../../tsconfig.json",
"compilerOptions": { "compilerOptions": {
"outDir": "build", "outDir": "build",
"rootDir": ".", "rootDir": ".",
"lib": ["dom"],
// Generate d.ts files // Generate d.ts files
"declaration": true "declaration": true
}, },
"include": ["src"], "include": ["src"]
} }