From d9feb740cb848f664bfe01b2fdfd78f27d4b90a6 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Thu, 18 Jan 2024 11:03:57 +0800 Subject: [PATCH] convert content-fetch to typescript --- packages/api/src/graphql.d.ts | 39 --- packages/content-fetch/.dockerignore | 1 + packages/content-fetch/.eslintignore | 2 + packages/content-fetch/.eslintrc | 6 + packages/content-fetch/.gcloudignore | 21 -- packages/content-fetch/Dockerfile | 1 + packages/content-fetch/Dockerfile-gcf | 1 + packages/content-fetch/api.js | 205 ------------ packages/content-fetch/app.js | 35 -- packages/content-fetch/item.js | 75 ----- packages/content-fetch/logger.js | 66 ---- packages/content-fetch/mocha-config.json | 5 + packages/content-fetch/package.json | 14 +- packages/content-fetch/request_handler.js | 114 ------- packages/content-fetch/src/api.ts | 311 ++++++++++++++++++ packages/content-fetch/src/app.ts | 34 ++ .../content-fetch/{index.js => src/index.ts} | 18 +- packages/content-fetch/src/request_handler.ts | 177 ++++++++++ packages/content-fetch/test/babel-register.js | 3 + packages/content-fetch/test/stub.test.js | 9 - packages/content-fetch/test/stub.test.ts | 8 + packages/content-fetch/tsconfig.json | 8 + packages/content-handler/.gitignore | 2 - packages/content-handler/tsconfig.json | 5 +- packages/puppeteer-parse/package.json | 1 + packages/puppeteer-parse/src/index.ts | 17 +- packages/puppeteer-parse/tsconfig.json | 4 +- packages/text-to-speech/tsconfig.json | 5 +- 28 files changed, 591 insertions(+), 596 deletions(-) create mode 100644 packages/content-fetch/.eslintignore create mode 100644 packages/content-fetch/.eslintrc delete mode 100644 packages/content-fetch/.gcloudignore delete mode 100644 packages/content-fetch/api.js delete mode 100644 packages/content-fetch/app.js delete mode 100644 packages/content-fetch/item.js delete mode 100644 packages/content-fetch/logger.js create mode 100644 packages/content-fetch/mocha-config.json delete mode 100644 packages/content-fetch/request_handler.js create mode 100644 packages/content-fetch/src/api.ts create mode 100644 packages/content-fetch/src/app.ts rename packages/content-fetch/{index.js => src/index.ts} (60%) create mode 100644 packages/content-fetch/src/request_handler.ts create mode 100644 packages/content-fetch/test/babel-register.js delete mode 100644 packages/content-fetch/test/stub.test.js create mode 100644 packages/content-fetch/test/stub.test.ts create mode 100644 packages/content-fetch/tsconfig.json delete mode 100644 packages/content-handler/.gitignore diff --git a/packages/api/src/graphql.d.ts b/packages/api/src/graphql.d.ts index 6c4bc953c..4c2e4693f 100755 --- a/packages/api/src/graphql.d.ts +++ b/packages/api/src/graphql.d.ts @@ -7,45 +7,6 @@ declare module '*.graphql' { export = schema } -declare module 'knex-stringcase' { - import { Knex } from 'knex' - - type StringCase = - | 'camelcase' - | 'capitalcase' - | 'constcase' - | 'cramcase' - | 'decapitalcase' - | 'dotcase' - | 'enumcase' - | 'lowercase' - | 'pascalcase' - | 'pathcase' - | 'sentencecase' - | 'snakecase' - | 'spacecase' - | 'spinalcase' - | 'titlecase' - | 'trimcase' - | 'uppercase' - - interface KnexStringCaseConfig extends Knex.Config { - appStringcase?: StringCase | StringCase[] - dbStringcase?: StringCase | StringCase[] - /* eslint-disable @typescript-eslint/no-explicit-any */ - beforePostProcessResponse?( - result: any[] | object, - queryContext: object - ): any[] | object - beforeWrapIdentifier?(value: string, queryContext: object): string - /* eslint-enable 
@typescript-eslint/no-explicit-any */ - ignoreStringcase?(obj: object): boolean - } - - function knexStringcase(config: KnexStringCaseConfig): Knex.Config - export = knexStringcase -} - declare module 'voca/slugify' { function slugify(subject?: string): string diff --git a/packages/content-fetch/.dockerignore b/packages/content-fetch/.dockerignore index 77c017249..b77d54205 100644 --- a/packages/content-fetch/.dockerignore +++ b/packages/content-fetch/.dockerignore @@ -1,4 +1,5 @@ node_modules +build .env* Dockerfile .dockerignore diff --git a/packages/content-fetch/.eslintignore b/packages/content-fetch/.eslintignore new file mode 100644 index 000000000..b38db2f29 --- /dev/null +++ b/packages/content-fetch/.eslintignore @@ -0,0 +1,2 @@ +node_modules/ +build/ diff --git a/packages/content-fetch/.eslintrc b/packages/content-fetch/.eslintrc new file mode 100644 index 000000000..644bb1aec --- /dev/null +++ b/packages/content-fetch/.eslintrc @@ -0,0 +1,6 @@ +{ + "extends": "../../.eslintrc", + "parserOptions": { + "project": "tsconfig.json" + } +} diff --git a/packages/content-fetch/.gcloudignore b/packages/content-fetch/.gcloudignore deleted file mode 100644 index fc644d8d3..000000000 --- a/packages/content-fetch/.gcloudignore +++ /dev/null @@ -1,21 +0,0 @@ -# This file specifies files that are *not* uploaded to Google Cloud Platform -# using gcloud. It follows the same syntax as .gitignore, with the addition of -# "#!include" directives (which insert the entries of the given .gitignore-style -# file at that point). -# -# For more information, run: -# $ gcloud topic gcloudignore -# -.gcloudignore -# If you would like to upload your .git directory, .gitignore file or files -# from your .gitignore file, remove the corresponding line -# below: -.git -.gitignore - -node_modules -.env* -.secrets* -Dockerfile* -previewImage.* -*.sa.json diff --git a/packages/content-fetch/Dockerfile b/packages/content-fetch/Dockerfile index a637ce9b0..bc33e7d85 100644 --- a/packages/content-fetch/Dockerfile +++ b/packages/content-fetch/Dockerfile @@ -32,6 +32,7 @@ ADD /packages/content-handler ./packages/content-handler ADD /packages/puppeteer-parse ./packages/puppeteer-parse ADD /packages/readabilityjs ./packages/readabilityjs RUN yarn workspace @omnivore/content-handler build +RUN yarn workspace @omnivore/puppeteer-parse build # After building, fetch the production dependencies RUN rm -rf /app/packages/content-fetch/node_modules diff --git a/packages/content-fetch/Dockerfile-gcf b/packages/content-fetch/Dockerfile-gcf index fd6e1680e..9d7d038d9 100644 --- a/packages/content-fetch/Dockerfile-gcf +++ b/packages/content-fetch/Dockerfile-gcf @@ -37,6 +37,7 @@ ADD /packages/puppeteer-parse ./packages/puppeteer-parse ADD /packages/content-fetch ./packages/content-fetch ADD /packages/readabilityjs ./packages/readabilityjs RUN yarn workspace @omnivore/content-handler build +RUN yarn workspace @omnivore/puppeteer-parse build # After building, fetch the production dependencies RUN rm -rf /app/packages/content-fetch/node_modules diff --git a/packages/content-fetch/api.js b/packages/content-fetch/api.js deleted file mode 100644 index 0a36d4db6..000000000 --- a/packages/content-fetch/api.js +++ /dev/null @@ -1,205 +0,0 @@ -const axios = require('axios'); -const jwt = require('jsonwebtoken'); -const { promisify } = require('util'); -const signToken = promisify(jwt.sign); - -const IMPORTER_METRICS_COLLECTOR_URL = process.env.IMPORTER_METRICS_COLLECTOR_URL; -const REQUEST_TIMEOUT = 30000; // 30 seconds - 
-exports.uploadToSignedUrl = async ({ id, uploadSignedUrl }, contentType, contentObjUrl) => { - try { - const stream = await axios.get(contentObjUrl, { responseType: 'stream', timeout: REQUEST_TIMEOUT }); - return axios.put(uploadSignedUrl, stream.data, { - headers: { - 'Content-Type': contentType, - }, - maxBodyLength: 1000000000, - maxContentLength: 100000000, - timeout: REQUEST_TIMEOUT, - }); - } catch (error) { - console.error('error uploading to signed url', error.message); - return null; - } -}; - -exports.getUploadIdAndSignedUrl = async (userId, url, articleSavingRequestId) => { - const auth = await signToken({ uid: userId }, process.env.JWT_SECRET); - const data = JSON.stringify({ - query: `mutation UploadFileRequest($input: UploadFileRequestInput!) { - uploadFileRequest(input:$input) { - ... on UploadFileRequestError { - errorCodes - } - ... on UploadFileRequestSuccess { - id - uploadSignedUrl - } - } - }`, - variables: { - input: { - url, - contentType: 'application/pdf', - clientRequestId: articleSavingRequestId, - } - } - }); - - try { - const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data, - { - headers: { - Cookie: `auth=${auth};`, - 'Content-Type': 'application/json', - }, - timeout: REQUEST_TIMEOUT, - }); - - if (response.data.data.uploadFileRequest.errorCodes && response.data.data.uploadFileRequest.errorCodes.length > 0) { - console.error('Error while getting upload id and signed url', response.data.data.uploadFileRequest.errorCodes[0]); - return null; - } - - return response.data.data.uploadFileRequest; - } catch (e) { - console.error('error getting upload id and signed url', e.message); - return null; - } -}; - -exports.uploadPdf = async (url, userId, articleSavingRequestId) => { - validateUrlString(url); - - const uploadResult = await getUploadIdAndSignedUrl(userId, url, articleSavingRequestId); - if (!uploadResult) { - throw new Error('error while getting upload id and signed url'); - } - const uploaded = await uploadToSignedUrl(uploadResult, 'application/pdf', url); - if (!uploaded) { - throw new Error('error while uploading pdf'); - } - return uploadResult.id; -}; - -exports.sendCreateArticleMutation = async (userId, input) => { - const data = JSON.stringify({ - query: `mutation CreateArticle ($input: CreateArticleInput!){ - createArticle(input:$input){ - ... on CreateArticleSuccess{ - createdArticle{ - id - } - } - ... on CreateArticleError{ - errorCodes - } - } - }`, - variables: { - input, - }, - }); - - const auth = await signToken({ uid: userId }, process.env.JWT_SECRET); - try { - const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data, - { - headers: { - Cookie: `auth=${auth};`, - 'Content-Type': 'application/json', - }, - timeout: REQUEST_TIMEOUT, - }); - - if (response.data.data.createArticle.errorCodes && response.data.data.createArticle.errorCodes.length > 0) { - console.error('error while creating article', response.data.data.createArticle.errorCodes[0]); - return null; - } - - return response.data.data.createArticle; - } catch (error) { - console.error('error creating article', error.message); - return null; - } -}; - -exports.sendSavePageMutation = async (userId, input) => { - const data = JSON.stringify({ - query: `mutation SavePage ($input: SavePageInput!){ - savePage(input:$input){ - ... on SaveSuccess{ - url - clientRequestId - } - ... 
on SaveError{ - errorCodes - } - } - }`, - variables: { - input, - }, - }); - - const auth = await signToken({ uid: userId }, process.env.JWT_SECRET); - try { - const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data, - { - headers: { - Cookie: `auth=${auth};`, - 'Content-Type': 'application/json', - }, - timeout: REQUEST_TIMEOUT, - }); - - if (response.data.data.savePage.errorCodes && response.data.data.savePage.errorCodes.length > 0) { - console.error('error while saving page', response.data.data.savePage.errorCodes[0]); - if (response.data.data.savePage.errorCodes[0] === 'UNAUTHORIZED') { - return { error: 'UNAUTHORIZED' }; - } - - return null; - } - - return response.data.data.savePage; - } catch (error) { - console.error('error saving page', error.message); - return null; - } -}; - -exports.saveUploadedPdf = async (userId, url, uploadFileId, articleSavingRequestId) => { - return sendCreateArticleMutation(userId, { - url: encodeURI(url), - articleSavingRequestId, - uploadFileId: uploadFileId, - state, - labels, - source, - folder, - }, - ); -}; - -exports.sendImportStatusUpdate = async (userId, taskId, status) => { - try { - const auth = await signToken({ uid: userId }, process.env.JWT_SECRET); - - await axios.post( - IMPORTER_METRICS_COLLECTOR_URL, - { - taskId, - status, - }, - { - headers: { - 'Authorization': auth, - 'Content-Type': 'application/json', - }, - timeout: REQUEST_TIMEOUT, - }); - } catch (e) { - console.error('error while sending import status update', e); - } -}; diff --git a/packages/content-fetch/app.js b/packages/content-fetch/app.js deleted file mode 100644 index 95a7880fa..000000000 --- a/packages/content-fetch/app.js +++ /dev/null @@ -1,35 +0,0 @@ -require('dotenv').config(); -const express = require('express'); -const { contentFetchRequestHandler } = require('./request_handler'); - -const app = express(); - -app.use(express.json()); -app.use(express.urlencoded({ extended: true })); - -if (!process.env.VERIFICATION_TOKEN) { - throw new Error('VERIFICATION_TOKEN environment variable is not set'); -} - - -app.all('/', async (req, res) => { - if (req.method !== 'GET' && req.method !== 'POST') { - console.error('request method is not GET or POST') - return res.sendStatus(405) - } - - if (req.query.token !== process.env.VERIFICATION_TOKEN) { - console.error('query does not include valid token') - return res.sendStatus(403) - } - - return contentFetchRequestHandler(req, res); -}); - -const PORT = parseInt(process.env.PORT) || 8080; -app.listen(PORT, () => { - console.log(`App listening on port ${PORT}`); - console.log('Press Ctrl+C to quit.'); -}); - -module.exports = app; diff --git a/packages/content-fetch/item.js b/packages/content-fetch/item.js deleted file mode 100644 index 5703358e2..000000000 --- a/packages/content-fetch/item.js +++ /dev/null @@ -1,75 +0,0 @@ -const { interfaces } = require('mocha'); -const { uploadPdf, sendSavePageMutation, sendCreateArticleMutation, sendImportStatusUpdate } = require('./api'); - -interface Item { - url: string; - userId: string; - contentType: string; - articleSavingRequestId: string; - state: string; - labels: string[]; - source: string; - folder: string; - rssFeedUrl: string; - savedAt: string; - publishedAt: string; - readabilityResult: string; -} - -exports.saveItem = async (item: Item) => { - const { url, userId, contentType, articleSavingRequestId, state, labels, source, folder, rssFeedUrl, savedAt, publishedAt, readabilityResult } = item; - try { - if (contentType === 'application/pdf') { 
- const uploadFileId = await uploadPdf(url, userId, articleSavingRequestId); - const uploadedPdf = await sendCreateArticleMutation(userId, { - url: encodeURI(url), - articleSavingRequestId, - uploadFileId, - state, - labels, - source, - folder, - rssFeedUrl, - savedAt, - publishedAt, - }); - if (!uploadedPdf) { - console.error('error while saving uploaded pdf', url); - return false; - } - } else { - const apiResponse = await sendSavePageMutation(userId, { - url, - clientRequestId: articleSavingRequestId,h - title, - originalContent: content, - parseResult: readabilityResult, - state, - labels, - rssFeedUrl, - savedAt, - publishedAt, - source, - folder, - }); - if (!apiResponse) { - console.error('error while saving page', url); - return false; - } else if (apiResponse.error === 'UNAUTHORIZED') { - console.log('user is deleted, do not retry', userId); - return true; - } else { - importStatus = readabilityResult ? 'imported' : 'failed'; - } - } - } catch (error) { - logRecord.error = error.message; - } finally { - // mark import failed on the last failed retry - const retryCount = req.headers['x-cloudtasks-taskretrycount']; - if (retryCount === MAX_RETRY_COUNT) { - console.log('max retry count reached'); - importStatus = importStatus || 'failed'; - } - } -} diff --git a/packages/content-fetch/logger.js b/packages/content-fetch/logger.js deleted file mode 100644 index 9317e36c8..000000000 --- a/packages/content-fetch/logger.js +++ /dev/null @@ -1,66 +0,0 @@ -const { config, format, loggers, transports } = require('winston'); -const { LoggingWinston } = require('@google-cloud/logging-winston'); -const { DateTime } = require('luxon'); - -const colors = { - emerg: 'inverse underline magenta', - alert: 'underline magenta', - crit: 'inverse underline red', // Any error that is forcing a shutdown of the service or application to prevent data loss. - error: 'underline red', // Any error which is fatal to the operation, but not the service or application - warning: 'underline yellow', // Anything that can potentially cause application oddities - notice: 'underline cyan', // Normal but significant condition - info: 'underline green', // Generally useful information to log - debug: 'underline gray', -}; - -const googleConfigs = { - level: 'info', - logName: 'logger', - levels: config.syslog.levels, - resource: { - labels: { - function_name: process.env.FUNCTION_TARGET, - project_id: process.env.GCP_PROJECT, - }, - type: 'cloud_function', - }, -}; - -function localConfig(id) { - return { - level: 'debug', - format: format.combine( - format.colorize({ all: true, colors }), - format(info => - Object.assign(info, { - timestamp: DateTime.local().toLocaleString(DateTime.TIME_24_WITH_SECONDS), - }), - )(), - format.printf(info => { - // eslint-disable-next-line @typescript-eslint/no-unused-vars - const { timestamp, message, level, ...meta } = info; - - return `[${id}@${info.timestamp}] ${info.message}${ - Object.keys(meta).length ? '\n' + JSON.stringify(meta, null, 4) : '' - }`; - }), - ), - }; -} - -function buildLoggerTransport(id, options) { - return process.env.IS_LOCAL - ? 
new transports.Console(localConfig(id)) - : new LoggingWinston({ ...googleConfigs, ...{ logName: id }, ...options }); -} - -function buildLogger(id, options) { - return loggers.get(id, { - levels: config.syslog.levels, - transports: [buildLoggerTransport(id, options)], - }); -} - -module.exports = { - buildLogger, -} diff --git a/packages/content-fetch/mocha-config.json b/packages/content-fetch/mocha-config.json new file mode 100644 index 000000000..44d1d24c1 --- /dev/null +++ b/packages/content-fetch/mocha-config.json @@ -0,0 +1,5 @@ +{ + "extension": ["ts"], + "spec": "test/**/*.test.ts", + "require": "test/babel-register.js" + } \ No newline at end of file diff --git a/packages/content-fetch/package.json b/packages/content-fetch/package.json index 1e6746ab3..541d6a39f 100644 --- a/packages/content-fetch/package.json +++ b/packages/content-fetch/package.json @@ -2,7 +2,10 @@ "name": "@omnivore/content-fetch", "version": "1.0.0", "description": "Service that fetches page content from a URL", - "main": "index.js", + "main": "build/src/index.js", + "files": [ + "build/src" + ], "dependencies": { "axios": "^0.27.2", "dotenv": "^8.2.0", @@ -18,9 +21,12 @@ "mocha": "^10.0.0" }, "scripts": { - "start": "node app.js", - "start_gcf": "npx functions-framework --port=9090 --target=puppeteer", - "test": "mocha test/*.js" + "test": "yarn mocha -r ts-node/register --config mocha-config.json", + "test:typecheck": "tsc --noEmit", + "lint": "eslint src --ext ts,js,tsx,jsx", + "build": "tsc", + "start": "node build/src/app.js", + "start_gcf": "functions-framework --port=9090 --target=puppeteer" }, "volta": { "extends": "../../package.json" diff --git a/packages/content-fetch/request_handler.js b/packages/content-fetch/request_handler.js deleted file mode 100644 index d6a34cdc0..000000000 --- a/packages/content-fetch/request_handler.js +++ /dev/null @@ -1,114 +0,0 @@ -const { fetchContent } = require("@omnivore/puppeteer-parse"); -const { uploadPdf, sendSavePageMutation, sendCreateArticleMutation, sendImportStatusUpdate } = require('./api'); - -const MAX_RETRY_COUNT = process.env.MAX_RETRY_COUNT || '1'; - -exports.contentFetchRequestHandler = async (req, res) => { - let functionStartTime = Date.now(); - - const userId = (req.query ? req.query.userId : undefined) || (req.body ? req.body.userId : undefined); - const articleSavingRequestId = (req.query ? req.query.saveRequestId : undefined) || (req.body ? req.body.saveRequestId : undefined); - const state = req.body.state - const labels = req.body.labels - const source = req.body.source || 'puppeteer-parse'; - const taskId = req.body.taskId; // taskId is used to update import status - const url = (req.query ? req.query.url : undefined) || (req.body ? req.body.url : undefined); - const locale = (req.query ? req.query.locale : undefined) || (req.body ? req.body.locale : undefined); - const timezone = (req.query ? req.query.timezone : undefined) || (req.body ? req.body.timezone : undefined); - const rssFeedUrl = req.body.rssFeedUrl; - const savedAt = req.body.savedAt; - const publishedAt = req.body.publishedAt; - const folder = req.body.folder; - const users = req.body ? 
req.body.users : undefined; // users is used when saving article for multiple users - - let logRecord = { - url, - userId, - articleSavingRequestId, - labels: { - source, - }, - state, - labelsToAdd: labels, - taskId: taskId, - locale, - timezone, - rssFeedUrl, - savedAt, - publishedAt, - folder, - users, - }; - - console.log(`Article parsing request`, logRecord); - - let importStatus, statusCode = 200; - - try { - const { finalUrl, title, content, readabilityResult, contentType } = await fetchContent(url, locale, timezone); - if (contentType === 'application/pdf') { - const uploadFileId = await uploadPdf(finalUrl, userId, articleSavingRequestId); - const uploadedPdf = await sendCreateArticleMutation(userId, { - url: encodeURI(finalUrl), - articleSavingRequestId, - uploadFileId, - state, - labels, - source, - folder, - rssFeedUrl, - savedAt, - publishedAt, - }); - if (!uploadedPdf) { - statusCode = 500; - logRecord.error = 'error while saving uploaded pdf'; - } else { - importStatus = 'imported'; - } - } else { - const apiResponse = await sendSavePageMutation(userId, { - url, - clientRequestId: articleSavingRequestId, - title, - originalContent: content, - parseResult: readabilityResult, - state, - labels, - rssFeedUrl, - savedAt, - publishedAt, - source, - folder, - }); - if (!apiResponse) { - logRecord.error = 'error while saving page'; - statusCode = 500; - } else if (apiResponse.error === 'UNAUTHORIZED') { - console.log('user is deleted, do not retry', logRecord); - return res.sendStatus(200); - } else { - importStatus = readabilityResult ? 'imported' : 'failed'; - } - } - } catch (error) { - logRecord.error = error.message; - } finally { - logRecord.totalTime = Date.now() - functionStartTime; - console.log(`parse-page result`, logRecord); - - // mark import failed on the last failed retry - const retryCount = req.headers['x-cloudtasks-taskretrycount']; - if (retryCount === MAX_RETRY_COUNT) { - console.log('max retry count reached'); - importStatus = importStatus || 'failed'; - } - - // send import status to update the metrics - if (taskId && importStatus) { - await sendImportStatusUpdate(userId, taskId, importStatus); - } - - res.sendStatus(statusCode); - } -} diff --git a/packages/content-fetch/src/api.ts b/packages/content-fetch/src/api.ts new file mode 100644 index 000000000..4066cd9b3 --- /dev/null +++ b/packages/content-fetch/src/api.ts @@ -0,0 +1,311 @@ +import axios from 'axios' +import jwt from 'jsonwebtoken' +import { promisify } from 'util' + +const signToken = promisify(jwt.sign) + +const IMPORTER_METRICS_COLLECTOR_URL = + process.env.IMPORTER_METRICS_COLLECTOR_URL +const JWT_SECRET = process.env.JWT_SECRET +const REST_BACKEND_ENDPOINT = process.env.REST_BACKEND_ENDPOINT + +if (!IMPORTER_METRICS_COLLECTOR_URL || !JWT_SECRET || !REST_BACKEND_ENDPOINT) { + throw new Error('Missing environment variables') +} + +const REQUEST_TIMEOUT = 30000 // 30 seconds + +export const uploadToSignedUrl = async ( + uploadSignedUrl: string, + contentType: string, + contentObjUrl: string +) => { + try { + const stream = await axios.get(contentObjUrl, { + responseType: 'stream', + timeout: REQUEST_TIMEOUT, + }) + return axios.put(uploadSignedUrl, stream.data, { + headers: { + 'Content-Type': contentType, + }, + maxBodyLength: 1000000000, + maxContentLength: 100000000, + timeout: REQUEST_TIMEOUT, + }) + } catch (error) { + console.error('error uploading to signed url', error) + return null + } +} + +interface UploadFileResponse { + data: { + uploadFileRequest: { + id: string + uploadSignedUrl: 
string
+      uploadFileId: string
+      createdPageId: string
+      errorCodes?: string[]
+    }
+  }
+}
+
+export const getUploadIdAndSignedUrl = async (
+  userId: string,
+  url: string,
+  articleSavingRequestId: string
+) => {
+  const auth = await signToken({ uid: userId }, JWT_SECRET)
+  const data = JSON.stringify({
+    query: `mutation UploadFileRequest($input: UploadFileRequestInput!) {
+      uploadFileRequest(input:$input) {
+        ... on UploadFileRequestError {
+          errorCodes
+        }
+        ... on UploadFileRequestSuccess {
+          id
+          uploadSignedUrl
+        }
+      }
+    }`,
+    variables: {
+      input: {
+        url,
+        contentType: 'application/pdf',
+        clientRequestId: articleSavingRequestId,
+      },
+    },
+  })
+
+  try {
+    const response = await axios.post<UploadFileResponse>(
+      `${REST_BACKEND_ENDPOINT}/graphql`,
+      data,
+      {
+        headers: {
+          Cookie: `auth=${auth as string};`,
+          'Content-Type': 'application/json',
+        },
+        timeout: REQUEST_TIMEOUT,
+      }
+    )
+
+    if (
+      response.data.data.uploadFileRequest.errorCodes &&
+      response.data.data.uploadFileRequest.errorCodes?.length > 0
+    ) {
+      console.error(
+        'Error while getting upload id and signed url',
+        response.data.data.uploadFileRequest.errorCodes[0]
+      )
+      return null
+    }
+
+    return response.data.data.uploadFileRequest
+  } catch (e) {
+    console.error('error getting upload id and signed url', e)
+    return null
+  }
+}
+
+interface CreateArticleResponse {
+  data: {
+    createArticle: {
+      createdArticle: {
+        id: string
+      }
+      errorCodes: string[]
+    }
+  }
+}
+
+export const uploadPdf = async (
+  url: string,
+  userId: string,
+  articleSavingRequestId: string
+) => {
+  const uploadResult = await getUploadIdAndSignedUrl(
+    userId,
+    url,
+    articleSavingRequestId
+  )
+  if (!uploadResult) {
+    throw new Error('error while getting upload id and signed url')
+  }
+  const uploaded = await uploadToSignedUrl(
+    uploadResult.uploadSignedUrl,
+    'application/pdf',
+    url
+  )
+  if (!uploaded) {
+    throw new Error('error while uploading pdf')
+  }
+  return uploadResult.id
+}
+
+export const sendCreateArticleMutation = async (
+  userId: string,
+  input: unknown
+) => {
+  const data = JSON.stringify({
+    query: `mutation CreateArticle ($input: CreateArticleInput!){
+      createArticle(input:$input){
+        ... on CreateArticleSuccess{
+          createdArticle{
+            id
+          }
+        }
+        ... on CreateArticleError{
+          errorCodes
+        }
+      }
+    }`,
+    variables: {
+      input,
+    },
+  })
+
+  const auth = await signToken({ uid: userId }, JWT_SECRET)
+  try {
+    const response = await axios.post<CreateArticleResponse>(
+      `${REST_BACKEND_ENDPOINT}/graphql`,
+      data,
+      {
+        headers: {
+          Cookie: `auth=${auth as string};`,
+          'Content-Type': 'application/json',
+        },
+        timeout: REQUEST_TIMEOUT,
+      }
+    )
+
+    if (
+      response.data.data.createArticle.errorCodes &&
+      response.data.data.createArticle.errorCodes.length > 0
+    ) {
+      console.error(
+        'error while creating article',
+        response.data.data.createArticle.errorCodes[0]
+      )
+      return null
+    }
+
+    return response.data.data.createArticle
+  } catch (error) {
+    console.error('error creating article', error)
+    return null
+  }
+}
+
+interface SavePageResponse {
+  data: {
+    savePage: {
+      url: string
+      clientRequestId: string
+      errorCodes?: string[]
+    }
+  }
+}
+
+export const sendSavePageMutation = async (userId: string, input: unknown) => {
+  const data = JSON.stringify({
+    query: `mutation SavePage ($input: SavePageInput!){
+      savePage(input:$input){
+        ... on SaveSuccess{
+          url
+          clientRequestId
+        }
+        ... 
on SaveError{
+          errorCodes
+        }
+      }
+    }`,
+    variables: {
+      input,
+    },
+  })
+
+  const auth = await signToken({ uid: userId }, JWT_SECRET)
+  try {
+    const response = await axios.post<SavePageResponse>(
+      `${REST_BACKEND_ENDPOINT}/graphql`,
+      data,
+      {
+        headers: {
+          Cookie: `auth=${auth as string};`,
+          'Content-Type': 'application/json',
+        },
+        timeout: REQUEST_TIMEOUT,
+      }
+    )
+
+    if (
+      response.data.data.savePage.errorCodes &&
+      response.data.data.savePage.errorCodes.length > 0
+    ) {
+      console.error(
+        'error while saving page',
+        response.data.data.savePage.errorCodes[0]
+      )
+      if (response.data.data.savePage.errorCodes[0] === 'UNAUTHORIZED') {
+        return { error: 'UNAUTHORIZED' }
+      }
+
+      return null
+    }
+
+    return response.data.data.savePage
+  } catch (error) {
+    console.error('error saving page', error)
+    return null
+  }
+}
+
+export const saveUploadedPdf = async (
+  userId: string,
+  url: string,
+  uploadFileId: string,
+  articleSavingRequestId: string,
+  state: string,
+  labels: string[],
+  source: string,
+  folder: string
+) => {
+  return sendCreateArticleMutation(userId, {
+    url: encodeURI(url),
+    articleSavingRequestId,
+    uploadFileId: uploadFileId,
+    state,
+    labels,
+    source,
+    folder,
+  })
+}
+
+export const sendImportStatusUpdate = async (
+  userId: string,
+  taskId: string,
+  status: string
+) => {
+  try {
+    const auth = await signToken({ uid: userId }, JWT_SECRET)
+
+    await axios.post(
+      IMPORTER_METRICS_COLLECTOR_URL,
+      {
+        taskId,
+        status,
+      },
+      {
+        headers: {
+          Authorization: auth as string,
+          'Content-Type': 'application/json',
+        },
+        timeout: REQUEST_TIMEOUT,
+      }
+    )
+  } catch (e) {
+    console.error('error while sending import status update', e)
+  }
+}
diff --git a/packages/content-fetch/src/app.ts b/packages/content-fetch/src/app.ts
new file mode 100644
index 000000000..fff7bf92b
--- /dev/null
+++ b/packages/content-fetch/src/app.ts
@@ -0,0 +1,34 @@
+import 'dotenv/config'
+import express from 'express'
+import { contentFetchRequestHandler } from './request_handler'
+
+console.log(process.env)
+
+const app = express()
+
+app.use(express.json())
+app.use(express.urlencoded({ extended: true }))
+
+if (!process.env.VERIFICATION_TOKEN) {
+  throw new Error('VERIFICATION_TOKEN environment variable is not set')
+}
+
+app.all('/', (req, res, next) => {
+  if (req.method !== 'GET' && req.method !== 'POST') {
+    console.error('request method is not GET or POST')
+    return res.sendStatus(405)
+  }
+
+  if (req.query.token !== process.env.VERIFICATION_TOKEN) {
+    console.error('query does not include valid token')
+    return res.sendStatus(403)
+  }
+
+  return contentFetchRequestHandler(req, res, next)
+})
+
+const PORT = process.env.PORT ? 
parseInt(process.env.PORT) : 8080
+app.listen(PORT, () => {
+  console.log(`App listening on port ${PORT}`)
+  console.log('Press Ctrl+C to quit.')
+})
diff --git a/packages/content-fetch/index.js b/packages/content-fetch/src/index.ts
similarity index 60%
rename from packages/content-fetch/index.js
rename to packages/content-fetch/src/index.ts
index f76eaf20d..d3bc341d1 100644
--- a/packages/content-fetch/index.js
+++ b/packages/content-fetch/src/index.ts
@@ -1,16 +1,12 @@
-/* eslint-disable no-undef */
-/* eslint-disable no-empty */
-/* eslint-disable @typescript-eslint/explicit-function-return-type */
-/* eslint-disable @typescript-eslint/no-var-requires */
-/* eslint-disable @typescript-eslint/no-require-imports */
-require('dotenv').config();
-const Sentry = require('@sentry/serverless');
-const { contentFetchRequestHandler } = require('./request_handler');
+import { HttpFunction } from '@google-cloud/functions-framework'
+import * as Sentry from '@sentry/serverless'
+import 'dotenv/config'
+import { contentFetchRequestHandler } from './request_handler'
 
 Sentry.GCPFunction.init({
   dsn: process.env.SENTRY_DSN,
   tracesSampleRate: 0,
-});
+})
 
 /**
  * Cloud Function entry point, HTTP trigger.
@@ -19,7 +15,9 @@ Sentry.GCPFunction.init({
  * @param {Object} req Cloud Function request context.
  * @param {Object} res Cloud Function response context.
  */
-exports.puppeteer = Sentry.GCPFunction.wrapHttpFunction(contentFetchRequestHandler);
+export const puppeteer = Sentry.GCPFunction.wrapHttpFunction(
+  contentFetchRequestHandler as HttpFunction
+)
 
 /**
  * Cloud Function entry point, HTTP trigger.
diff --git a/packages/content-fetch/src/request_handler.ts b/packages/content-fetch/src/request_handler.ts
new file mode 100644
index 000000000..a33ec5c47
--- /dev/null
+++ b/packages/content-fetch/src/request_handler.ts
@@ -0,0 +1,177 @@
+import { fetchContent } from '@omnivore/puppeteer-parse'
+import { RequestHandler } from 'express'
+import {
+  sendCreateArticleMutation,
+  sendImportStatusUpdate,
+  sendSavePageMutation,
+  uploadPdf,
+} from './api'
+
+interface RequestBody {
+  url: string
+  userId: string
+  saveRequestId: string
+  state?: string
+  labels?: string[]
+  source?: string
+  taskId?: string
+  locale?: string
+  timezone?: string
+  rssFeedUrl?: string
+  savedAt?: string
+  publishedAt?: string
+  folder?: string
+  users?: string[]
+}
+
+interface LogRecord {
+  url: string
+  userId: string
+  articleSavingRequestId: string
+  labels: {
+    source: string
+  }
+  state?: string
+  labelsToAdd?: string[]
+  taskId?: string
+  locale?: string
+  timezone?: string
+  rssFeedUrl?: string
+  savedAt?: string
+  publishedAt?: string
+  folder?: string
+  users?: string[]
+  error?: string
+  totalTime?: number
+}
+
+const MAX_RETRY_COUNT = process.env.MAX_RETRY_COUNT || '1'
+
+export const contentFetchRequestHandler: RequestHandler = async (req, res) => {
+  const functionStartTime = Date.now()
+
+  const body = req.body as RequestBody
+
+  const userId = body.userId
+  const articleSavingRequestId = body.saveRequestId
+  const state = body.state
+  const labels = body.labels
+  const source = body.source || 'puppeteer-parse'
+  const taskId = body.taskId // taskId is used to update import status
+  const url = body.url
+  const locale = body.locale
+  const timezone = body.timezone
+  const rssFeedUrl = body.rssFeedUrl
+  const savedAt = body.savedAt
+  const publishedAt = body.publishedAt
+  const folder = body.folder
+  const users = body ? 
body.users : undefined // users is used when saving article for multiple users + + const logRecord: LogRecord = { + url, + userId, + articleSavingRequestId, + labels: { + source, + }, + state, + labelsToAdd: labels, + taskId: taskId, + locale, + timezone, + rssFeedUrl, + savedAt, + publishedAt, + folder, + users, + } + + console.log(`Article parsing request`, logRecord) + + let importStatus, + statusCode = 200 + + try { + const fetchResult = await fetchContent(url, locale, timezone) + const finalUrl = fetchResult.finalUrl + const title = fetchResult.title + const content = fetchResult.content + const readabilityResult = fetchResult.readabilityResult as unknown + if (fetchResult.contentType === 'application/pdf') { + const uploadFileId = await uploadPdf( + finalUrl, + userId, + articleSavingRequestId + ) + const uploadedPdf = await sendCreateArticleMutation(userId, { + url: encodeURI(finalUrl), + articleSavingRequestId, + uploadFileId, + state, + labels, + source, + folder, + rssFeedUrl, + savedAt, + publishedAt, + }) + if (!uploadedPdf) { + statusCode = 500 + logRecord.error = 'error while saving uploaded pdf' + } else { + importStatus = 'imported' + } + } else { + const apiResponse = await sendSavePageMutation(userId, { + url, + clientRequestId: articleSavingRequestId, + title, + originalContent: content, + parseResult: readabilityResult, + state, + labels, + rssFeedUrl, + savedAt, + publishedAt, + source, + folder, + }) + if (!apiResponse) { + logRecord.error = 'error while saving page' + statusCode = 500 + } else if ( + 'error' in apiResponse && + apiResponse.error === 'UNAUTHORIZED' + ) { + console.log('user is deleted, do not retry', logRecord) + return res.sendStatus(200) + } else { + importStatus = readabilityResult ? 'imported' : 'failed' + } + } + } catch (error) { + console.error(error) + if (error instanceof Error) { + logRecord.error = error.message + } else { + logRecord.error = 'unknown error' + } + } finally { + logRecord.totalTime = Date.now() - functionStartTime + console.log(`parse-page result`, logRecord) + + // mark import failed on the last failed retry + const retryCount = req.headers['x-cloudtasks-taskretrycount'] + if (retryCount === MAX_RETRY_COUNT) { + console.log('max retry count reached') + importStatus = importStatus || 'failed' + } + + // send import status to update the metrics + if (taskId && importStatus) { + await sendImportStatusUpdate(userId, taskId, importStatus) + } + + res.sendStatus(statusCode) + } +} diff --git a/packages/content-fetch/test/babel-register.js b/packages/content-fetch/test/babel-register.js new file mode 100644 index 000000000..a6f65f60a --- /dev/null +++ b/packages/content-fetch/test/babel-register.js @@ -0,0 +1,3 @@ +const register = require('@babel/register').default + +register({ extensions: ['.ts', '.tsx', '.js', '.jsx'] }) diff --git a/packages/content-fetch/test/stub.test.js b/packages/content-fetch/test/stub.test.js deleted file mode 100644 index 317d21b52..000000000 --- a/packages/content-fetch/test/stub.test.js +++ /dev/null @@ -1,9 +0,0 @@ -const chai = require("chai"); - -const expect = chai.expect; - -describe('Stub test', () => { - it('should pass', () => { - expect(true).to.be.true - }) -}) diff --git a/packages/content-fetch/test/stub.test.ts b/packages/content-fetch/test/stub.test.ts new file mode 100644 index 000000000..24ad25c8f --- /dev/null +++ b/packages/content-fetch/test/stub.test.ts @@ -0,0 +1,8 @@ +import 'mocha' +import { expect } from 'chai' + +describe('stub test', () => { + it('should pass', () => { + 
expect(true).to.be.true + }) +}) diff --git a/packages/content-fetch/tsconfig.json b/packages/content-fetch/tsconfig.json new file mode 100644 index 000000000..30fcfe13a --- /dev/null +++ b/packages/content-fetch/tsconfig.json @@ -0,0 +1,8 @@ +{ + "extends": "./../../tsconfig.json", + "compilerOptions": { + "outDir": "build", + "rootDir": ".", + }, + "include": ["src"] +} diff --git a/packages/content-handler/.gitignore b/packages/content-handler/.gitignore deleted file mode 100644 index 0ae7e5c9e..000000000 --- a/packages/content-handler/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -node_modules -/lib diff --git a/packages/content-handler/tsconfig.json b/packages/content-handler/tsconfig.json index aeb8d2c3a..48d435fdf 100644 --- a/packages/content-handler/tsconfig.json +++ b/packages/content-handler/tsconfig.json @@ -1,10 +1,9 @@ { - "extends": "@tsconfig/node14/tsconfig.json", + "extends": "./../../tsconfig.json", "compilerOptions": { "rootDir": ".", "declaration": true, - "outDir": "build", - "lib": ["dom"] + "outDir": "build" }, "include": ["src"] } diff --git a/packages/puppeteer-parse/package.json b/packages/puppeteer-parse/package.json index 87e2943d5..4eb2f4239 100644 --- a/packages/puppeteer-parse/package.json +++ b/packages/puppeteer-parse/package.json @@ -3,6 +3,7 @@ "version": "1.0.0", "description": "Accepts URL of the article and parses its content", "main": "build/src/index.js", + "types": "build/src/index.d.ts", "files": [ "build/src" ], diff --git a/packages/puppeteer-parse/src/index.ts b/packages/puppeteer-parse/src/index.ts index fc2c4f5f4..899c56a48 100644 --- a/packages/puppeteer-parse/src/index.ts +++ b/packages/puppeteer-parse/src/index.ts @@ -140,8 +140,8 @@ const getBrowserPromise = (async () => { export const fetchContent = async ( url: string, - locale: string, - timezone: string + locale?: string, + timezone?: string ) => { const functionStartTime = Date.now() const logRecord = { @@ -154,8 +154,8 @@ export const fetchContent = async ( let context: BrowserContext | undefined, page: Page | undefined, - finalUrl: string | undefined, - title: string | undefined, + finalUrl = '', + title = '', content: string | undefined, contentType: string | undefined, readabilityResult: Readability.ParseResult | null | undefined @@ -217,7 +217,7 @@ export const fetchContent = async ( const sbResult = await fetchContentWithScrapingBee(url) title = sbResult.title content = sbResult.domContent - } else { + } else if (result.title && result.domContent) { title = result.title content = result.domContent } @@ -312,8 +312,8 @@ async function retrievePage( url: string, logRecord: Record, functionStartTime: number, - locale: string, - timezone: string + locale?: string, + timezone?: string ) { validateUrlString(url) @@ -466,8 +466,7 @@ async function retrievePage( } async function retrieveHtml(page: Page, logRecord: Record) { - let domContent = '', - title + let domContent, title try { title = await page.title() logRecord.title = title diff --git a/packages/puppeteer-parse/tsconfig.json b/packages/puppeteer-parse/tsconfig.json index 7ebe093f6..5b6ea3e00 100644 --- a/packages/puppeteer-parse/tsconfig.json +++ b/packages/puppeteer-parse/tsconfig.json @@ -2,7 +2,9 @@ "extends": "./../../tsconfig.json", "compilerOptions": { "outDir": "build", - "rootDir": "." 
+ "rootDir": ".", + // Generate d.ts files + "declaration": true }, "include": ["src"] } diff --git a/packages/text-to-speech/tsconfig.json b/packages/text-to-speech/tsconfig.json index 42c16d244..5b6ea3e00 100644 --- a/packages/text-to-speech/tsconfig.json +++ b/packages/text-to-speech/tsconfig.json @@ -1,11 +1,10 @@ { - "extends": "@tsconfig/node14/tsconfig.json", + "extends": "./../../tsconfig.json", "compilerOptions": { "outDir": "build", "rootDir": ".", - "lib": ["dom"], // Generate d.ts files "declaration": true }, - "include": ["src"], + "include": ["src"] }
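-- 
Reviewer note: for a quick manual check of the converted service, a smoke test
along these lines should work. This is illustrative only and not part of the
patch — the port, token handling, and IDs are assumptions taken from src/app.ts
and the RequestBody interface above:

  // smoke-test.ts — run with ts-node after `yarn workspace @omnivore/content-fetch start`
  // Assumes VERIFICATION_TOKEN matches the server's env and the app listens on 8080.
  import axios from 'axios'

  async function main() {
    const token = process.env.VERIFICATION_TOKEN ?? ''
    // The handler only accepts GET/POST on '/' and requires a matching ?token=...
    const res = await axios.post(
      `http://localhost:8080/?token=${encodeURIComponent(token)}`,
      {
        url: 'https://example.com/article', // page to fetch and save
        userId: 'test-user-id',             // hypothetical IDs, for illustration
        saveRequestId: 'test-request-id',
      },
      { validateStatus: () => true } // keep non-2xx responses for inspection
    )
    console.log('status:', res.status)
  }

  void main()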