diff --git a/packages/content-fetch/apple-news-handler.js b/packages/content-fetch/apple-news-handler.js deleted file mode 100644 index 0759dec23..000000000 --- a/packages/content-fetch/apple-news-handler.js +++ /dev/null @@ -1,36 +0,0 @@ -/* eslint-disable no-undef */ -/* eslint-disable no-empty */ -/* eslint-disable @typescript-eslint/explicit-function-return-type */ -/* eslint-disable @typescript-eslint/no-var-requires */ -/* eslint-disable @typescript-eslint/no-require-imports */ -require('dotenv').config(); -const Url = require('url'); -const axios = require('axios'); -const { promisify } = require('util'); -const { DateTime } = require('luxon'); -const os = require('os'); -const { Cipher } = require('crypto'); -const { parseHTML } = require('linkedom'); - -exports.appleNewsHandler = { - - shouldPrehandle: (url, env) => { - const u = new URL(url); - if (u.hostname === 'apple.news') { - return true; - } - return false - }, - - prehandle: async (url, env) => { - const MOBILE_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36' - const response = await axios.get(url, { headers: { 'User-Agent': MOBILE_USER_AGENT } } ); - const data = response.data; - - const dom = parseHTML(data).document; - - // make sure its a valid URL by wrapping in new URL - const u = new URL(dom.querySelector('span.click-here').parentNode.href); - return { url: u.href }; - } -} diff --git a/packages/content-fetch/bloomberg-handler.js b/packages/content-fetch/bloomberg-handler.js deleted file mode 100644 index d79a568bb..000000000 --- a/packages/content-fetch/bloomberg-handler.js +++ /dev/null @@ -1,39 +0,0 @@ -/* eslint-disable no-undef */ -/* eslint-disable no-empty */ -/* eslint-disable @typescript-eslint/explicit-function-return-type */ -/* eslint-disable @typescript-eslint/no-var-requires */ -/* eslint-disable @typescript-eslint/no-require-imports */ -require('dotenv').config(); -const axios = require('axios'); -const os = require('os'); -const { parseHTML } = require('linkedom'); - -exports.bloombergHandler = { - - shouldPrehandle: (url, env) => { - const BLOOMBERG_URL_MATCH = - /https?:\/\/(www\.)?bloomberg.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)/ - return BLOOMBERG_URL_MATCH.test(url.toString()) - }, - - prehandle: async (url, env) => { - console.log('prehandling bloomberg url', url) - - try { - const response = await axios.get('https://app.scrapingbee.com/api/v1', { - params: { - 'api_key': process.env.SCRAPINGBEE_API_KEY, - 'url': url, - 'return_page_source': true, - 'block_ads': true, - 'block_resources': false, - } - }) - const dom = parseHTML(response.data).document; - return { title: dom.title, content: dom.querySelector('body').innerHTML, url: url } - } catch (error) { - console.error('error prehandling bloomberg url', error) - throw error - } - } -} diff --git a/packages/content-fetch/derstandard-handler.js b/packages/content-fetch/derstandard-handler.js deleted file mode 100644 index a44db6f2a..000000000 --- a/packages/content-fetch/derstandard-handler.js +++ /dev/null @@ -1,35 +0,0 @@ -/* eslint-disable no-undef */ -/* eslint-disable no-empty */ -/* eslint-disable @typescript-eslint/explicit-function-return-type */ -/* eslint-disable @typescript-eslint/no-var-requires */ -/* eslint-disable @typescript-eslint/no-require-imports */ -require('dotenv').config(); -const axios = require('axios'); -const { parseHTML } = require('linkedom'); - -exports.derstandardHandler = { - shouldPrehandle: (url, env) => { - const u = new URL(url); - return u.hostname === 'www.derstandard.at'; - }, - - prehandle: async (url, env) => { - const response = await axios.get(url, { - // set cookie to give consent to get the article - headers: { - 'cookie': `DSGVO_ZUSAGE_V1=true; consentUUID=2bacb9c1-1e80-4be0-9f7b-ee987cf4e7b0_6` - }, - }); - const content = response.data; - - var title = undefined; - const dom = parseHTML(content).document; - const titleElement = dom.querySelector('.article-title') - if (!titleElement) { - title = titleElement.textContent - titleElement.remove() - } - - return { content: dom.body.outerHTML, title: title }; - } -} diff --git a/packages/content-fetch/fetch-content.js b/packages/content-fetch/fetch-content.js index 01dbe199c..658229e3d 100644 --- a/packages/content-fetch/fetch-content.js +++ b/packages/content-fetch/fetch-content.js @@ -9,16 +9,10 @@ const puppeteer = require('puppeteer-core'); const axios = require('axios'); const jwt = require('jsonwebtoken'); const { promisify } = require('util'); +const { parseHTML } = require('linkedom'); +const { preHandleContent } = require('@omnivore/content-handler'); + const signToken = promisify(jwt.sign); -const { appleNewsHandler } = require('./apple-news-handler'); -const { twitterHandler } = require('./twitter-handler'); -const { youtubeHandler } = require('./youtube-handler'); -const { tDotCoHandler } = require('./t-dot-co-handler'); -const { pdfHandler } = require('./pdf-handler'); -const { mediumHandler } = require('./medium-handler'); -const { derstandardHandler } = require('./derstandard-handler'); -const { imageHandler } = require('./image-handler'); -const { scrapingBeeHandler } = require('./scrapingBee-handler') const MOBILE_USER_AGENT = 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)' const DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36' @@ -29,8 +23,6 @@ const NON_SCRIPT_HOSTS= ['medium.com', 'fastcompany.com']; const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/plain', 'application/pdf']; -const { parseHTML } = require('linkedom'); - // Add stealth plugin to hide puppeteer usage // const StealthPlugin = require('puppeteer-extra-plugin-stealth'); // puppeteer.use(StealthPlugin()); @@ -207,19 +199,6 @@ const saveUploadedPdf = async (userId, url, uploadFileId, articleSavingRequestId ); }; -const handlers = { - 'pdf': pdfHandler, - 'apple-news': appleNewsHandler, - 'twitter': twitterHandler, - 'youtube': youtubeHandler, - 't-dot-co': tDotCoHandler, - 'medium': mediumHandler, - 'derstandard': derstandardHandler, - 'image': imageHandler, - 'scrapingBee': scrapingBeeHandler, -}; - - async function fetchContent(req, res) { functionStartTime = Date.now(); @@ -246,61 +225,18 @@ async function fetchContent(req, res) { return res.sendStatus(400); } - // if (!userId || !articleSavingRequestId) { - // Object.assign(logRecord, { invalidParams: true, body: req.body, query: req.query }); - // console.log(`Invalid parameters`, logRecord); - // return res.sendStatus(400); - // } - - // Before we run the regular handlers we check to see if we need tp - // pre-resolve the URL. TODO: This should probably happen recursively, - // so URLs can be pre-resolved, handled, pre-resolved, handled, etc. - for (const [key, handler] of Object.entries(handlers)) { - if (handler.shouldResolve && handler.shouldResolve(url)) { - try { - url = await handler.resolve(url); - validateUrlString(url); - } catch (err) { - console.log('error resolving url with handler', key, err); - } - break; - } - } - - // Before we fetch the page we check the handlers, to see if they want - // to perform a prefetch action that can modify our requests. - // enumerate the handlers and see if any of them want to handle the request - const handler = Object.keys(handlers).find(key => { - try { - return handlers[key].shouldPrehandle(url) - } catch (e) { - console.log('error with handler: ', key, e); - } - return false; - }); - - var title = undefined; - var content = undefined; - var contentType = undefined; - - if (handler) { - try { - // The only handler we have now can modify the URL, but in the - // future maybe we let it modify content. In that case - // we might exit the request early. - console.log('pre-handling url with handler: ', handler); - - const result = await handlers[handler].prehandle(url); - if (result && result.url) { - url = result.url - validateUrlString(url); - } - if (result && result.title) { title = result.title } - if (result && result.content) { content = result.content } - if (result && result.contentType) { contentType = result.contentType } - } catch (e) { - console.log('error with handler: ', handler, e); + let title, content, contentType; + try { + const result = await preHandleContent(url); + if (result && result.url) { + url = result.url + validateUrlString(url); } + if (result && result.title) { title = result.title } + if (result && result.content) { content = result.content } + if (result && result.contentType) { contentType = result.contentType } + } catch (e) { + console.log('error with handler: ', e); } let context, page, finalUrl; diff --git a/packages/content-fetch/image-handler.js b/packages/content-fetch/image-handler.js deleted file mode 100644 index 59f132afc..000000000 --- a/packages/content-fetch/image-handler.js +++ /dev/null @@ -1,34 +0,0 @@ -/* eslint-disable no-undef */ -/* eslint-disable no-empty */ -/* eslint-disable @typescript-eslint/explicit-function-return-type */ -/* eslint-disable @typescript-eslint/no-var-requires */ -/* eslint-disable @typescript-eslint/no-require-imports */ -require('dotenv').config(); - - -exports.imageHandler = { - shouldPrehandle: (url, env) => { - const IMAGE_URL_PATTERN = - /(https?:\/\/.*\.(?:jpg|jpeg|png|webp))/i - return IMAGE_URL_PATTERN.test(url.toString()) - }, - - prehandle: async (url, env) => { - const title = url.toString().split('/').pop(); - const content = ` - - - ${title} - - - - -
- ${title} -
- - ` - - return { title, content }; - } -} diff --git a/packages/content-fetch/medium-handler.js b/packages/content-fetch/medium-handler.js deleted file mode 100644 index e6a605a0e..000000000 --- a/packages/content-fetch/medium-handler.js +++ /dev/null @@ -1,29 +0,0 @@ -/* eslint-disable no-undef */ -/* eslint-disable no-empty */ -/* eslint-disable @typescript-eslint/explicit-function-return-type */ -/* eslint-disable @typescript-eslint/no-var-requires */ -/* eslint-disable @typescript-eslint/no-require-imports */ -require('dotenv').config(); -const axios = require('axios'); -const os = require('os'); - -exports.mediumHandler = { - - shouldPrehandle: (url, env) => { - const u = new URL(url); - return u.hostname.endsWith('medium.com') - }, - - prehandle: async (url, env) => { - console.log('prehandling medium url', url) - - try { - const res = new URL(url); - res.searchParams.delete('source'); - return { url: res.toString() } - } catch (error) { - console.error('error prehandling medium url', error) - throw error - } - } -} diff --git a/packages/content-fetch/package.json b/packages/content-fetch/package.json index f58675a74..3df85a237 100644 --- a/packages/content-fetch/package.json +++ b/packages/content-fetch/package.json @@ -11,7 +11,8 @@ "linkedom": "^0.14.9", "luxon": "^2.3.1", "puppeteer-core": "^16.1.0", - "underscore": "^1.13.4" + "underscore": "^1.13.4", + "@omnivore/content-handler": "1.0.0" }, "scripts": { "start": "node app.js", diff --git a/packages/content-fetch/pdf-handler.js b/packages/content-fetch/pdf-handler.js deleted file mode 100644 index 1260db287..000000000 --- a/packages/content-fetch/pdf-handler.js +++ /dev/null @@ -1,21 +0,0 @@ -/* eslint-disable no-undef */ -/* eslint-disable no-empty */ -/* eslint-disable @typescript-eslint/explicit-function-return-type */ -/* eslint-disable @typescript-eslint/no-var-requires */ -/* eslint-disable @typescript-eslint/no-require-imports */ -require('dotenv').config(); -const Url = require('url'); - - -exports.pdfHandler = { - - shouldPrehandle: (url, env) => { - const u = Url.parse(url) - const path = u.path.replace(u.search, '') - return path.endsWith('.pdf') - }, - - prehandle: async (url, env) => { - return { contentType: 'application/pdf' }; - } -} diff --git a/packages/content-fetch/scrapingBee-handler.js b/packages/content-fetch/scrapingBee-handler.js deleted file mode 100644 index 6563fca44..000000000 --- a/packages/content-fetch/scrapingBee-handler.js +++ /dev/null @@ -1,44 +0,0 @@ -/* eslint-disable no-undef */ -/* eslint-disable no-empty */ -/* eslint-disable @typescript-eslint/explicit-function-return-type */ -/* eslint-disable @typescript-eslint/no-var-requires */ -/* eslint-disable @typescript-eslint/no-require-imports */ -require('dotenv').config(); -const axios = require('axios'); -const { parseHTML } = require('linkedom'); - -const os = require('os'); - -exports.scrapingBeeHandler = { - - shouldPrehandle: (url, env) => { - const u = new URL(url); - const hostnames = [ - 'nytimes.com', - 'news.google.com', - ] - - return hostnames.some((h) => u.hostname.endsWith(h)) - }, - - prehandle: async (url, env) => { - console.log('prehandling url with scrapingbee', url) - - try { - const response = await axios.get('https://app.scrapingbee.com/api/v1', { - params: { - 'api_key': process.env.SCRAPINGBEE_API_KEY, - 'url': url, - 'return_page_source': true, - 'block_ads': true, - 'block_resources': false, - } - }) - const dom = parseHTML(response.data).document; - return { title: dom.title, content: response.data, url: url } - } catch (error) { - console.error('error prehandling url w/scrapingbee', error) - throw error - } - } -} diff --git a/packages/content-fetch/t-dot-co-handler.js b/packages/content-fetch/t-dot-co-handler.js deleted file mode 100644 index 170f97fb7..000000000 --- a/packages/content-fetch/t-dot-co-handler.js +++ /dev/null @@ -1,31 +0,0 @@ -/* eslint-disable no-undef */ -/* eslint-disable no-empty */ -/* eslint-disable @typescript-eslint/explicit-function-return-type */ -/* eslint-disable @typescript-eslint/no-var-requires */ -/* eslint-disable @typescript-eslint/no-require-imports */ -require('dotenv').config(); -const axios = require('axios'); -const Url = require('url'); - - -exports.tDotCoHandler = { - - shouldResolve: function (url, env) { - const T_DOT_CO_URL_MATCH = /^https:\/\/(?:www\.)?t\.co\/.*$/; - return T_DOT_CO_URL_MATCH.test(url); - }, - - resolve: async function(url, env) { - return await axios.get(url, { maxRedirects: 0, validateStatus: null }) - .then(res => { - return Url.parse(res.headers.location).href; - }).catch((err) => { - console.log('err with t.co url', err); - return undefined; - }); - }, - - shouldPrehandle: (url, env) => { - return false - }, -} diff --git a/packages/content-fetch/test/apple-news-handler.test.js b/packages/content-fetch/test/apple-news-handler.test.js deleted file mode 100644 index 4531d720e..000000000 --- a/packages/content-fetch/test/apple-news-handler.test.js +++ /dev/null @@ -1,9 +0,0 @@ -const { expect } = require('chai') -const { appleNewsHandler } = require('../apple-news-handler') - -describe('open a simple web page', () => { - it('should return a response', async () => { - const response = await appleNewsHandler.prehandle('https://apple.news/AxjzaZaPvSn23b67LhXI5EQ') - console.log('response', response) - }) -}) diff --git a/packages/content-fetch/test/youtube-handler.test.js b/packages/content-fetch/test/youtube-handler.test.js deleted file mode 100644 index d34643773..000000000 --- a/packages/content-fetch/test/youtube-handler.test.js +++ /dev/null @@ -1,12 +0,0 @@ -const { expect } = require('chai') -const { getYoutubeVideoId } = require('../youtube-handler') - -describe('getYoutubeVideoId', () => { - it('should parse video id out of a URL', async () => { - expect('BnSUk0je6oo').to.eq(getYoutubeVideoId('https://www.youtube.com/watch?v=BnSUk0je6oo&t=269s')); - expect('vFD2gu007dc').to.eq(getYoutubeVideoId('https://www.youtube.com/watch?v=vFD2gu007dc&list=RDvFD2gu007dc&start_radio=1')); - expect('vFD2gu007dc').to.eq(getYoutubeVideoId('https://youtu.be/vFD2gu007dc')); - expect('BMFVCnbRaV4').to.eq(getYoutubeVideoId('https://youtube.com/watch?v=BMFVCnbRaV4&feature=share')); - expect('cg9b4RC87LI').to.eq(getYoutubeVideoId('https://youtu.be/cg9b4RC87LI?t=116')); - }) -}) diff --git a/packages/content-fetch/twitter-handler.js b/packages/content-fetch/twitter-handler.js deleted file mode 100644 index 7ae93072c..000000000 --- a/packages/content-fetch/twitter-handler.js +++ /dev/null @@ -1,172 +0,0 @@ -/* eslint-disable no-undef */ -/* eslint-disable no-empty */ -/* eslint-disable @typescript-eslint/explicit-function-return-type */ -/* eslint-disable @typescript-eslint/no-var-requires */ -/* eslint-disable @typescript-eslint/no-require-imports */ -require('dotenv').config(); -const axios = require('axios'); -const { DateTime } = require('luxon'); -const _ = require('underscore'); - -const TWITTER_BEARER_TOKEN = process.env.TWITTER_BEARER_TOKEN; -const TWITTER_URL_MATCH = /twitter\.com\/(?:#!\/)?(\w+)\/status(?:es)?\/(\d+)(?:\/.*)?/ - -const embeddedTweet = async (url) => { - - const BASE_ENDPOINT = 'https://publish.twitter.com/oembed' - - const apiUrl = new URL(BASE_ENDPOINT) - apiUrl.searchParams.append('url', url); - apiUrl.searchParams.append('omit_script', true); - apiUrl.searchParams.append('dnt', true); - - return await axios.get(apiUrl.toString(), { - headers: { - Authorization: `Bearer ${TWITTER_BEARER_TOKEN}`, - redirect: "follow", - }, - }); -}; - -const getTweetFields = () => { - const TWEET_FIELDS = - "&tweet.fields=attachments,author_id,conversation_id,created_at," + - "entities,geo,in_reply_to_user_id,lang,possibly_sensitive,public_metrics,referenced_tweets," + - "source,withheld"; - const EXPANSIONS = "&expansions=author_id,attachments.media_keys"; - const USER_FIELDS = - "&user.fields=created_at,description,entities,location,pinned_tweet_id,profile_image_url,protected,public_metrics,url,verified,withheld"; - const MEDIA_FIELDS = - "&media.fields=duration_ms,height,preview_image_url,url,media_key,public_metrics,width"; - - return `${TWEET_FIELDS}${EXPANSIONS}${USER_FIELDS}${MEDIA_FIELDS}`; -} - -const getTweetById = async (id) => { - const BASE_ENDPOINT = "https://api.twitter.com/2/tweets/"; - const apiUrl = new URL(BASE_ENDPOINT + id + '?' + getTweetFields()) - - return await axios.get(apiUrl.toString(), { - headers: { - Authorization: `Bearer ${TWITTER_BEARER_TOKEN}`, - redirect: "follow", - }, - }); -}; - -const getUserByUsername = async (username) => { - const BASE_ENDPOINT = "https://api.twitter.com/2/users/by/username/"; - - const apiUrl = new URL(BASE_ENDPOINT + username) - apiUrl.searchParams.append('user.fields', 'profile_image_url'); - - return await axios.get(apiUrl.toString(), { - headers: { - Authorization: `Bearer ${TWITTER_BEARER_TOKEN}`, - redirect: "follow", - }, - }); -}; - -const titleForTweet = (tweet) => { - return `${tweet.data.author_name} on Twitter` -}; - -const titleForAuthor = (author) => { - return `${author.name} on Twitter` -}; - -const usernameFromStatusUrl = (url) => { - const match = url.toString().match(TWITTER_URL_MATCH) - return match[1] -}; - -const tweetIdFromStatusUrl = (url) => { - const match = url.toString().match(TWITTER_URL_MATCH) - return match[2] -}; - -const formatTimestamp = (timestamp) => { - return DateTime.fromJSDate(new Date(timestamp)).toLocaleString(DateTime.DATETIME_FULL); -}; - -exports.twitterHandler = { - - shouldPrehandle: (url, env) => { - return TWITTER_BEARER_TOKEN && TWITTER_URL_MATCH.test(url.toString()) - }, - - // version of the handler that uses the oembed API - // This isn't great as it doesn't work well with our - // readability API. But could potentially give a more consistent - // look to the tweets - // prehandle: async (url, env) => { - // const oeTweet = await embeddedTweet(url) - // const dom = new JSDOM(oeTweet.data.html); - // const bq = dom.window.document.querySelector('blockquote') - // console.log('blockquote:', bq); - - // const title = titleForTweet(oeTweet) - // return { title, content: '
' + bq.innerHTML + '
', url: oeTweet.data.url }; - // } - - prehandle: async (url, env) => { - console.log('prehandling twitter url', url) - - const tweetId = tweetIdFromStatusUrl(url) - const tweetData = (await getTweetById(tweetId)).data; - const authorId = tweetData.data.author_id; - const author = tweetData.includes.users.filter(u => u.id = authorId)[0]; - // escape html entities in title - const title = _.escape(titleForAuthor(author)) - const authorImage = author.profile_image_url.replace('_normal', '_400x400') - - let text = tweetData.data.text; - if (tweetData.data.entities && tweetData.data.entities.urls) { - for (let urlObj of tweetData.data.entities.urls) { - text = text.replace( - urlObj.url, - `${urlObj.display_url}` - ); - } - } - - const front = ` -
-

${text}

- ` - - var includesHtml = ''; - if (tweetData.includes.media) { - includesHtml = tweetData.includes.media.map(m => { - const linkUrl = m.type == 'photo' ? m.url : url; - const previewUrl = m.type == 'photo' ? m.url : m.preview_image_url; - const mediaOpen = ` - - - - ` - return mediaOpen - }).join('\n'); - } - - const back = ` - — ${author.username} ${author.name} ${formatTimestamp(tweetData.data.created_at)} -
- ` - const content = ` - - - - - - - - ${front} - ${includesHtml} - ${back} - ` - - return { content, url, title }; - } -} diff --git a/packages/content-fetch/youtube-handler.js b/packages/content-fetch/youtube-handler.js deleted file mode 100644 index e1866428a..000000000 --- a/packages/content-fetch/youtube-handler.js +++ /dev/null @@ -1,68 +0,0 @@ -/* eslint-disable no-undef */ -/* eslint-disable no-empty */ -/* eslint-disable @typescript-eslint/explicit-function-return-type */ -/* eslint-disable @typescript-eslint/no-var-requires */ -/* eslint-disable @typescript-eslint/no-require-imports */ -require('dotenv').config(); -const axios = require('axios'); -const _ = require('underscore'); - -const YOUTUBE_URL_MATCH = - /^((?:https?:)?\/\/)?((?:www|m)\.)?((?:youtube\.com|youtu.be))(\/(?:[\w-]+\?v=|embed\/|v\/)?)([\w-]+)(\S+)?$/ - -function getYoutubeVideoId(url) { - const u = new URL(url); - const videoId = u.searchParams.get('v'); - if (!videoId) { - const match = url.toString().match(YOUTUBE_URL_MATCH) - if (match === null || match.length < 6 || !match[5]) { - return undefined - } - return match[5] - } - return videoId -} -exports.getYoutubeVideoId = getYoutubeVideoId - -exports.youtubeHandler = { - shouldPrehandle: (url, env) => { - return YOUTUBE_URL_MATCH.test(url.toString()) - }, - - prehandle: async (url, env) => { - const videoId = getYoutubeVideoId(url) - if (!videoId) { - return {} - } - - const oembedUrl = `https://www.youtube.com/oembed?format=json&url=` + encodeURIComponent(`https://www.youtube.com/watch?v=${videoId}`) - const oembed = (await axios.get(oembedUrl.toString())).data; - // escape html entities in title - const title = _.escape(oembed.title); - const ratio = oembed.width / oembed.height; - const thumbnail = oembed.thumbnail_url; - const height = 350; - const width = height * ratio; - const authorName = _.escape(oembed.author_name); - - const content = ` - - ${title} - - - - - - - - -

${title}

- - - ` - - console.log('got video id', videoId) - - return { content, title: 'Youtube Content' }; - } -} diff --git a/packages/content-handler/src/apple-news-handler.ts b/packages/content-handler/src/apple-news-handler.ts index 1958b3b8c..a4239565c 100644 --- a/packages/content-handler/src/apple-news-handler.ts +++ b/packages/content-handler/src/apple-news-handler.ts @@ -1,8 +1,13 @@ -import { ContentHandler, PreHandleResult } from './index' import axios from 'axios' import { parseHTML } from 'linkedom' +import { ContentHandler, PreHandleResult } from './content-handler' export class AppleNewsHandler extends ContentHandler { + constructor() { + super() + this.name = 'Apple News' + } + shouldPreHandle(url: string, dom?: Document): boolean { const u = new URL(url) return u.hostname === 'apple.news' diff --git a/packages/content-handler/src/bloomberg-handler.ts b/packages/content-handler/src/bloomberg-handler.ts index ac4fb23ed..95670630e 100644 --- a/packages/content-handler/src/bloomberg-handler.ts +++ b/packages/content-handler/src/bloomberg-handler.ts @@ -1,8 +1,13 @@ -import { ContentHandler, PreHandleResult } from './index' import axios from 'axios' import { parseHTML } from 'linkedom' +import { ContentHandler, PreHandleResult } from './content-handler' + +export class BloombergHandler extends ContentHandler { + constructor() { + super() + this.name = 'Bloomberg' + } -class BloombergHandler extends ContentHandler { shouldPreHandle(url: string, dom?: Document): boolean { const BLOOMBERG_URL_MATCH = /https?:\/\/(www\.)?bloomberg.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&/=]*)/ diff --git a/packages/content-handler/src/content-handler.ts b/packages/content-handler/src/content-handler.ts new file mode 100644 index 000000000..c49987368 --- /dev/null +++ b/packages/content-handler/src/content-handler.ts @@ -0,0 +1,126 @@ +import addressparser from 'addressparser' +import rfc2047 from 'rfc2047' +import { v4 as uuidv4 } from 'uuid' + +interface Unsubscribe { + mailTo?: string + httpUrl?: string +} + +interface NewsletterMessage { + email: string + content: string + url: string + title: string + author: string + unsubMailTo?: string + unsubHttpUrl?: string +} + +export interface PreHandleResult { + url?: string + title?: string + content?: string + contentType?: string + dom?: Document +} + +export abstract class ContentHandler { + protected senderRegex: RegExp + protected urlRegex: RegExp + protected defaultUrl: string + public name: string + + protected constructor() { + this.senderRegex = new RegExp(/NEWSLETTER_SENDER_REGEX/) + this.urlRegex = new RegExp(/NEWSLETTER_URL_REGEX/) + this.defaultUrl = 'NEWSLETTER_DEFAULT_URL' + this.name = 'Handler name' + } + + shouldResolve(url: string): boolean { + return false + } + + async resolve(url: string): Promise { + return Promise.resolve(url) + } + + shouldPreHandle(url: string, dom?: Document): boolean { + return false + } + + async preHandle(url: string, document?: Document): Promise { + return Promise.resolve({ url, dom: document }) + } + + isNewsletter(postHeader: string, from: string, unSubHeader: string): boolean { + // Axios newsletter is from + const re = new RegExp(this.senderRegex) + return re.test(from) && (!!postHeader || !!unSubHeader) + } + + parseNewsletterUrl(_postHeader: string, html: string): string | undefined { + // get newsletter url from html + const matches = html.match(this.urlRegex) + if (matches) { + return matches[1] + } + return undefined + } + + parseAuthor(from: string): string { + // get author name from email + // e.g. 'Jackson Harper from Omnivore App ' + // or 'Mike Allen ' + const parsed = addressparser(from) + if (parsed.length > 0) { + return parsed[0].name + } + return from + } + + parseUnsubscribe(unSubHeader: string): Unsubscribe { + // parse list-unsubscribe header + // e.g. List-Unsubscribe: , + const decoded = rfc2047.decode(unSubHeader) + return { + mailTo: decoded.match(/<(https?:\/\/[^>]*)>/)?.[1], + httpUrl: decoded.match(/]*)>/)?.[1], + } + } + + handleNewsletter( + email: string, + html: string, + postHeader: string, + title: string, + from: string, + unSubHeader: string + ): NewsletterMessage { + console.log('handleNewsletter', email, postHeader, title, from) + + if (!email || !html || !title || !from) { + console.log('invalid newsletter email') + throw new Error('invalid newsletter email') + } + + // fallback to default url if newsletter url does not exist + // assign a random uuid to the default url to avoid duplicate url + const url = + this.parseNewsletterUrl(postHeader, html) || + `${this.defaultUrl}?source=newsletters&id=${uuidv4()}` + const author = this.parseAuthor(from) + const unsubscribe = this.parseUnsubscribe(unSubHeader) + + return { + email, + content: html, + url, + title, + author, + unsubMailTo: unsubscribe.mailTo || '', + unsubHttpUrl: unsubscribe.httpUrl || '', + } + } +} diff --git a/packages/content-handler/src/derstandard-handler.ts b/packages/content-handler/src/derstandard-handler.ts index 1a1f50778..2ac01ac86 100644 --- a/packages/content-handler/src/derstandard-handler.ts +++ b/packages/content-handler/src/derstandard-handler.ts @@ -1,8 +1,13 @@ -import { ContentHandler, PreHandleResult } from './index' +import { ContentHandler, PreHandleResult } from './content-handler' import axios from 'axios' import { parseHTML } from 'linkedom' -class DerstandardHandler extends ContentHandler { +export class DerstandardHandler extends ContentHandler { + constructor() { + super() + this.name = 'Derstandard' + } + shouldPreHandle(url: string, dom?: Document): boolean { const u = new URL(url) return u.hostname === 'www.derstandard.at' diff --git a/packages/content-handler/src/image-handler.ts b/packages/content-handler/src/image-handler.ts index cb5e462c7..652756c51 100644 --- a/packages/content-handler/src/image-handler.ts +++ b/packages/content-handler/src/image-handler.ts @@ -1,6 +1,11 @@ -import { ContentHandler, PreHandleResult } from './index' +import { ContentHandler, PreHandleResult } from './content-handler' + +export class ImageHandler extends ContentHandler { + constructor() { + super() + this.name = 'Image' + } -class ImageHandler extends ContentHandler { shouldPreHandle(url: string, dom?: Document): boolean { const IMAGE_URL_PATTERN = /(https?:\/\/.*\.(?:jpg|jpeg|png|webp))/i return IMAGE_URL_PATTERN.test(url.toString()) diff --git a/packages/content-handler/src/index.ts b/packages/content-handler/src/index.ts index bfb0214e4..3275a3c89 100644 --- a/packages/content-handler/src/index.ts +++ b/packages/content-handler/src/index.ts @@ -1,111 +1,76 @@ -import addressparser from 'addressparser' -import { v4 as uuidv4 } from 'uuid' -import rfc2047 from 'rfc2047' +import { AppleNewsHandler } from './apple-news-handler' +import { BloombergHandler } from './bloomberg-handler' +import { DerstandardHandler } from './derstandard-handler' +import { ImageHandler } from './image-handler' +import { MediumHandler } from './medium-handler' +import { PdfHandler } from './pdf-handler' +import { ScrapingBeeHandler } from './scrapingBee-handler' +import { TDotCoHandler } from './t-dot-co-handler' +import { TwitterHandler } from './twitter-handler' +import { YoutubeHandler } from './youtube-handler' +import { ContentHandler, PreHandleResult } from './content-handler' -interface Unsubscribe { - mailTo?: string - httpUrl?: string +const validateUrlString = (url: string) => { + const u = new URL(url) + // Make sure the URL is http or https + if (u.protocol !== 'http:' && u.protocol !== 'https:') { + throw new Error('Invalid URL protocol check failed') + } + // Make sure the domain is not localhost + if (u.hostname === 'localhost' || u.hostname === '0.0.0.0') { + throw new Error('Invalid URL is localhost') + } + // Make sure the domain is not a private IP + if (/^(10|172\.16|192\.168)\..*/.test(u.hostname)) { + throw new Error('Invalid URL is private ip') + } } -interface NewsletterMessage { - email: string - content: string - url: string - title: string - author: string - unsubMailTo?: string - unsubHttpUrl?: string -} +const contentHandlers: ContentHandler[] = [ + new AppleNewsHandler(), + new BloombergHandler(), + new DerstandardHandler(), + new ImageHandler(), + new MediumHandler(), + new PdfHandler(), + new ScrapingBeeHandler(), + new TDotCoHandler(), + new TwitterHandler(), + new YoutubeHandler(), +] -export interface PreHandleResult { - url?: string - title?: string - content?: string - contentType?: string +export const preHandleContent = async ( + url: string, dom?: Document +): Promise => { + // Before we run the regular handlers we check to see if we need tp + // pre-resolve the URL. TODO: This should probably happen recursively, + // so URLs can be pre-resolved, handled, pre-resolved, handled, etc. + for (const handler of contentHandlers) { + if (handler.shouldResolve(url)) { + try { + const resolvedUrl = await handler.resolve(url) + if (resolvedUrl && validateUrlString(resolvedUrl)) { + url = resolvedUrl + } + } catch (err) { + console.log('error resolving url with handler', handler.name, err) + } + break + } + } + // Before we fetch the page we check the handlers, to see if they want + // to perform a prefetch action that can modify our requests. + // enumerate the handlers and see if any of them want to handle the request + for (const handler of contentHandlers) { + if (handler.shouldPreHandle(url, dom)) { + console.log('preHandleContent', handler.name, url) + return handler.preHandle(url, dom) + } + } + return undefined } -export class ContentHandler { - protected senderRegex = /NEWSLETTER_SENDER_REGEX/ - protected urlRegex = /NEWSLETTER_URL_REGEX/ - protected defaultUrl = 'NEWSLETTER_DEFAULT_URL' - protected name = '' - - shouldPreHandle(url: string, dom?: Document): boolean { - return false - } - - async preHandle(url: string, document?: Document): Promise { - return Promise.resolve({ url, dom: document }) - } - - isNewsletter(postHeader: string, from: string, unSubHeader: string): boolean { - // Axios newsletter is from - const re = new RegExp(this.senderRegex) - return re.test(from) && (!!postHeader || !!unSubHeader) - } - - parseNewsletterUrl(_postHeader: string, html: string): string | undefined { - // get newsletter url from html - const matches = html.match(this.urlRegex) - if (matches) { - return matches[1] - } - return undefined - } - - parseAuthor(from: string): string { - // get author name from email - // e.g. 'Jackson Harper from Omnivore App ' - // or 'Mike Allen ' - const parsed = addressparser(from) - if (parsed.length > 0) { - return parsed[0].name - } - return from - } - - parseUnsubscribe(unSubHeader: string): Unsubscribe { - // parse list-unsubscribe header - // e.g. List-Unsubscribe: , - const decoded = rfc2047.decode(unSubHeader) - return { - mailTo: decoded.match(/<(https?:\/\/[^>]*)>/)?.[1], - httpUrl: decoded.match(/]*)>/)?.[1], - } - } - - handleNewsletter( - email: string, - html: string, - postHeader: string, - title: string, - from: string, - unSubHeader: string - ): NewsletterMessage { - console.log('handleNewsletter', email, postHeader, title, from) - - if (!email || !html || !title || !from) { - console.log('invalid newsletter email') - throw new Error('invalid newsletter email') - } - - // fallback to default url if newsletter url does not exist - // assign a random uuid to the default url to avoid duplicate url - const url = - this.parseNewsletterUrl(postHeader, html) || - `${this.defaultUrl}?source=newsletters&id=${uuidv4()}` - const author = this.parseAuthor(from) - const unsubscribe = this.parseUnsubscribe(unSubHeader) - - return { - email, - content: html, - url, - title, - author, - unsubMailTo: unsubscribe.mailTo || '', - unsubHttpUrl: unsubscribe.httpUrl || '', - } - } +module.exports = { + preHandleContent, } diff --git a/packages/content-handler/src/medium-handler.ts b/packages/content-handler/src/medium-handler.ts index 0b9d2fcb5..8e14cebfe 100644 --- a/packages/content-handler/src/medium-handler.ts +++ b/packages/content-handler/src/medium-handler.ts @@ -1,6 +1,11 @@ -import { ContentHandler, PreHandleResult } from './index' +import { ContentHandler, PreHandleResult } from './content-handler' + +export class MediumHandler extends ContentHandler { + constructor() { + super() + this.name = 'Medium' + } -class MediumHandler extends ContentHandler { shouldPreHandle(url: string, dom?: Document): boolean { const u = new URL(url) return u.hostname.endsWith('medium.com') diff --git a/packages/content-handler/src/pdf-handler.ts b/packages/content-handler/src/pdf-handler.ts index 54df72bc2..245f9fc1b 100644 --- a/packages/content-handler/src/pdf-handler.ts +++ b/packages/content-handler/src/pdf-handler.ts @@ -1,6 +1,11 @@ -import { ContentHandler, PreHandleResult } from './index' +import { ContentHandler, PreHandleResult } from './content-handler' + +export class PdfHandler extends ContentHandler { + constructor() { + super() + this.name = 'PDF' + } -class PdfHandler extends ContentHandler { shouldPreHandle(url: string, dom?: Document): boolean { const u = new URL(url) const path = u.pathname.replace(u.search, '') diff --git a/packages/content-handler/src/scrapingBee-handler.ts b/packages/content-handler/src/scrapingBee-handler.ts index 0b1b5984b..792c5a75c 100644 --- a/packages/content-handler/src/scrapingBee-handler.ts +++ b/packages/content-handler/src/scrapingBee-handler.ts @@ -1,8 +1,13 @@ -import { ContentHandler, PreHandleResult } from './index' +import { ContentHandler, PreHandleResult } from './content-handler' import axios from 'axios' import { parseHTML } from 'linkedom' -class ScrapingBeeHandler extends ContentHandler { +export class ScrapingBeeHandler extends ContentHandler { + constructor() { + super() + this.name = 'ScrapingBee' + } + shouldPreHandle(url: string, dom?: Document): boolean { const u = new URL(url) const hostnames = ['nytimes.com', 'news.google.com'] diff --git a/packages/content-handler/src/t-dot-co-handler.ts b/packages/content-handler/src/t-dot-co-handler.ts index 41c90e4a5..b4b461c2f 100644 --- a/packages/content-handler/src/t-dot-co-handler.ts +++ b/packages/content-handler/src/t-dot-co-handler.ts @@ -1,7 +1,12 @@ -import { ContentHandler } from './index' +import { ContentHandler } from './content-handler' import axios from 'axios' -class TDotCoHandler extends ContentHandler { +export class TDotCoHandler extends ContentHandler { + constructor() { + super() + this.name = 't.co' + } + shouldResolve(url: string): boolean { const T_DOT_CO_URL_MATCH = /^https:\/\/(?:www\.)?t\.co\/.*$/ return T_DOT_CO_URL_MATCH.test(url) diff --git a/packages/content-handler/src/twitter-handler.ts b/packages/content-handler/src/twitter-handler.ts index 10d9ead03..b7345efe5 100644 --- a/packages/content-handler/src/twitter-handler.ts +++ b/packages/content-handler/src/twitter-handler.ts @@ -1,4 +1,4 @@ -import { ContentHandler, PreHandleResult } from './index' +import { ContentHandler, PreHandleResult } from './content-handler' import axios from 'axios' import { DateTime } from 'luxon' import _ from 'underscore' @@ -52,7 +52,12 @@ const formatTimestamp = (timestamp: string) => { ) } -class TwitterHandler extends ContentHandler { +export class TwitterHandler extends ContentHandler { + constructor() { + super() + this.name = 'Twitter' + } + shouldPreHandle(url: string, dom?: Document): boolean { return !!TWITTER_BEARER_TOKEN && TWITTER_URL_MATCH.test(url.toString()) } diff --git a/packages/content-handler/src/youtube-handler.ts b/packages/content-handler/src/youtube-handler.ts index 0cbe0df57..898810fe9 100644 --- a/packages/content-handler/src/youtube-handler.ts +++ b/packages/content-handler/src/youtube-handler.ts @@ -1,4 +1,4 @@ -import { ContentHandler, PreHandleResult } from './index' +import { ContentHandler, PreHandleResult } from './content-handler' import axios from 'axios' import _ from 'underscore' @@ -18,7 +18,12 @@ export const getYoutubeVideoId = (url: string) => { return videoId } -class YoutubeHandler extends ContentHandler { +export class YoutubeHandler extends ContentHandler { + constructor() { + super() + this.name = 'Youtube' + } + shouldPreHandle(url: string, dom?: Document): boolean { return YOUTUBE_URL_MATCH.test(url.toString()) } diff --git a/yarn.lock b/yarn.lock index d1c9a65dd..3da81da9b 100644 --- a/yarn.lock +++ b/yarn.lock @@ -10579,19 +10579,6 @@ chai@^4.3.4: pathval "^1.1.1" type-detect "^4.0.5" -chai@^4.3.6: - version "4.3.6" - resolved "https://registry.yarnpkg.com/chai/-/chai-4.3.6.tgz#ffe4ba2d9fa9d6680cc0b370adae709ec9011e9c" - integrity sha512-bbcp3YfHCUzMOvKqsztczerVgBKSsEijCySNlHHbX3VG1nskvqjz5Rfso1gGwD6w6oOV3eI60pKuMOV5MV7p3Q== - dependencies: - assertion-error "^1.1.0" - check-error "^1.0.2" - deep-eql "^3.0.1" - get-func-name "^2.0.0" - loupe "^2.3.1" - pathval "^1.1.1" - type-detect "^4.0.5" - chalk@^1.0.0, chalk@^1.1.3: version "1.1.3" resolved "https://registry.yarnpkg.com/chalk/-/chalk-1.1.3.tgz#a8115c55e4a702fe4d150abd3872822a7e09fc98" @@ -18078,13 +18065,6 @@ loose-envify@^1.0.0, loose-envify@^1.1.0, loose-envify@^1.4.0: dependencies: js-tokens "^3.0.0 || ^4.0.0" -loupe@^2.3.1: - version "2.3.4" - resolved "https://registry.yarnpkg.com/loupe/-/loupe-2.3.4.tgz#7e0b9bffc76f148f9be769cb1321d3dcf3cb25f3" - integrity sha512-OvKfgCC2Ndby6aSTREl5aCCPTNIzlDfQZvZxNUrBrihDhL3xcrYegTblhmEiCrg2kKQz4XsFIaemE5BF4ybSaQ== - dependencies: - get-func-name "^2.0.0" - lower-case-first@^1.0.0: version "1.0.2" resolved "https://registry.yarnpkg.com/lower-case-first/-/lower-case-first-1.0.2.tgz#e5da7c26f29a7073be02d52bac9980e5922adfa1"