diff --git a/packages/content-fetch/apple-news-handler.js b/packages/content-fetch/apple-news-handler.js
deleted file mode 100644
index 0759dec23..000000000
--- a/packages/content-fetch/apple-news-handler.js
+++ /dev/null
@@ -1,36 +0,0 @@
-/* eslint-disable no-undef */
-/* eslint-disable no-empty */
-/* eslint-disable @typescript-eslint/explicit-function-return-type */
-/* eslint-disable @typescript-eslint/no-var-requires */
-/* eslint-disable @typescript-eslint/no-require-imports */
-require('dotenv').config();
-const Url = require('url');
-const axios = require('axios');
-const { promisify } = require('util');
-const { DateTime } = require('luxon');
-const os = require('os');
-const { Cipher } = require('crypto');
-const { parseHTML } = require('linkedom');
-
-exports.appleNewsHandler = {
-
- shouldPrehandle: (url, env) => {
- const u = new URL(url);
- if (u.hostname === 'apple.news') {
- return true;
- }
- return false
- },
-
- prehandle: async (url, env) => {
- const MOBILE_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
- const response = await axios.get(url, { headers: { 'User-Agent': MOBILE_USER_AGENT } } );
- const data = response.data;
-
- const dom = parseHTML(data).document;
-
- // make sure its a valid URL by wrapping in new URL
- const u = new URL(dom.querySelector('span.click-here').parentNode.href);
- return { url: u.href };
- }
-}
diff --git a/packages/content-fetch/bloomberg-handler.js b/packages/content-fetch/bloomberg-handler.js
deleted file mode 100644
index d79a568bb..000000000
--- a/packages/content-fetch/bloomberg-handler.js
+++ /dev/null
@@ -1,39 +0,0 @@
-/* eslint-disable no-undef */
-/* eslint-disable no-empty */
-/* eslint-disable @typescript-eslint/explicit-function-return-type */
-/* eslint-disable @typescript-eslint/no-var-requires */
-/* eslint-disable @typescript-eslint/no-require-imports */
-require('dotenv').config();
-const axios = require('axios');
-const os = require('os');
-const { parseHTML } = require('linkedom');
-
-exports.bloombergHandler = {
-
- shouldPrehandle: (url, env) => {
- const BLOOMBERG_URL_MATCH =
- /https?:\/\/(www\.)?bloomberg.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)/
- return BLOOMBERG_URL_MATCH.test(url.toString())
- },
-
- prehandle: async (url, env) => {
- console.log('prehandling bloomberg url', url)
-
- try {
- const response = await axios.get('https://app.scrapingbee.com/api/v1', {
- params: {
- 'api_key': process.env.SCRAPINGBEE_API_KEY,
- 'url': url,
- 'return_page_source': true,
- 'block_ads': true,
- 'block_resources': false,
- }
- })
- const dom = parseHTML(response.data).document;
- return { title: dom.title, content: dom.querySelector('body').innerHTML, url: url }
- } catch (error) {
- console.error('error prehandling bloomberg url', error)
- throw error
- }
- }
-}
diff --git a/packages/content-fetch/derstandard-handler.js b/packages/content-fetch/derstandard-handler.js
deleted file mode 100644
index a44db6f2a..000000000
--- a/packages/content-fetch/derstandard-handler.js
+++ /dev/null
@@ -1,35 +0,0 @@
-/* eslint-disable no-undef */
-/* eslint-disable no-empty */
-/* eslint-disable @typescript-eslint/explicit-function-return-type */
-/* eslint-disable @typescript-eslint/no-var-requires */
-/* eslint-disable @typescript-eslint/no-require-imports */
-require('dotenv').config();
-const axios = require('axios');
-const { parseHTML } = require('linkedom');
-
-exports.derstandardHandler = {
- shouldPrehandle: (url, env) => {
- const u = new URL(url);
- return u.hostname === 'www.derstandard.at';
- },
-
- prehandle: async (url, env) => {
- const response = await axios.get(url, {
- // set cookie to give consent to get the article
- headers: {
- 'cookie': `DSGVO_ZUSAGE_V1=true; consentUUID=2bacb9c1-1e80-4be0-9f7b-ee987cf4e7b0_6`
- },
- });
- const content = response.data;
-
- var title = undefined;
- const dom = parseHTML(content).document;
- const titleElement = dom.querySelector('.article-title')
- if (!titleElement) {
- title = titleElement.textContent
- titleElement.remove()
- }
-
- return { content: dom.body.outerHTML, title: title };
- }
-}
diff --git a/packages/content-fetch/fetch-content.js b/packages/content-fetch/fetch-content.js
index 01dbe199c..658229e3d 100644
--- a/packages/content-fetch/fetch-content.js
+++ b/packages/content-fetch/fetch-content.js
@@ -9,16 +9,10 @@ const puppeteer = require('puppeteer-core');
const axios = require('axios');
const jwt = require('jsonwebtoken');
const { promisify } = require('util');
+const { parseHTML } = require('linkedom');
+const { preHandleContent } = require('@omnivore/content-handler');
+
const signToken = promisify(jwt.sign);
-const { appleNewsHandler } = require('./apple-news-handler');
-const { twitterHandler } = require('./twitter-handler');
-const { youtubeHandler } = require('./youtube-handler');
-const { tDotCoHandler } = require('./t-dot-co-handler');
-const { pdfHandler } = require('./pdf-handler');
-const { mediumHandler } = require('./medium-handler');
-const { derstandardHandler } = require('./derstandard-handler');
-const { imageHandler } = require('./image-handler');
-const { scrapingBeeHandler } = require('./scrapingBee-handler')
const MOBILE_USER_AGENT = 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
const DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
@@ -29,8 +23,6 @@ const NON_SCRIPT_HOSTS= ['medium.com', 'fastcompany.com'];
const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/plain', 'application/pdf'];
-const { parseHTML } = require('linkedom');
-
// Add stealth plugin to hide puppeteer usage
// const StealthPlugin = require('puppeteer-extra-plugin-stealth');
// puppeteer.use(StealthPlugin());
@@ -207,19 +199,6 @@ const saveUploadedPdf = async (userId, url, uploadFileId, articleSavingRequestId
);
};
-const handlers = {
- 'pdf': pdfHandler,
- 'apple-news': appleNewsHandler,
- 'twitter': twitterHandler,
- 'youtube': youtubeHandler,
- 't-dot-co': tDotCoHandler,
- 'medium': mediumHandler,
- 'derstandard': derstandardHandler,
- 'image': imageHandler,
- 'scrapingBee': scrapingBeeHandler,
-};
-
-
async function fetchContent(req, res) {
functionStartTime = Date.now();
@@ -246,61 +225,18 @@ async function fetchContent(req, res) {
return res.sendStatus(400);
}
- // if (!userId || !articleSavingRequestId) {
- // Object.assign(logRecord, { invalidParams: true, body: req.body, query: req.query });
- // console.log(`Invalid parameters`, logRecord);
- // return res.sendStatus(400);
- // }
-
- // Before we run the regular handlers we check to see if we need tp
- // pre-resolve the URL. TODO: This should probably happen recursively,
- // so URLs can be pre-resolved, handled, pre-resolved, handled, etc.
- for (const [key, handler] of Object.entries(handlers)) {
- if (handler.shouldResolve && handler.shouldResolve(url)) {
- try {
- url = await handler.resolve(url);
- validateUrlString(url);
- } catch (err) {
- console.log('error resolving url with handler', key, err);
- }
- break;
- }
- }
-
- // Before we fetch the page we check the handlers, to see if they want
- // to perform a prefetch action that can modify our requests.
- // enumerate the handlers and see if any of them want to handle the request
- const handler = Object.keys(handlers).find(key => {
- try {
- return handlers[key].shouldPrehandle(url)
- } catch (e) {
- console.log('error with handler: ', key, e);
- }
- return false;
- });
-
- var title = undefined;
- var content = undefined;
- var contentType = undefined;
-
- if (handler) {
- try {
- // The only handler we have now can modify the URL, but in the
- // future maybe we let it modify content. In that case
- // we might exit the request early.
- console.log('pre-handling url with handler: ', handler);
-
- const result = await handlers[handler].prehandle(url);
- if (result && result.url) {
- url = result.url
- validateUrlString(url);
- }
- if (result && result.title) { title = result.title }
- if (result && result.content) { content = result.content }
- if (result && result.contentType) { contentType = result.contentType }
- } catch (e) {
- console.log('error with handler: ', handler, e);
+ let title, content, contentType;
+ try {
+ const result = await preHandleContent(url);
+ if (result && result.url) {
+ url = result.url
+ validateUrlString(url);
}
+ if (result && result.title) { title = result.title }
+ if (result && result.content) { content = result.content }
+ if (result && result.contentType) { contentType = result.contentType }
+ } catch (e) {
+ console.log('error with handler: ', e);
}
let context, page, finalUrl;
diff --git a/packages/content-fetch/image-handler.js b/packages/content-fetch/image-handler.js
deleted file mode 100644
index 59f132afc..000000000
--- a/packages/content-fetch/image-handler.js
+++ /dev/null
@@ -1,34 +0,0 @@
-/* eslint-disable no-undef */
-/* eslint-disable no-empty */
-/* eslint-disable @typescript-eslint/explicit-function-return-type */
-/* eslint-disable @typescript-eslint/no-var-requires */
-/* eslint-disable @typescript-eslint/no-require-imports */
-require('dotenv').config();
-
-
-exports.imageHandler = {
- shouldPrehandle: (url, env) => {
- const IMAGE_URL_PATTERN =
- /(https?:\/\/.*\.(?:jpg|jpeg|png|webp))/i
- return IMAGE_URL_PATTERN.test(url.toString())
- },
-
- prehandle: async (url, env) => {
- const title = url.toString().split('/').pop();
- const content = `
-
-
- ${title}
-
-
-
-
-
-

-
-
- `
-
- return { title, content };
- }
-}
diff --git a/packages/content-fetch/medium-handler.js b/packages/content-fetch/medium-handler.js
deleted file mode 100644
index e6a605a0e..000000000
--- a/packages/content-fetch/medium-handler.js
+++ /dev/null
@@ -1,29 +0,0 @@
-/* eslint-disable no-undef */
-/* eslint-disable no-empty */
-/* eslint-disable @typescript-eslint/explicit-function-return-type */
-/* eslint-disable @typescript-eslint/no-var-requires */
-/* eslint-disable @typescript-eslint/no-require-imports */
-require('dotenv').config();
-const axios = require('axios');
-const os = require('os');
-
-exports.mediumHandler = {
-
- shouldPrehandle: (url, env) => {
- const u = new URL(url);
- return u.hostname.endsWith('medium.com')
- },
-
- prehandle: async (url, env) => {
- console.log('prehandling medium url', url)
-
- try {
- const res = new URL(url);
- res.searchParams.delete('source');
- return { url: res.toString() }
- } catch (error) {
- console.error('error prehandling medium url', error)
- throw error
- }
- }
-}
diff --git a/packages/content-fetch/package.json b/packages/content-fetch/package.json
index f58675a74..3df85a237 100644
--- a/packages/content-fetch/package.json
+++ b/packages/content-fetch/package.json
@@ -11,7 +11,8 @@
"linkedom": "^0.14.9",
"luxon": "^2.3.1",
"puppeteer-core": "^16.1.0",
- "underscore": "^1.13.4"
+ "underscore": "^1.13.4",
+ "@omnivore/content-handler": "1.0.0"
},
"scripts": {
"start": "node app.js",
diff --git a/packages/content-fetch/pdf-handler.js b/packages/content-fetch/pdf-handler.js
deleted file mode 100644
index 1260db287..000000000
--- a/packages/content-fetch/pdf-handler.js
+++ /dev/null
@@ -1,21 +0,0 @@
-/* eslint-disable no-undef */
-/* eslint-disable no-empty */
-/* eslint-disable @typescript-eslint/explicit-function-return-type */
-/* eslint-disable @typescript-eslint/no-var-requires */
-/* eslint-disable @typescript-eslint/no-require-imports */
-require('dotenv').config();
-const Url = require('url');
-
-
-exports.pdfHandler = {
-
- shouldPrehandle: (url, env) => {
- const u = Url.parse(url)
- const path = u.path.replace(u.search, '')
- return path.endsWith('.pdf')
- },
-
- prehandle: async (url, env) => {
- return { contentType: 'application/pdf' };
- }
-}
diff --git a/packages/content-fetch/scrapingBee-handler.js b/packages/content-fetch/scrapingBee-handler.js
deleted file mode 100644
index 6563fca44..000000000
--- a/packages/content-fetch/scrapingBee-handler.js
+++ /dev/null
@@ -1,44 +0,0 @@
-/* eslint-disable no-undef */
-/* eslint-disable no-empty */
-/* eslint-disable @typescript-eslint/explicit-function-return-type */
-/* eslint-disable @typescript-eslint/no-var-requires */
-/* eslint-disable @typescript-eslint/no-require-imports */
-require('dotenv').config();
-const axios = require('axios');
-const { parseHTML } = require('linkedom');
-
-const os = require('os');
-
-exports.scrapingBeeHandler = {
-
- shouldPrehandle: (url, env) => {
- const u = new URL(url);
- const hostnames = [
- 'nytimes.com',
- 'news.google.com',
- ]
-
- return hostnames.some((h) => u.hostname.endsWith(h))
- },
-
- prehandle: async (url, env) => {
- console.log('prehandling url with scrapingbee', url)
-
- try {
- const response = await axios.get('https://app.scrapingbee.com/api/v1', {
- params: {
- 'api_key': process.env.SCRAPINGBEE_API_KEY,
- 'url': url,
- 'return_page_source': true,
- 'block_ads': true,
- 'block_resources': false,
- }
- })
- const dom = parseHTML(response.data).document;
- return { title: dom.title, content: response.data, url: url }
- } catch (error) {
- console.error('error prehandling url w/scrapingbee', error)
- throw error
- }
- }
-}
diff --git a/packages/content-fetch/t-dot-co-handler.js b/packages/content-fetch/t-dot-co-handler.js
deleted file mode 100644
index 170f97fb7..000000000
--- a/packages/content-fetch/t-dot-co-handler.js
+++ /dev/null
@@ -1,31 +0,0 @@
-/* eslint-disable no-undef */
-/* eslint-disable no-empty */
-/* eslint-disable @typescript-eslint/explicit-function-return-type */
-/* eslint-disable @typescript-eslint/no-var-requires */
-/* eslint-disable @typescript-eslint/no-require-imports */
-require('dotenv').config();
-const axios = require('axios');
-const Url = require('url');
-
-
-exports.tDotCoHandler = {
-
- shouldResolve: function (url, env) {
- const T_DOT_CO_URL_MATCH = /^https:\/\/(?:www\.)?t\.co\/.*$/;
- return T_DOT_CO_URL_MATCH.test(url);
- },
-
- resolve: async function(url, env) {
- return await axios.get(url, { maxRedirects: 0, validateStatus: null })
- .then(res => {
- return Url.parse(res.headers.location).href;
- }).catch((err) => {
- console.log('err with t.co url', err);
- return undefined;
- });
- },
-
- shouldPrehandle: (url, env) => {
- return false
- },
-}
diff --git a/packages/content-fetch/test/apple-news-handler.test.js b/packages/content-fetch/test/apple-news-handler.test.js
deleted file mode 100644
index 4531d720e..000000000
--- a/packages/content-fetch/test/apple-news-handler.test.js
+++ /dev/null
@@ -1,9 +0,0 @@
-const { expect } = require('chai')
-const { appleNewsHandler } = require('../apple-news-handler')
-
-describe('open a simple web page', () => {
- it('should return a response', async () => {
- const response = await appleNewsHandler.prehandle('https://apple.news/AxjzaZaPvSn23b67LhXI5EQ')
- console.log('response', response)
- })
-})
diff --git a/packages/content-fetch/test/youtube-handler.test.js b/packages/content-fetch/test/youtube-handler.test.js
deleted file mode 100644
index d34643773..000000000
--- a/packages/content-fetch/test/youtube-handler.test.js
+++ /dev/null
@@ -1,12 +0,0 @@
-const { expect } = require('chai')
-const { getYoutubeVideoId } = require('../youtube-handler')
-
-describe('getYoutubeVideoId', () => {
- it('should parse video id out of a URL', async () => {
- expect('BnSUk0je6oo').to.eq(getYoutubeVideoId('https://www.youtube.com/watch?v=BnSUk0je6oo&t=269s'));
- expect('vFD2gu007dc').to.eq(getYoutubeVideoId('https://www.youtube.com/watch?v=vFD2gu007dc&list=RDvFD2gu007dc&start_radio=1'));
- expect('vFD2gu007dc').to.eq(getYoutubeVideoId('https://youtu.be/vFD2gu007dc'));
- expect('BMFVCnbRaV4').to.eq(getYoutubeVideoId('https://youtube.com/watch?v=BMFVCnbRaV4&feature=share'));
- expect('cg9b4RC87LI').to.eq(getYoutubeVideoId('https://youtu.be/cg9b4RC87LI?t=116'));
- })
-})
diff --git a/packages/content-fetch/twitter-handler.js b/packages/content-fetch/twitter-handler.js
deleted file mode 100644
index 7ae93072c..000000000
--- a/packages/content-fetch/twitter-handler.js
+++ /dev/null
@@ -1,172 +0,0 @@
-/* eslint-disable no-undef */
-/* eslint-disable no-empty */
-/* eslint-disable @typescript-eslint/explicit-function-return-type */
-/* eslint-disable @typescript-eslint/no-var-requires */
-/* eslint-disable @typescript-eslint/no-require-imports */
-require('dotenv').config();
-const axios = require('axios');
-const { DateTime } = require('luxon');
-const _ = require('underscore');
-
-const TWITTER_BEARER_TOKEN = process.env.TWITTER_BEARER_TOKEN;
-const TWITTER_URL_MATCH = /twitter\.com\/(?:#!\/)?(\w+)\/status(?:es)?\/(\d+)(?:\/.*)?/
-
-const embeddedTweet = async (url) => {
-
- const BASE_ENDPOINT = 'https://publish.twitter.com/oembed'
-
- const apiUrl = new URL(BASE_ENDPOINT)
- apiUrl.searchParams.append('url', url);
- apiUrl.searchParams.append('omit_script', true);
- apiUrl.searchParams.append('dnt', true);
-
- return await axios.get(apiUrl.toString(), {
- headers: {
- Authorization: `Bearer ${TWITTER_BEARER_TOKEN}`,
- redirect: "follow",
- },
- });
-};
-
-const getTweetFields = () => {
- const TWEET_FIELDS =
- "&tweet.fields=attachments,author_id,conversation_id,created_at," +
- "entities,geo,in_reply_to_user_id,lang,possibly_sensitive,public_metrics,referenced_tweets," +
- "source,withheld";
- const EXPANSIONS = "&expansions=author_id,attachments.media_keys";
- const USER_FIELDS =
- "&user.fields=created_at,description,entities,location,pinned_tweet_id,profile_image_url,protected,public_metrics,url,verified,withheld";
- const MEDIA_FIELDS =
- "&media.fields=duration_ms,height,preview_image_url,url,media_key,public_metrics,width";
-
- return `${TWEET_FIELDS}${EXPANSIONS}${USER_FIELDS}${MEDIA_FIELDS}`;
-}
-
-const getTweetById = async (id) => {
- const BASE_ENDPOINT = "https://api.twitter.com/2/tweets/";
- const apiUrl = new URL(BASE_ENDPOINT + id + '?' + getTweetFields())
-
- return await axios.get(apiUrl.toString(), {
- headers: {
- Authorization: `Bearer ${TWITTER_BEARER_TOKEN}`,
- redirect: "follow",
- },
- });
-};
-
-const getUserByUsername = async (username) => {
- const BASE_ENDPOINT = "https://api.twitter.com/2/users/by/username/";
-
- const apiUrl = new URL(BASE_ENDPOINT + username)
- apiUrl.searchParams.append('user.fields', 'profile_image_url');
-
- return await axios.get(apiUrl.toString(), {
- headers: {
- Authorization: `Bearer ${TWITTER_BEARER_TOKEN}`,
- redirect: "follow",
- },
- });
-};
-
-const titleForTweet = (tweet) => {
- return `${tweet.data.author_name} on Twitter`
-};
-
-const titleForAuthor = (author) => {
- return `${author.name} on Twitter`
-};
-
-const usernameFromStatusUrl = (url) => {
- const match = url.toString().match(TWITTER_URL_MATCH)
- return match[1]
-};
-
-const tweetIdFromStatusUrl = (url) => {
- const match = url.toString().match(TWITTER_URL_MATCH)
- return match[2]
-};
-
-const formatTimestamp = (timestamp) => {
- return DateTime.fromJSDate(new Date(timestamp)).toLocaleString(DateTime.DATETIME_FULL);
-};
-
-exports.twitterHandler = {
-
- shouldPrehandle: (url, env) => {
- return TWITTER_BEARER_TOKEN && TWITTER_URL_MATCH.test(url.toString())
- },
-
- // version of the handler that uses the oembed API
- // This isn't great as it doesn't work well with our
- // readability API. But could potentially give a more consistent
- // look to the tweets
- // prehandle: async (url, env) => {
- // const oeTweet = await embeddedTweet(url)
- // const dom = new JSDOM(oeTweet.data.html);
- // const bq = dom.window.document.querySelector('blockquote')
- // console.log('blockquote:', bq);
-
- // const title = titleForTweet(oeTweet)
- // return { title, content: '' + bq.innerHTML + '
', url: oeTweet.data.url };
- // }
-
- prehandle: async (url, env) => {
- console.log('prehandling twitter url', url)
-
- const tweetId = tweetIdFromStatusUrl(url)
- const tweetData = (await getTweetById(tweetId)).data;
- const authorId = tweetData.data.author_id;
- const author = tweetData.includes.users.filter(u => u.id = authorId)[0];
- // escape html entities in title
- const title = _.escape(titleForAuthor(author))
- const authorImage = author.profile_image_url.replace('_normal', '_400x400')
-
- let text = tweetData.data.text;
- if (tweetData.data.entities && tweetData.data.entities.urls) {
- for (let urlObj of tweetData.data.entities.urls) {
- text = text.replace(
- urlObj.url,
- `${urlObj.display_url}`
- );
- }
- }
-
- const front = `
-
-
${text}
- `
-
- var includesHtml = '';
- if (tweetData.includes.media) {
- includesHtml = tweetData.includes.media.map(m => {
- const linkUrl = m.type == 'photo' ? m.url : url;
- const previewUrl = m.type == 'photo' ? m.url : m.preview_image_url;
- const mediaOpen = `
-
-
-
- `
- return mediaOpen
- }).join('\n');
- }
-
- const back = `
- —
${author.username} ${author.name}
${formatTimestamp(tweetData.data.created_at)}
-
- `
- const content = `
-
-
-
-
-
-
-
- ${front}
- ${includesHtml}
- ${back}
- `
-
- return { content, url, title };
- }
-}
diff --git a/packages/content-fetch/youtube-handler.js b/packages/content-fetch/youtube-handler.js
deleted file mode 100644
index e1866428a..000000000
--- a/packages/content-fetch/youtube-handler.js
+++ /dev/null
@@ -1,68 +0,0 @@
-/* eslint-disable no-undef */
-/* eslint-disable no-empty */
-/* eslint-disable @typescript-eslint/explicit-function-return-type */
-/* eslint-disable @typescript-eslint/no-var-requires */
-/* eslint-disable @typescript-eslint/no-require-imports */
-require('dotenv').config();
-const axios = require('axios');
-const _ = require('underscore');
-
-const YOUTUBE_URL_MATCH =
- /^((?:https?:)?\/\/)?((?:www|m)\.)?((?:youtube\.com|youtu.be))(\/(?:[\w-]+\?v=|embed\/|v\/)?)([\w-]+)(\S+)?$/
-
-function getYoutubeVideoId(url) {
- const u = new URL(url);
- const videoId = u.searchParams.get('v');
- if (!videoId) {
- const match = url.toString().match(YOUTUBE_URL_MATCH)
- if (match === null || match.length < 6 || !match[5]) {
- return undefined
- }
- return match[5]
- }
- return videoId
-}
-exports.getYoutubeVideoId = getYoutubeVideoId
-
-exports.youtubeHandler = {
- shouldPrehandle: (url, env) => {
- return YOUTUBE_URL_MATCH.test(url.toString())
- },
-
- prehandle: async (url, env) => {
- const videoId = getYoutubeVideoId(url)
- if (!videoId) {
- return {}
- }
-
- const oembedUrl = `https://www.youtube.com/oembed?format=json&url=` + encodeURIComponent(`https://www.youtube.com/watch?v=${videoId}`)
- const oembed = (await axios.get(oembedUrl.toString())).data;
- // escape html entities in title
- const title = _.escape(oembed.title);
- const ratio = oembed.width / oembed.height;
- const thumbnail = oembed.thumbnail_url;
- const height = 350;
- const width = height * ratio;
- const authorName = _.escape(oembed.author_name);
-
- const content = `
-
- ${title}
-
-
-
-
-
-
-
-
- ${title}
- By ${authorName}
-
- `
-
- console.log('got video id', videoId)
-
- return { content, title: 'Youtube Content' };
- }
-}
diff --git a/packages/content-handler/src/apple-news-handler.ts b/packages/content-handler/src/apple-news-handler.ts
index 1958b3b8c..a4239565c 100644
--- a/packages/content-handler/src/apple-news-handler.ts
+++ b/packages/content-handler/src/apple-news-handler.ts
@@ -1,8 +1,13 @@
-import { ContentHandler, PreHandleResult } from './index'
import axios from 'axios'
import { parseHTML } from 'linkedom'
+import { ContentHandler, PreHandleResult } from './content-handler'
export class AppleNewsHandler extends ContentHandler {
+ constructor() {
+ super()
+ this.name = 'Apple News'
+ }
+
shouldPreHandle(url: string, dom?: Document): boolean {
const u = new URL(url)
return u.hostname === 'apple.news'
diff --git a/packages/content-handler/src/bloomberg-handler.ts b/packages/content-handler/src/bloomberg-handler.ts
index ac4fb23ed..95670630e 100644
--- a/packages/content-handler/src/bloomberg-handler.ts
+++ b/packages/content-handler/src/bloomberg-handler.ts
@@ -1,8 +1,13 @@
-import { ContentHandler, PreHandleResult } from './index'
import axios from 'axios'
import { parseHTML } from 'linkedom'
+import { ContentHandler, PreHandleResult } from './content-handler'
+
+export class BloombergHandler extends ContentHandler {
+ constructor() {
+ super()
+ this.name = 'Bloomberg'
+ }
-class BloombergHandler extends ContentHandler {
shouldPreHandle(url: string, dom?: Document): boolean {
const BLOOMBERG_URL_MATCH =
/https?:\/\/(www\.)?bloomberg.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&/=]*)/
diff --git a/packages/content-handler/src/content-handler.ts b/packages/content-handler/src/content-handler.ts
new file mode 100644
index 000000000..c49987368
--- /dev/null
+++ b/packages/content-handler/src/content-handler.ts
@@ -0,0 +1,126 @@
+import addressparser from 'addressparser'
+import rfc2047 from 'rfc2047'
+import { v4 as uuidv4 } from 'uuid'
+
+interface Unsubscribe {
+ mailTo?: string
+ httpUrl?: string
+}
+
+interface NewsletterMessage {
+ email: string
+ content: string
+ url: string
+ title: string
+ author: string
+ unsubMailTo?: string
+ unsubHttpUrl?: string
+}
+
+export interface PreHandleResult {
+ url?: string
+ title?: string
+ content?: string
+ contentType?: string
+ dom?: Document
+}
+
+export abstract class ContentHandler {
+  protected senderRegex: RegExp
+  protected urlRegex: RegExp
+  protected defaultUrl: string
+  public name: string
+
+  // Placeholder defaults; presumably overridden by newsletter-specific subclasses.
+  protected constructor() {
+    this.senderRegex = new RegExp(/NEWSLETTER_SENDER_REGEX/)
+    this.urlRegex = new RegExp(/NEWSLETTER_URL_REGEX/)
+    this.defaultUrl = 'NEWSLETTER_DEFAULT_URL'
+    this.name = 'Handler name'
+  }
+
+  // Whether resolve() should be run for this URL; the base handler never asks for it.
+  shouldResolve(url: string): boolean {
+    return false
+  }
+
+  // Maps a URL to the one that should be processed; the base handler returns it as-is.
+  async resolve(url: string): Promise<string> {
+    return Promise.resolve(url)
+  }
+
+  // Whether preHandle() should be run for this URL; the base handler never asks for it.
+  shouldPreHandle(url: string, dom?: Document): boolean {
+    return false
+  }
+
+  // Pre-processes a URL; the base handler passes URL and document through untouched.
+  async preHandle(url: string, document?: Document): Promise<PreHandleResult> {
+    return Promise.resolve({ url, dom: document })
+  }
+
+  // True when `from` matches senderRegex and at least one of the two headers is non-empty.
+  isNewsletter(postHeader: string, from: string, unSubHeader: string): boolean {
+    const re = new RegExp(this.senderRegex)
+    return re.test(from) && (!!postHeader || !!unSubHeader)
+  }
+
+  // Returns capture group 1 of urlRegex matched against the html, or undefined.
+  parseNewsletterUrl(_postHeader: string, html: string): string | undefined {
+    return html.match(this.urlRegex)?.[1]
+  }
+
+  // Extracts the author name from a From header value,
+  // e.g. 'Mike Allen <mike@example.com>'; returns the name of the first
+  // parsed entry, or `from` unchanged when addressparser yields no entry.
+  parseAuthor(from: string): string {
+    const parsed = addressparser(from)
+    return parsed.length > 0 ? parsed[0].name : from
+  }
+
+  // Parses a List-Unsubscribe header value,
+  // e.g. '<https://example.com/unsub>, <mailto:unsub@example.com>';
+  // either field is undefined when the header has no entry of that kind.
+  parseUnsubscribe(unSubHeader: string): Unsubscribe {
+    const decoded = rfc2047.decode(unSubHeader)
+    return {
+      mailTo: decoded.match(/<mailto:([^>]*)>/)?.[1],
+      httpUrl: decoded.match(/<(https?:\/\/[^>]*)>/)?.[1],
+    }
+  }
+
+  // Builds the NewsletterMessage; throws Error when email, html, title or from is empty.
+  handleNewsletter(
+    email: string,
+    html: string,
+    postHeader: string,
+    title: string,
+    from: string,
+    unSubHeader: string
+  ): NewsletterMessage {
+    console.log('handleNewsletter', email, postHeader, title, from)
+
+    if (!email || !html || !title || !from) {
+      console.log('invalid newsletter email')
+      throw new Error('invalid newsletter email')
+    }
+
+    // fallback to default url if newsletter url does not exist
+    // assign a random uuid to the default url to avoid duplicate url
+    const url =
+      this.parseNewsletterUrl(postHeader, html) ||
+      `${this.defaultUrl}?source=newsletters&id=${uuidv4()}`
+    const author = this.parseAuthor(from)
+    const unsubscribe = this.parseUnsubscribe(unSubHeader)
+
+    return {
+      email,
+      content: html,
+      url,
+      title,
+      author,
+      unsubMailTo: unsubscribe.mailTo || '',
+      unsubHttpUrl: unsubscribe.httpUrl || '',
+    }
+  }
+}
diff --git a/packages/content-handler/src/derstandard-handler.ts b/packages/content-handler/src/derstandard-handler.ts
index 1a1f50778..2ac01ac86 100644
--- a/packages/content-handler/src/derstandard-handler.ts
+++ b/packages/content-handler/src/derstandard-handler.ts
@@ -1,8 +1,13 @@
-import { ContentHandler, PreHandleResult } from './index'
+import { ContentHandler, PreHandleResult } from './content-handler'
import axios from 'axios'
import { parseHTML } from 'linkedom'
-class DerstandardHandler extends ContentHandler {
+export class DerstandardHandler extends ContentHandler {
+ constructor() {
+ super()
+ this.name = 'Derstandard'
+ }
+
shouldPreHandle(url: string, dom?: Document): boolean {
const u = new URL(url)
return u.hostname === 'www.derstandard.at'
diff --git a/packages/content-handler/src/image-handler.ts b/packages/content-handler/src/image-handler.ts
index cb5e462c7..652756c51 100644
--- a/packages/content-handler/src/image-handler.ts
+++ b/packages/content-handler/src/image-handler.ts
@@ -1,6 +1,11 @@
-import { ContentHandler, PreHandleResult } from './index'
+import { ContentHandler, PreHandleResult } from './content-handler'
+
+export class ImageHandler extends ContentHandler {
+ constructor() {
+ super()
+ this.name = 'Image'
+ }
-class ImageHandler extends ContentHandler {
shouldPreHandle(url: string, dom?: Document): boolean {
const IMAGE_URL_PATTERN = /(https?:\/\/.*\.(?:jpg|jpeg|png|webp))/i
return IMAGE_URL_PATTERN.test(url.toString())
diff --git a/packages/content-handler/src/index.ts b/packages/content-handler/src/index.ts
index bfb0214e4..3275a3c89 100644
--- a/packages/content-handler/src/index.ts
+++ b/packages/content-handler/src/index.ts
@@ -1,111 +1,76 @@
-import addressparser from 'addressparser'
-import { v4 as uuidv4 } from 'uuid'
-import rfc2047 from 'rfc2047'
+import { AppleNewsHandler } from './apple-news-handler'
+import { BloombergHandler } from './bloomberg-handler'
+import { DerstandardHandler } from './derstandard-handler'
+import { ImageHandler } from './image-handler'
+import { MediumHandler } from './medium-handler'
+import { PdfHandler } from './pdf-handler'
+import { ScrapingBeeHandler } from './scrapingBee-handler'
+import { TDotCoHandler } from './t-dot-co-handler'
+import { TwitterHandler } from './twitter-handler'
+import { YoutubeHandler } from './youtube-handler'
+import { ContentHandler, PreHandleResult } from './content-handler'
-interface Unsubscribe {
- mailTo?: string
- httpUrl?: string
+/**
+ * Checks that a URL string is something we are willing to fetch.
+ *
+ * Rejection is signalled by throwing; on success the function returns `true`
+ * so that callers may also use it directly inside a condition.
+ *
+ * @param url - absolute URL string to validate
+ * @returns `true` when every check passes
+ * @throws TypeError when `url` cannot be parsed by `new URL`
+ * @throws Error when the protocol is not http/https, or the host is a
+ *   localhost/loopback name or a private (RFC 1918) IPv4 address
+ */
+const validateUrlString = (url: string): boolean => {
+  const u = new URL(url)
+  // Make sure the URL is http or https
+  if (u.protocol !== 'http:' && u.protocol !== 'https:') {
+    throw new Error('Invalid URL protocol check failed')
+  }
+  // Make sure the domain is not localhost (by name, wildcard address, or
+  // an IPv4/IPv6 loopback literal)
+  if (
+    u.hostname === 'localhost' ||
+    u.hostname === '0.0.0.0' ||
+    u.hostname === '[::1]' ||
+    /^127\./.test(u.hostname)
+  ) {
+    throw new Error('Invalid URL is localhost')
+  }
+  // Make sure the domain is not a private IP. The 172 block is
+  // 172.16.0.0/12, i.e. second octet 16 through 31, not just 172.16.x.x.
+  if (/^(10|172\.(1[6-9]|2\d|3[01])|192\.168)\./.test(u.hostname)) {
+    throw new Error('Invalid URL is private ip')
+  }
+  // Callers test the result for truthiness; without an explicit `true` a
+  // valid URL would be indistinguishable from a rejected one.
+  return true
 }
-interface NewsletterMessage {
- email: string
- content: string
- url: string
- title: string
- author: string
- unsubMailTo?: string
- unsubHttpUrl?: string
-}
+// Registry of every content handler instance. preHandleContent iterates this
+// array in order and stops at the first handler that matches, so the order
+// of the entries decides which handler wins when more than one would match.
+const contentHandlers: ContentHandler[] = [
+ new AppleNewsHandler(),
+ new BloombergHandler(),
+ new DerstandardHandler(),
+ new ImageHandler(),
+ new MediumHandler(),
+ new PdfHandler(),
+ new ScrapingBeeHandler(),
+ new TDotCoHandler(),
+ new TwitterHandler(),
+ new YoutubeHandler(),
+]
-export interface PreHandleResult {
- url?: string
- title?: string
- content?: string
- contentType?: string
+/**
+ * Gives the registered content handlers a chance to act on a URL before the
+ * page is fetched.
+ *
+ * Two passes are made over `contentHandlers`:
+ *  1. Resolve: the first handler whose `shouldResolve(url)` is true is asked
+ *     to `resolve` the URL. A resolved URL replaces `url` only if it passes
+ *     `validateUrlString`. Any error is logged and the original URL is kept.
+ *     No further handlers are tried for resolution.
+ *  2. Pre-handle: the first handler whose `shouldPreHandle(url, dom)` is true
+ *     handles the request and its result is returned.
+ *
+ * @param url - URL of the content about to be fetched
+ * @param dom - optional document, passed through to the handlers unchanged
+ * @returns the matching handler's pre-handle result, or `undefined` when no
+ *   handler wants the URL
+ */
+export const preHandleContent = async (
+  url: string,
   dom?: Document
+): Promise<PreHandleResult | undefined> => {
+  // Before we run the regular handlers we check to see if we need to
+  // pre-resolve the URL. TODO: This should probably happen recursively,
+  // so URLs can be pre-resolved, handled, pre-resolved, handled, etc.
+  for (const handler of contentHandlers) {
+    if (handler.shouldResolve(url)) {
+      try {
+        const resolvedUrl = await handler.resolve(url)
+        if (resolvedUrl) {
+          // validateUrlString rejects by throwing, so its return value is
+          // deliberately not used as the condition: reaching the next line
+          // means the resolved URL is acceptable.
+          validateUrlString(resolvedUrl)
+          url = resolvedUrl
+        }
+      } catch (err) {
+        console.log('error resolving url with handler', handler.name, err)
+      }
+      // Only the first handler that claims the URL gets to resolve it.
+      break
+    }
+  }
+  // Before we fetch the page we check the handlers, to see if they want
+  // to perform a prefetch action that can modify our requests.
+  // enumerate the handlers and see if any of them want to handle the request
+  for (const handler of contentHandlers) {
+    if (handler.shouldPreHandle(url, dom)) {
+      console.log('preHandleContent', handler.name, url)
+      return handler.preHandle(url, dom)
+    }
+  }
+  return undefined
 }
-export class ContentHandler {
- protected senderRegex = /NEWSLETTER_SENDER_REGEX/
- protected urlRegex = /NEWSLETTER_URL_REGEX/
- protected defaultUrl = 'NEWSLETTER_DEFAULT_URL'
- protected name = ''
-
- shouldPreHandle(url: string, dom?: Document): boolean {
- return false
- }
-
- async preHandle(url: string, document?: Document): Promise {
- return Promise.resolve({ url, dom: document })
- }
-
- isNewsletter(postHeader: string, from: string, unSubHeader: string): boolean {
- // Axios newsletter is from
- const re = new RegExp(this.senderRegex)
- return re.test(from) && (!!postHeader || !!unSubHeader)
- }
-
- parseNewsletterUrl(_postHeader: string, html: string): string | undefined {
- // get newsletter url from html
- const matches = html.match(this.urlRegex)
- if (matches) {
- return matches[1]
- }
- return undefined
- }
-
- parseAuthor(from: string): string {
- // get author name from email
- // e.g. 'Jackson Harper from Omnivore App '
- // or 'Mike Allen '
- const parsed = addressparser(from)
- if (parsed.length > 0) {
- return parsed[0].name
- }
- return from
- }
-
- parseUnsubscribe(unSubHeader: string): Unsubscribe {
- // parse list-unsubscribe header
- // e.g. List-Unsubscribe: ,
- const decoded = rfc2047.decode(unSubHeader)
- return {
- mailTo: decoded.match(/<(https?:\/\/[^>]*)>/)?.[1],
- httpUrl: decoded.match(/]*)>/)?.[1],
- }
- }
-
- handleNewsletter(
- email: string,
- html: string,
- postHeader: string,
- title: string,
- from: string,
- unSubHeader: string
- ): NewsletterMessage {
- console.log('handleNewsletter', email, postHeader, title, from)
-
- if (!email || !html || !title || !from) {
- console.log('invalid newsletter email')
- throw new Error('invalid newsletter email')
- }
-
- // fallback to default url if newsletter url does not exist
- // assign a random uuid to the default url to avoid duplicate url
- const url =
- this.parseNewsletterUrl(postHeader, html) ||
- `${this.defaultUrl}?source=newsletters&id=${uuidv4()}`
- const author = this.parseAuthor(from)
- const unsubscribe = this.parseUnsubscribe(unSubHeader)
-
- return {
- email,
- content: html,
- url,
- title,
- author,
- unsubMailTo: unsubscribe.mailTo || '',
- unsubHttpUrl: unsubscribe.httpUrl || '',
- }
- }
+// Also publish preHandleContent through CommonJS `module.exports`, on top of
+// the `export const` on its declaration.
+// NOTE(review): presumably kept for plain `require()` consumers — confirm it
+// is still needed, since it duplicates the named export.
+module.exports = {
+ preHandleContent,
}
diff --git a/packages/content-handler/src/medium-handler.ts b/packages/content-handler/src/medium-handler.ts
index 0b9d2fcb5..8e14cebfe 100644
--- a/packages/content-handler/src/medium-handler.ts
+++ b/packages/content-handler/src/medium-handler.ts
@@ -1,6 +1,11 @@
-import { ContentHandler, PreHandleResult } from './index'
+import { ContentHandler, PreHandleResult } from './content-handler'
+
+export class MediumHandler extends ContentHandler {
+ constructor() {
+ super()
+ this.name = 'Medium'
+ }
-class MediumHandler extends ContentHandler {
shouldPreHandle(url: string, dom?: Document): boolean {
const u = new URL(url)
return u.hostname.endsWith('medium.com')
diff --git a/packages/content-handler/src/pdf-handler.ts b/packages/content-handler/src/pdf-handler.ts
index 54df72bc2..245f9fc1b 100644
--- a/packages/content-handler/src/pdf-handler.ts
+++ b/packages/content-handler/src/pdf-handler.ts
@@ -1,6 +1,11 @@
-import { ContentHandler, PreHandleResult } from './index'
+import { ContentHandler, PreHandleResult } from './content-handler'
+
+export class PdfHandler extends ContentHandler {
+ constructor() {
+ super()
+ this.name = 'PDF'
+ }
-class PdfHandler extends ContentHandler {
shouldPreHandle(url: string, dom?: Document): boolean {
const u = new URL(url)
const path = u.pathname.replace(u.search, '')
diff --git a/packages/content-handler/src/scrapingBee-handler.ts b/packages/content-handler/src/scrapingBee-handler.ts
index 0b1b5984b..792c5a75c 100644
--- a/packages/content-handler/src/scrapingBee-handler.ts
+++ b/packages/content-handler/src/scrapingBee-handler.ts
@@ -1,8 +1,13 @@
-import { ContentHandler, PreHandleResult } from './index'
+import { ContentHandler, PreHandleResult } from './content-handler'
import axios from 'axios'
import { parseHTML } from 'linkedom'
-class ScrapingBeeHandler extends ContentHandler {
+export class ScrapingBeeHandler extends ContentHandler {
+ constructor() {
+ super()
+ this.name = 'ScrapingBee'
+ }
+
shouldPreHandle(url: string, dom?: Document): boolean {
const u = new URL(url)
const hostnames = ['nytimes.com', 'news.google.com']
diff --git a/packages/content-handler/src/t-dot-co-handler.ts b/packages/content-handler/src/t-dot-co-handler.ts
index 41c90e4a5..b4b461c2f 100644
--- a/packages/content-handler/src/t-dot-co-handler.ts
+++ b/packages/content-handler/src/t-dot-co-handler.ts
@@ -1,7 +1,12 @@
-import { ContentHandler } from './index'
+import { ContentHandler } from './content-handler'
import axios from 'axios'
-class TDotCoHandler extends ContentHandler {
+export class TDotCoHandler extends ContentHandler {
+ constructor() {
+ super()
+ this.name = 't.co'
+ }
+
shouldResolve(url: string): boolean {
const T_DOT_CO_URL_MATCH = /^https:\/\/(?:www\.)?t\.co\/.*$/
return T_DOT_CO_URL_MATCH.test(url)
diff --git a/packages/content-handler/src/twitter-handler.ts b/packages/content-handler/src/twitter-handler.ts
index 10d9ead03..b7345efe5 100644
--- a/packages/content-handler/src/twitter-handler.ts
+++ b/packages/content-handler/src/twitter-handler.ts
@@ -1,4 +1,4 @@
-import { ContentHandler, PreHandleResult } from './index'
+import { ContentHandler, PreHandleResult } from './content-handler'
import axios from 'axios'
import { DateTime } from 'luxon'
import _ from 'underscore'
@@ -52,7 +52,12 @@ const formatTimestamp = (timestamp: string) => {
)
}
-class TwitterHandler extends ContentHandler {
+export class TwitterHandler extends ContentHandler {
+ constructor() {
+ super()
+ this.name = 'Twitter'
+ }
+
shouldPreHandle(url: string, dom?: Document): boolean {
return !!TWITTER_BEARER_TOKEN && TWITTER_URL_MATCH.test(url.toString())
}
diff --git a/packages/content-handler/src/youtube-handler.ts b/packages/content-handler/src/youtube-handler.ts
index 0cbe0df57..898810fe9 100644
--- a/packages/content-handler/src/youtube-handler.ts
+++ b/packages/content-handler/src/youtube-handler.ts
@@ -1,4 +1,4 @@
-import { ContentHandler, PreHandleResult } from './index'
+import { ContentHandler, PreHandleResult } from './content-handler'
import axios from 'axios'
import _ from 'underscore'
@@ -18,7 +18,12 @@ export const getYoutubeVideoId = (url: string) => {
return videoId
}
-class YoutubeHandler extends ContentHandler {
+export class YoutubeHandler extends ContentHandler {
+ constructor() {
+ super()
+ this.name = 'Youtube'
+ }
+
shouldPreHandle(url: string, dom?: Document): boolean {
return YOUTUBE_URL_MATCH.test(url.toString())
}
diff --git a/yarn.lock b/yarn.lock
index d1c9a65dd..3da81da9b 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -10579,19 +10579,6 @@ chai@^4.3.4:
pathval "^1.1.1"
type-detect "^4.0.5"
-chai@^4.3.6:
- version "4.3.6"
- resolved "https://registry.yarnpkg.com/chai/-/chai-4.3.6.tgz#ffe4ba2d9fa9d6680cc0b370adae709ec9011e9c"
- integrity sha512-bbcp3YfHCUzMOvKqsztczerVgBKSsEijCySNlHHbX3VG1nskvqjz5Rfso1gGwD6w6oOV3eI60pKuMOV5MV7p3Q==
- dependencies:
- assertion-error "^1.1.0"
- check-error "^1.0.2"
- deep-eql "^3.0.1"
- get-func-name "^2.0.0"
- loupe "^2.3.1"
- pathval "^1.1.1"
- type-detect "^4.0.5"
-
chalk@^1.0.0, chalk@^1.1.3:
version "1.1.3"
resolved "https://registry.yarnpkg.com/chalk/-/chalk-1.1.3.tgz#a8115c55e4a702fe4d150abd3872822a7e09fc98"
@@ -18078,13 +18065,6 @@ loose-envify@^1.0.0, loose-envify@^1.1.0, loose-envify@^1.4.0:
dependencies:
js-tokens "^3.0.0 || ^4.0.0"
-loupe@^2.3.1:
- version "2.3.4"
- resolved "https://registry.yarnpkg.com/loupe/-/loupe-2.3.4.tgz#7e0b9bffc76f148f9be769cb1321d3dcf3cb25f3"
- integrity sha512-OvKfgCC2Ndby6aSTREl5aCCPTNIzlDfQZvZxNUrBrihDhL3xcrYegTblhmEiCrg2kKQz4XsFIaemE5BF4ybSaQ==
- dependencies:
- get-func-name "^2.0.0"
-
lower-case-first@^1.0.0:
version "1.0.2"
resolved "https://registry.yarnpkg.com/lower-case-first/-/lower-case-first-1.0.2.tgz#e5da7c26f29a7073be02d52bac9980e5922adfa1"