Import content-handler in content-fetch

This commit is contained in:
Hongbo Wu
2022-09-29 18:05:35 +08:00
parent 8fb398eae4
commit 8c61832c77
27 changed files with 279 additions and 751 deletions

View File

@ -1,36 +0,0 @@
/* eslint-disable no-undef */
/* eslint-disable no-empty */
/* eslint-disable @typescript-eslint/explicit-function-return-type */
/* eslint-disable @typescript-eslint/no-var-requires */
/* eslint-disable @typescript-eslint/no-require-imports */
require('dotenv').config();
const Url = require('url');
const axios = require('axios');
const { promisify } = require('util');
const { DateTime } = require('luxon');
const os = require('os');
const { Cipher } = require('crypto');
const { parseHTML } = require('linkedom');
exports.appleNewsHandler = {
shouldPrehandle: (url, env) => {
const u = new URL(url);
if (u.hostname === 'apple.news') {
return true;
}
return false
},
prehandle: async (url, env) => {
const MOBILE_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
const response = await axios.get(url, { headers: { 'User-Agent': MOBILE_USER_AGENT } } );
const data = response.data;
const dom = parseHTML(data).document;
// make sure its a valid URL by wrapping in new URL
const u = new URL(dom.querySelector('span.click-here').parentNode.href);
return { url: u.href };
}
}

View File

@ -1,39 +0,0 @@
/* eslint-disable no-undef */
/* eslint-disable no-empty */
/* eslint-disable @typescript-eslint/explicit-function-return-type */
/* eslint-disable @typescript-eslint/no-var-requires */
/* eslint-disable @typescript-eslint/no-require-imports */
require('dotenv').config();
const axios = require('axios');
const os = require('os');
const { parseHTML } = require('linkedom');
exports.bloombergHandler = {
shouldPrehandle: (url, env) => {
const BLOOMBERG_URL_MATCH =
/https?:\/\/(www\.)?bloomberg.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)/
return BLOOMBERG_URL_MATCH.test(url.toString())
},
prehandle: async (url, env) => {
console.log('prehandling bloomberg url', url)
try {
const response = await axios.get('https://app.scrapingbee.com/api/v1', {
params: {
'api_key': process.env.SCRAPINGBEE_API_KEY,
'url': url,
'return_page_source': true,
'block_ads': true,
'block_resources': false,
}
})
const dom = parseHTML(response.data).document;
return { title: dom.title, content: dom.querySelector('body').innerHTML, url: url }
} catch (error) {
console.error('error prehandling bloomberg url', error)
throw error
}
}
}

View File

@ -1,35 +0,0 @@
/* eslint-disable no-undef */
/* eslint-disable no-empty */
/* eslint-disable @typescript-eslint/explicit-function-return-type */
/* eslint-disable @typescript-eslint/no-var-requires */
/* eslint-disable @typescript-eslint/no-require-imports */
require('dotenv').config();
const axios = require('axios');
const { parseHTML } = require('linkedom');
exports.derstandardHandler = {
shouldPrehandle: (url, env) => {
const u = new URL(url);
return u.hostname === 'www.derstandard.at';
},
prehandle: async (url, env) => {
const response = await axios.get(url, {
// set cookie to give consent to get the article
headers: {
'cookie': `DSGVO_ZUSAGE_V1=true; consentUUID=2bacb9c1-1e80-4be0-9f7b-ee987cf4e7b0_6`
},
});
const content = response.data;
var title = undefined;
const dom = parseHTML(content).document;
const titleElement = dom.querySelector('.article-title')
if (!titleElement) {
title = titleElement.textContent
titleElement.remove()
}
return { content: dom.body.outerHTML, title: title };
}
}

View File

@ -9,16 +9,10 @@ const puppeteer = require('puppeteer-core');
const axios = require('axios');
const jwt = require('jsonwebtoken');
const { promisify } = require('util');
const { parseHTML } = require('linkedom');
const { preHandleContent } = require('@omnivore/content-handler');
const signToken = promisify(jwt.sign);
const { appleNewsHandler } = require('./apple-news-handler');
const { twitterHandler } = require('./twitter-handler');
const { youtubeHandler } = require('./youtube-handler');
const { tDotCoHandler } = require('./t-dot-co-handler');
const { pdfHandler } = require('./pdf-handler');
const { mediumHandler } = require('./medium-handler');
const { derstandardHandler } = require('./derstandard-handler');
const { imageHandler } = require('./image-handler');
const { scrapingBeeHandler } = require('./scrapingBee-handler')
const MOBILE_USER_AGENT = 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
const DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
@ -29,8 +23,6 @@ const NON_SCRIPT_HOSTS= ['medium.com', 'fastcompany.com'];
const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/plain', 'application/pdf'];
const { parseHTML } = require('linkedom');
// Add stealth plugin to hide puppeteer usage
// const StealthPlugin = require('puppeteer-extra-plugin-stealth');
// puppeteer.use(StealthPlugin());
@ -207,19 +199,6 @@ const saveUploadedPdf = async (userId, url, uploadFileId, articleSavingRequestId
);
};
// Registry of URL handlers, keyed by a short name used in log messages.
// fetchContent walks this object in insertion order (Object.entries /
// Object.keys) and uses the first handler whose shouldResolve /
// shouldPrehandle matches, so the order of the entries is significant.
const handlers = {
'pdf': pdfHandler,
'apple-news': appleNewsHandler,
'twitter': twitterHandler,
'youtube': youtubeHandler,
't-dot-co': tDotCoHandler,
'medium': mediumHandler,
'derstandard': derstandardHandler,
'image': imageHandler,
'scrapingBee': scrapingBeeHandler,
};
async function fetchContent(req, res) {
functionStartTime = Date.now();
@ -246,61 +225,18 @@ async function fetchContent(req, res) {
return res.sendStatus(400);
}
// if (!userId || !articleSavingRequestId) {
// Object.assign(logRecord, { invalidParams: true, body: req.body, query: req.query });
// console.log(`Invalid parameters`, logRecord);
// return res.sendStatus(400);
// }
// Before we run the regular handlers we check to see if we need to
// pre-resolve the URL. TODO: This should probably happen recursively,
// so URLs can be pre-resolved, handled, pre-resolved, handled, etc.
for (const [key, handler] of Object.entries(handlers)) {
if (handler.shouldResolve && handler.shouldResolve(url)) {
try {
url = await handler.resolve(url);
validateUrlString(url);
} catch (err) {
console.log('error resolving url with handler', key, err);
}
break;
}
}
// Before we fetch the page we check the handlers, to see if they want
// to perform a prefetch action that can modify our requests.
// enumerate the handlers and see if any of them want to handle the request
const handler = Object.keys(handlers).find(key => {
try {
return handlers[key].shouldPrehandle(url)
} catch (e) {
console.log('error with handler: ', key, e);
}
return false;
});
var title = undefined;
var content = undefined;
var contentType = undefined;
if (handler) {
try {
// The only handler we have now can modify the URL, but in the
// future maybe we let it modify content. In that case
// we might exit the request early.
console.log('pre-handling url with handler: ', handler);
const result = await handlers[handler].prehandle(url);
if (result && result.url) {
url = result.url
validateUrlString(url);
}
if (result && result.title) { title = result.title }
if (result && result.content) { content = result.content }
if (result && result.contentType) { contentType = result.contentType }
} catch (e) {
console.log('error with handler: ', handler, e);
let title, content, contentType;
try {
const result = await preHandleContent(url);
if (result && result.url) {
url = result.url
validateUrlString(url);
}
if (result && result.title) { title = result.title }
if (result && result.content) { content = result.content }
if (result && result.contentType) { contentType = result.contentType }
} catch (e) {
console.log('error with handler: ', e);
}
let context, page, finalUrl;

View File

@ -1,34 +0,0 @@
/* eslint-disable no-undef */
/* eslint-disable no-empty */
/* eslint-disable @typescript-eslint/explicit-function-return-type */
/* eslint-disable @typescript-eslint/no-var-requires */
/* eslint-disable @typescript-eslint/no-require-imports */
require('dotenv').config();
exports.imageHandler = {
shouldPrehandle: (url, env) => {
const IMAGE_URL_PATTERN =
/(https?:\/\/.*\.(?:jpg|jpeg|png|webp))/i
return IMAGE_URL_PATTERN.test(url.toString())
},
prehandle: async (url, env) => {
const title = url.toString().split('/').pop();
const content = `
<html>
<head>
<title>${title}</title>
<meta property="og:image" content="${url}" />
<meta property="og:title" content="${title}" />
</head>
<body>
<div>
<img src="${url}" alt="${title}">
</div>
</body>
</html>`
return { title, content };
}
}

View File

@ -1,29 +0,0 @@
/* eslint-disable no-undef */
/* eslint-disable no-empty */
/* eslint-disable @typescript-eslint/explicit-function-return-type */
/* eslint-disable @typescript-eslint/no-var-requires */
/* eslint-disable @typescript-eslint/no-require-imports */
require('dotenv').config();
const axios = require('axios');
const os = require('os');
exports.mediumHandler = {
shouldPrehandle: (url, env) => {
const u = new URL(url);
return u.hostname.endsWith('medium.com')
},
prehandle: async (url, env) => {
console.log('prehandling medium url', url)
try {
const res = new URL(url);
res.searchParams.delete('source');
return { url: res.toString() }
} catch (error) {
console.error('error prehandling medium url', error)
throw error
}
}
}

View File

@ -11,7 +11,8 @@
"linkedom": "^0.14.9",
"luxon": "^2.3.1",
"puppeteer-core": "^16.1.0",
"underscore": "^1.13.4"
"underscore": "^1.13.4",
"@omnivore/content-handler": "1.0.0"
},
"scripts": {
"start": "node app.js",

View File

@ -1,21 +0,0 @@
/* eslint-disable no-undef */
/* eslint-disable no-empty */
/* eslint-disable @typescript-eslint/explicit-function-return-type */
/* eslint-disable @typescript-eslint/no-var-requires */
/* eslint-disable @typescript-eslint/no-require-imports */
require('dotenv').config();
const Url = require('url');
exports.pdfHandler = {
shouldPrehandle: (url, env) => {
const u = Url.parse(url)
const path = u.path.replace(u.search, '')
return path.endsWith('.pdf')
},
prehandle: async (url, env) => {
return { contentType: 'application/pdf' };
}
}

View File

@ -1,44 +0,0 @@
/* eslint-disable no-undef */
/* eslint-disable no-empty */
/* eslint-disable @typescript-eslint/explicit-function-return-type */
/* eslint-disable @typescript-eslint/no-var-requires */
/* eslint-disable @typescript-eslint/no-require-imports */
require('dotenv').config();
const axios = require('axios');
const { parseHTML } = require('linkedom');
const os = require('os');
exports.scrapingBeeHandler = {
shouldPrehandle: (url, env) => {
const u = new URL(url);
const hostnames = [
'nytimes.com',
'news.google.com',
]
return hostnames.some((h) => u.hostname.endsWith(h))
},
prehandle: async (url, env) => {
console.log('prehandling url with scrapingbee', url)
try {
const response = await axios.get('https://app.scrapingbee.com/api/v1', {
params: {
'api_key': process.env.SCRAPINGBEE_API_KEY,
'url': url,
'return_page_source': true,
'block_ads': true,
'block_resources': false,
}
})
const dom = parseHTML(response.data).document;
return { title: dom.title, content: response.data, url: url }
} catch (error) {
console.error('error prehandling url w/scrapingbee', error)
throw error
}
}
}

View File

@ -1,31 +0,0 @@
/* eslint-disable no-undef */
/* eslint-disable no-empty */
/* eslint-disable @typescript-eslint/explicit-function-return-type */
/* eslint-disable @typescript-eslint/no-var-requires */
/* eslint-disable @typescript-eslint/no-require-imports */
require('dotenv').config();
const axios = require('axios');
const Url = require('url');
exports.tDotCoHandler = {
shouldResolve: function (url, env) {
const T_DOT_CO_URL_MATCH = /^https:\/\/(?:www\.)?t\.co\/.*$/;
return T_DOT_CO_URL_MATCH.test(url);
},
resolve: async function(url, env) {
return await axios.get(url, { maxRedirects: 0, validateStatus: null })
.then(res => {
return Url.parse(res.headers.location).href;
}).catch((err) => {
console.log('err with t.co url', err);
return undefined;
});
},
shouldPrehandle: (url, env) => {
return false
},
}

View File

@ -1,9 +0,0 @@
const { expect } = require('chai')
const { appleNewsHandler } = require('../apple-news-handler')
// Smoke test: calls the Apple News pre-handler with a real apple.news link and
// logs whatever it returns. Nothing is asserted beyond the call not throwing.
describe('open a simple web page', function () {
  it('should return a response', async function () {
    const shareUrl = 'https://apple.news/AxjzaZaPvSn23b67LhXI5EQ'
    const result = await appleNewsHandler.prehandle(shareUrl)
    console.log('response', result)
  })
})

View File

@ -1,12 +0,0 @@
const { expect } = require('chai')
const { getYoutubeVideoId } = require('../youtube-handler')
// Checks that the video id is extracted from the common YouTube URL shapes:
// watch links with extra query parameters and youtu.be short links.
describe('getYoutubeVideoId', () => {
  it('should parse video id out of a URL', async () => {
    const cases = [
      ['BnSUk0je6oo', 'https://www.youtube.com/watch?v=BnSUk0je6oo&t=269s'],
      ['vFD2gu007dc', 'https://www.youtube.com/watch?v=vFD2gu007dc&list=RDvFD2gu007dc&start_radio=1'],
      ['vFD2gu007dc', 'https://youtu.be/vFD2gu007dc'],
      ['BMFVCnbRaV4', 'https://youtube.com/watch?v=BMFVCnbRaV4&feature=share'],
      ['cg9b4RC87LI', 'https://youtu.be/cg9b4RC87LI?t=116'],
    ]
    for (const [videoId, videoUrl] of cases) {
      expect(videoId).to.eq(getYoutubeVideoId(videoUrl));
    }
  })
})

View File

@ -1,172 +0,0 @@
/* eslint-disable no-undef */
/* eslint-disable no-empty */
/* eslint-disable @typescript-eslint/explicit-function-return-type */
/* eslint-disable @typescript-eslint/no-var-requires */
/* eslint-disable @typescript-eslint/no-require-imports */
require('dotenv').config();
const axios = require('axios');
const { DateTime } = require('luxon');
const _ = require('underscore');
// Bearer token for the Twitter API, read from the environment once when the
// module is loaded.
const TWITTER_BEARER_TOKEN = process.env.TWITTER_BEARER_TOKEN;
// Matches tweet status URLs. Capture group 1 is the username and capture
// group 2 is the numeric tweet id.
const TWITTER_URL_MATCH = /twitter\.com\/(?:#!\/)?(\w+)\/status(?:es)?\/(\d+)(?:\/.*)?/
const embeddedTweet = async (url) => {
const BASE_ENDPOINT = 'https://publish.twitter.com/oembed'
const apiUrl = new URL(BASE_ENDPOINT)
apiUrl.searchParams.append('url', url);
apiUrl.searchParams.append('omit_script', true);
apiUrl.searchParams.append('dnt', true);
return await axios.get(apiUrl.toString(), {
headers: {
Authorization: `Bearer ${TWITTER_BEARER_TOKEN}`,
redirect: "follow",
},
});
};
// Builds the query-string fragment (each part starts with "&") that selects
// the tweet, expansion, user and media fields requested from the API.
const getTweetFields = () => {
  const segments = [
    "&tweet.fields=attachments,author_id,conversation_id,created_at," +
      "entities,geo,in_reply_to_user_id,lang,possibly_sensitive,public_metrics,referenced_tweets," +
      "source,withheld",
    "&expansions=author_id,attachments.media_keys",
    "&user.fields=created_at,description,entities,location,pinned_tweet_id,profile_image_url,protected,public_metrics,url,verified,withheld",
    "&media.fields=duration_ms,height,preview_image_url,url,media_key,public_metrics,width",
  ]
  return segments.join('')
}
const getTweetById = async (id) => {
const BASE_ENDPOINT = "https://api.twitter.com/2/tweets/";
const apiUrl = new URL(BASE_ENDPOINT + id + '?' + getTweetFields())
return await axios.get(apiUrl.toString(), {
headers: {
Authorization: `Bearer ${TWITTER_BEARER_TOKEN}`,
redirect: "follow",
},
});
};
const getUserByUsername = async (username) => {
const BASE_ENDPOINT = "https://api.twitter.com/2/users/by/username/";
const apiUrl = new URL(BASE_ENDPOINT + username)
apiUrl.searchParams.append('user.fields', 'profile_image_url');
return await axios.get(apiUrl.toString(), {
headers: {
Authorization: `Bearer ${TWITTER_BEARER_TOKEN}`,
redirect: "follow",
},
});
};
// Title for an oEmbed tweet response: "<author_name> on Twitter".
const titleForTweet = (tweet) => {
  const { author_name: authorName } = tweet.data
  return `${authorName} on Twitter`
};
// Title for a tweet given its author object: "<name> on Twitter".
const titleForAuthor = (author) => {
  const displayName = author.name
  return `${displayName} on Twitter`
};
const usernameFromStatusUrl = (url) => {
const match = url.toString().match(TWITTER_URL_MATCH)
return match[1]
};
const tweetIdFromStatusUrl = (url) => {
const match = url.toString().match(TWITTER_URL_MATCH)
return match[2]
};
const formatTimestamp = (timestamp) => {
return DateTime.fromJSDate(new Date(timestamp)).toLocaleString(DateTime.DATETIME_FULL);
};
exports.twitterHandler = {
shouldPrehandle: (url, env) => {
return TWITTER_BEARER_TOKEN && TWITTER_URL_MATCH.test(url.toString())
},
// version of the handler that uses the oembed API
// This isn't great as it doesn't work well with our
// readability API. But could potentially give a more consistent
// look to the tweets
// prehandle: async (url, env) => {
// const oeTweet = await embeddedTweet(url)
// const dom = new JSDOM(oeTweet.data.html);
// const bq = dom.window.document.querySelector('blockquote')
// console.log('blockquote:', bq);
// const title = titleForTweet(oeTweet)
// return { title, content: '<div>' + bq.innerHTML + '</div>', url: oeTweet.data.url };
// }
prehandle: async (url, env) => {
console.log('prehandling twitter url', url)
const tweetId = tweetIdFromStatusUrl(url)
const tweetData = (await getTweetById(tweetId)).data;
const authorId = tweetData.data.author_id;
const author = tweetData.includes.users.filter(u => u.id = authorId)[0];
// escape html entities in title
const title = _.escape(titleForAuthor(author))
const authorImage = author.profile_image_url.replace('_normal', '_400x400')
let text = tweetData.data.text;
if (tweetData.data.entities && tweetData.data.entities.urls) {
for (let urlObj of tweetData.data.entities.urls) {
text = text.replace(
urlObj.url,
`<a href="${urlObj.expanded_url}">${urlObj.display_url}</a>`
);
}
}
const front = `
<div>
<p>${text}</p>
`
var includesHtml = '';
if (tweetData.includes.media) {
includesHtml = tweetData.includes.media.map(m => {
const linkUrl = m.type == 'photo' ? m.url : url;
const previewUrl = m.type == 'photo' ? m.url : m.preview_image_url;
const mediaOpen = `<a class="media-link" href=${linkUrl}>
<picture>
<img class="tweet-img" src=${previewUrl} />
</picture>
</a>`
return mediaOpen
}).join('\n');
}
const back = `
— <a href="https://twitter.com/${author.username}">${author.username}</a> ${author.name} <a href="${url}">${formatTimestamp(tweetData.data.created_at)}</a>
</div>
`
const content = `
<head>
<meta property="og:image" content="${authorImage}" />
<meta property="og:image:secure_url" content="${authorImage}" />
<meta property="og:title" content="${title}" />
<meta property="og:description" content="${_.escape(tweetData.data.text)}" />
</head>
<body>
${front}
${includesHtml}
${back}
</body>`
return { content, url, title };
}
}

View File

@ -1,68 +0,0 @@
/* eslint-disable no-undef */
/* eslint-disable no-empty */
/* eslint-disable @typescript-eslint/explicit-function-return-type */
/* eslint-disable @typescript-eslint/no-var-requires */
/* eslint-disable @typescript-eslint/no-require-imports */
require('dotenv').config();
const axios = require('axios');
const _ = require('underscore');
// Matches youtube.com / youtu.be video URLs; the scheme and a "www." or "m."
// prefix are optional. Capture group 5 holds the video id when it is part of
// the path (e.g. youtu.be/<id>, /embed/<id>).
// NOTE(review): the dot in `youtu.be` is not escaped, so any character is
// accepted in that position.
const YOUTUBE_URL_MATCH =
/^((?:https?:)?\/\/)?((?:www|m)\.)?((?:youtube\.com|youtu.be))(\/(?:[\w-]+\?v=|embed\/|v\/)?)([\w-]+)(\S+)?$/
function getYoutubeVideoId(url) {
const u = new URL(url);
const videoId = u.searchParams.get('v');
if (!videoId) {
const match = url.toString().match(YOUTUBE_URL_MATCH)
if (match === null || match.length < 6 || !match[5]) {
return undefined
}
return match[5]
}
return videoId
}
exports.getYoutubeVideoId = getYoutubeVideoId
exports.youtubeHandler = {
shouldPrehandle: (url, env) => {
return YOUTUBE_URL_MATCH.test(url.toString())
},
prehandle: async (url, env) => {
const videoId = getYoutubeVideoId(url)
if (!videoId) {
return {}
}
const oembedUrl = `https://www.youtube.com/oembed?format=json&url=` + encodeURIComponent(`https://www.youtube.com/watch?v=${videoId}`)
const oembed = (await axios.get(oembedUrl.toString())).data;
// escape html entities in title
const title = _.escape(oembed.title);
const ratio = oembed.width / oembed.height;
const thumbnail = oembed.thumbnail_url;
const height = 350;
const width = height * ratio;
const authorName = _.escape(oembed.author_name);
const content = `
<html>
<head><title>${title}</title>
<meta property="og:image" content="${thumbnail}" />
<meta property="og:image:secure_url" content="${thumbnail}" />
<meta property="og:title" content="${title}" />
<meta property="og:description" content="" />
<meta property="og:article:author" content="${authorName}" />
</head>
<body>
<iframe width="${width}" height="${height}" src="https://www.youtube.com/embed/${videoId}" title="${title}" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
<p><a href="${url}" target="_blank">${title}</a></p>
<p itemscope="" itemprop="author" itemtype="http://schema.org/Person">By <a href="${oembed.author_url}" target="_blank">${authorName}</a></p>
</body>
</html>`
console.log('got video id', videoId)
return { content, title: 'Youtube Content' };
}
}

View File

@ -1,8 +1,13 @@
import { ContentHandler, PreHandleResult } from './index'
import axios from 'axios'
import { parseHTML } from 'linkedom'
import { ContentHandler, PreHandleResult } from './content-handler'
export class AppleNewsHandler extends ContentHandler {
constructor() {
super()
this.name = 'Apple News'
}
shouldPreHandle(url: string, dom?: Document): boolean {
const u = new URL(url)
return u.hostname === 'apple.news'

View File

@ -1,8 +1,13 @@
import { ContentHandler, PreHandleResult } from './index'
import axios from 'axios'
import { parseHTML } from 'linkedom'
import { ContentHandler, PreHandleResult } from './content-handler'
export class BloombergHandler extends ContentHandler {
constructor() {
super()
this.name = 'Bloomberg'
}
class BloombergHandler extends ContentHandler {
shouldPreHandle(url: string, dom?: Document): boolean {
const BLOOMBERG_URL_MATCH =
/https?:\/\/(www\.)?bloomberg.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&/=]*)/

View File

@ -0,0 +1,126 @@
import addressparser from 'addressparser'
import rfc2047 from 'rfc2047'
import { v4 as uuidv4 } from 'uuid'
/**
 * Unsubscribe targets extracted from a List-Unsubscribe header by
 * `ContentHandler.parseUnsubscribe`. Either field is left undefined when the
 * corresponding pattern does not match the header.
 */
interface Unsubscribe {
mailTo?: string
httpUrl?: string
}
/**
 * Value returned by `ContentHandler.handleNewsletter`: the incoming email's
 * fields together with the resolved article URL, the parsed author and the
 * unsubscribe targets.
 */
interface NewsletterMessage {
email: string
// The newsletter HTML, passed through unchanged.
content: string
url: string
title: string
author: string
unsubMailTo?: string
unsubHttpUrl?: string
}
/**
 * Result of pre-handling a URL. Every field is optional; a handler sets only
 * the ones it wants to supply or override.
 */
export interface PreHandleResult {
url?: string
title?: string
content?: string
contentType?: string
dom?: Document
}
/**
 * Base class for content handlers.
 *
 * A handler can take part in three ways, each with a do-nothing default that
 * subclasses override:
 *  - URL resolution (`shouldResolve` / `resolve`),
 *  - pre-handling a URL before the page is fetched
 *    (`shouldPreHandle` / `preHandle`),
 *  - newsletter email handling (`isNewsletter` / `handleNewsletter`).
 */
export abstract class ContentHandler {
  // Placeholder values; presumably replaced by newsletter handlers — only
  // `name` is seen being overridden in the handlers shown here.
  protected senderRegex: RegExp
  protected urlRegex: RegExp
  protected defaultUrl: string
  /** Human-readable handler name, used in log messages. */
  public name: string

  protected constructor() {
    this.senderRegex = new RegExp(/NEWSLETTER_SENDER_REGEX/)
    this.urlRegex = new RegExp(/NEWSLETTER_URL_REGEX/)
    this.defaultUrl = 'NEWSLETTER_DEFAULT_URL'
    this.name = 'Handler name'
  }

  /** @returns whether `resolve` should be called for this URL (default: false). */
  shouldResolve(url: string): boolean {
    return false
  }

  /** @returns the resolved URL; the default returns `url` unchanged. */
  async resolve(url: string): Promise<string | undefined> {
    return Promise.resolve(url)
  }

  /** @returns whether `preHandle` should be called for this URL (default: false). */
  shouldPreHandle(url: string, dom?: Document): boolean {
    return false
  }

  /** @returns the pre-handle result; the default echoes `url` and `document`. */
  async preHandle(url: string, document?: Document): Promise<PreHandleResult> {
    return Promise.resolve({ url, dom: document })
  }

  /**
   * @param postHeader value of the email's post header (may be empty).
   * @param from       value of the From header.
   * @param unSubHeader value of the List-Unsubscribe header (may be empty).
   * @returns true when the sender matches `senderRegex` and at least one of
   *          the two headers is present.
   */
  isNewsletter(postHeader: string, from: string, unSubHeader: string): boolean {
    // Axios newsletter is from <xx@axios.com>
    const re = new RegExp(this.senderRegex)
    return re.test(from) && (!!postHeader || !!unSubHeader)
  }

  /**
   * @returns capture group 1 of `urlRegex` matched against the email HTML,
   *          or undefined when there is no match.
   */
  parseNewsletterUrl(_postHeader: string, html: string): string | undefined {
    // get newsletter url from html
    const matches = html.match(this.urlRegex)
    if (matches) {
      return matches[1]
    }
    return undefined
  }

  /**
   * @returns the display name of the first address in `from`, or `from`
   *          itself when no address could be parsed.
   */
  parseAuthor(from: string): string {
    // get author name from email
    // e.g. 'Jackson Harper from Omnivore App <jacksonh@substack.com>'
    // or 'Mike Allen <mike@axios.com>'
    const parsed = addressparser(from)
    if (parsed.length > 0) {
      return parsed[0].name
    }
    return from
  }

  /**
   * Parses a List-Unsubscribe header.
   *
   * @returns `mailTo` holding the address from the `<mailto:...>` entry and
   *          `httpUrl` holding the `<http(s)://...>` entry; each is undefined
   *          when the entry is absent.
   */
  parseUnsubscribe(unSubHeader: string): Unsubscribe {
    // parse list-unsubscribe header
    // e.g. List-Unsubscribe: <https://omnivore.com/unsub>, <mailto:unsub@omnivore.com>
    const decoded = rfc2047.decode(unSubHeader)
    // The two patterns were previously assigned to the opposite fields.
    return {
      mailTo: decoded.match(/<mailto:([^>]*)>/)?.[1],
      httpUrl: decoded.match(/<(https?:\/\/[^>]*)>/)?.[1],
    }
  }

  /**
   * Builds the newsletter message for an incoming email.
   *
   * @throws Error('invalid newsletter email') when `email`, `html`, `title`
   *         or `from` is empty.
   */
  handleNewsletter(
    email: string,
    html: string,
    postHeader: string,
    title: string,
    from: string,
    unSubHeader: string
  ): NewsletterMessage {
    console.log('handleNewsletter', email, postHeader, title, from)

    if (!email || !html || !title || !from) {
      console.log('invalid newsletter email')
      throw new Error('invalid newsletter email')
    }

    // fallback to default url if newsletter url does not exist
    // assign a random uuid to the default url to avoid duplicate url
    const url =
      this.parseNewsletterUrl(postHeader, html) ||
      `${this.defaultUrl}?source=newsletters&id=${uuidv4()}`
    const author = this.parseAuthor(from)
    const unsubscribe = this.parseUnsubscribe(unSubHeader)

    return {
      email,
      content: html,
      url,
      title,
      author,
      unsubMailTo: unsubscribe.mailTo || '',
      unsubHttpUrl: unsubscribe.httpUrl || '',
    }
  }
}

View File

@ -1,8 +1,13 @@
import { ContentHandler, PreHandleResult } from './index'
import { ContentHandler, PreHandleResult } from './content-handler'
import axios from 'axios'
import { parseHTML } from 'linkedom'
class DerstandardHandler extends ContentHandler {
export class DerstandardHandler extends ContentHandler {
constructor() {
super()
this.name = 'Derstandard'
}
shouldPreHandle(url: string, dom?: Document): boolean {
const u = new URL(url)
return u.hostname === 'www.derstandard.at'

View File

@ -1,6 +1,11 @@
import { ContentHandler, PreHandleResult } from './index'
import { ContentHandler, PreHandleResult } from './content-handler'
export class ImageHandler extends ContentHandler {
constructor() {
super()
this.name = 'Image'
}
class ImageHandler extends ContentHandler {
shouldPreHandle(url: string, dom?: Document): boolean {
const IMAGE_URL_PATTERN = /(https?:\/\/.*\.(?:jpg|jpeg|png|webp))/i
return IMAGE_URL_PATTERN.test(url.toString())

View File

@ -1,111 +1,76 @@
import addressparser from 'addressparser'
import { v4 as uuidv4 } from 'uuid'
import rfc2047 from 'rfc2047'
import { AppleNewsHandler } from './apple-news-handler'
import { BloombergHandler } from './bloomberg-handler'
import { DerstandardHandler } from './derstandard-handler'
import { ImageHandler } from './image-handler'
import { MediumHandler } from './medium-handler'
import { PdfHandler } from './pdf-handler'
import { ScrapingBeeHandler } from './scrapingBee-handler'
import { TDotCoHandler } from './t-dot-co-handler'
import { TwitterHandler } from './twitter-handler'
import { YoutubeHandler } from './youtube-handler'
import { ContentHandler, PreHandleResult } from './content-handler'
interface Unsubscribe {
mailTo?: string
httpUrl?: string
/**
 * Rejects URLs that must not be fetched: non-http(s) schemes, localhost and
 * loopback addresses, and hosts that start with a private or link-local IPv4
 * prefix.
 *
 * @param url URL string to check.
 * @returns true when the URL passes every check, so the call can also be
 *          used inside a boolean condition.
 * @throws TypeError when `url` cannot be parsed.
 * @throws Error describing the failed check otherwise.
 */
const validateUrlString = (url: string): boolean => {
  const u = new URL(url)
  // Make sure the URL is http or https
  if (u.protocol !== 'http:' && u.protocol !== 'https:') {
    throw new Error('Invalid URL protocol check failed')
  }
  const { hostname } = u
  // Make sure the domain is not localhost / loopback
  if (
    hostname === 'localhost' ||
    hostname === '0.0.0.0' ||
    hostname === '[::1]' ||
    /^127\./.test(hostname)
  ) {
    throw new Error('Invalid URL is localhost')
  }
  // Make sure the domain is not a private IP. Covers 10/8, the whole
  // 172.16/12 block (172.16 - 172.31, not just 172.16), 192.168/16 and the
  // link-local 169.254/16 range.
  if (/^(10|169\.254|172\.(1[6-9]|2\d|3[01])|192\.168)\./.test(hostname)) {
    throw new Error('Invalid URL is private ip')
  }
  return true
}
interface NewsletterMessage {
email: string
content: string
url: string
title: string
author: string
unsubMailTo?: string
unsubHttpUrl?: string
}
// All registered content handlers. preHandleContent scans this list in order
// and stops at the first handler that accepts the URL, so the order of the
// entries decides which handler wins when several match.
const contentHandlers: ContentHandler[] = [
new AppleNewsHandler(),
new BloombergHandler(),
new DerstandardHandler(),
new ImageHandler(),
new MediumHandler(),
new PdfHandler(),
new ScrapingBeeHandler(),
new TDotCoHandler(),
new TwitterHandler(),
new YoutubeHandler(),
]
export interface PreHandleResult {
url?: string
title?: string
content?: string
contentType?: string
/**
 * Runs the registered content handlers for a URL before the page is fetched.
 *
 * First the URL is resolved by the first handler that wants to resolve it
 * (e.g. a link shortener); then the first handler that wants to pre-handle
 * the (possibly resolved) URL produces the result.
 *
 * @param url URL being saved.
 * @param dom optional document passed through to the handlers.
 * @returns the pre-handle result of the first matching handler, or undefined
 *          when no handler matches.
 */
export const preHandleContent = async (
  url: string,
  dom?: Document
): Promise<PreHandleResult | undefined> => {
  let targetUrl = url
  // Before we run the regular handlers we check to see if we need to
  // pre-resolve the URL. TODO: This should probably happen recursively,
  // so URLs can be pre-resolved, handled, pre-resolved, handled, etc.
  for (const handler of contentHandlers) {
    if (handler.shouldResolve(targetUrl)) {
      try {
        const resolvedUrl = await handler.resolve(targetUrl)
        if (resolvedUrl) {
          // validateUrlString reports an invalid URL by throwing. Its return
          // value must not be used as the condition: it returned nothing, so
          // the resolved URL was always discarded.
          validateUrlString(resolvedUrl)
          targetUrl = resolvedUrl
        }
      } catch (err) {
        // Best effort: keep the original URL when resolution fails.
        console.log('error resolving url with handler', handler.name, err)
      }
      break
    }
  }
  // Before we fetch the page we check the handlers, to see if they want
  // to perform a prefetch action that can modify our requests.
  // enumerate the handlers and see if any of them want to handle the request
  for (const handler of contentHandlers) {
    if (handler.shouldPreHandle(targetUrl, dom)) {
      console.log('preHandleContent', handler.name, targetUrl)
      return handler.preHandle(targetUrl, dom)
    }
  }
  return undefined
}
export class ContentHandler {
protected senderRegex = /NEWSLETTER_SENDER_REGEX/
protected urlRegex = /NEWSLETTER_URL_REGEX/
protected defaultUrl = 'NEWSLETTER_DEFAULT_URL'
protected name = ''
shouldPreHandle(url: string, dom?: Document): boolean {
return false
}
async preHandle(url: string, document?: Document): Promise<PreHandleResult> {
return Promise.resolve({ url, dom: document })
}
isNewsletter(postHeader: string, from: string, unSubHeader: string): boolean {
// Axios newsletter is from <xx@axios.com>
const re = new RegExp(this.senderRegex)
return re.test(from) && (!!postHeader || !!unSubHeader)
}
parseNewsletterUrl(_postHeader: string, html: string): string | undefined {
// get newsletter url from html
const matches = html.match(this.urlRegex)
if (matches) {
return matches[1]
}
return undefined
}
parseAuthor(from: string): string {
// get author name from email
// e.g. 'Jackson Harper from Omnivore App <jacksonh@substack.com>'
// or 'Mike Allen <mike@axios.com>'
const parsed = addressparser(from)
if (parsed.length > 0) {
return parsed[0].name
}
return from
}
/**
 * Parses a List-Unsubscribe header.
 *
 * @returns `mailTo` holding the address from the `<mailto:...>` entry and
 *          `httpUrl` holding the `<http(s)://...>` entry; each is undefined
 *          when the entry is absent.
 */
parseUnsubscribe(unSubHeader: string): Unsubscribe {
  // parse list-unsubscribe header
  // e.g. List-Unsubscribe: <https://omnivore.com/unsub>, <mailto:unsub@omnivore.com>
  const decoded = rfc2047.decode(unSubHeader)
  // The two patterns were previously assigned to the opposite fields.
  return {
    mailTo: decoded.match(/<mailto:([^>]*)>/)?.[1],
    httpUrl: decoded.match(/<(https?:\/\/[^>]*)>/)?.[1],
  }
}
handleNewsletter(
email: string,
html: string,
postHeader: string,
title: string,
from: string,
unSubHeader: string
): NewsletterMessage {
console.log('handleNewsletter', email, postHeader, title, from)
if (!email || !html || !title || !from) {
console.log('invalid newsletter email')
throw new Error('invalid newsletter email')
}
// fallback to default url if newsletter url does not exist
// assign a random uuid to the default url to avoid duplicate url
const url =
this.parseNewsletterUrl(postHeader, html) ||
`${this.defaultUrl}?source=newsletters&id=${uuidv4()}`
const author = this.parseAuthor(from)
const unsubscribe = this.parseUnsubscribe(unSubHeader)
return {
email,
content: html,
url,
title,
author,
unsubMailTo: unsubscribe.mailTo || '',
unsubHttpUrl: unsubscribe.httpUrl || '',
}
}
module.exports = {
preHandleContent,
}

View File

@ -1,6 +1,11 @@
import { ContentHandler, PreHandleResult } from './index'
import { ContentHandler, PreHandleResult } from './content-handler'
export class MediumHandler extends ContentHandler {
constructor() {
super()
this.name = 'Medium'
}
class MediumHandler extends ContentHandler {
shouldPreHandle(url: string, dom?: Document): boolean {
const u = new URL(url)
return u.hostname.endsWith('medium.com')

View File

@ -1,6 +1,11 @@
import { ContentHandler, PreHandleResult } from './index'
import { ContentHandler, PreHandleResult } from './content-handler'
export class PdfHandler extends ContentHandler {
constructor() {
super()
this.name = 'PDF'
}
class PdfHandler extends ContentHandler {
shouldPreHandle(url: string, dom?: Document): boolean {
const u = new URL(url)
const path = u.pathname.replace(u.search, '')

View File

@ -1,8 +1,13 @@
import { ContentHandler, PreHandleResult } from './index'
import { ContentHandler, PreHandleResult } from './content-handler'
import axios from 'axios'
import { parseHTML } from 'linkedom'
class ScrapingBeeHandler extends ContentHandler {
export class ScrapingBeeHandler extends ContentHandler {
constructor() {
super()
this.name = 'ScrapingBee'
}
shouldPreHandle(url: string, dom?: Document): boolean {
const u = new URL(url)
const hostnames = ['nytimes.com', 'news.google.com']

View File

@ -1,7 +1,12 @@
import { ContentHandler } from './index'
import { ContentHandler } from './content-handler'
import axios from 'axios'
class TDotCoHandler extends ContentHandler {
export class TDotCoHandler extends ContentHandler {
constructor() {
super()
this.name = 't.co'
}
shouldResolve(url: string): boolean {
const T_DOT_CO_URL_MATCH = /^https:\/\/(?:www\.)?t\.co\/.*$/
return T_DOT_CO_URL_MATCH.test(url)

View File

@ -1,4 +1,4 @@
import { ContentHandler, PreHandleResult } from './index'
import { ContentHandler, PreHandleResult } from './content-handler'
import axios from 'axios'
import { DateTime } from 'luxon'
import _ from 'underscore'
@ -52,7 +52,12 @@ const formatTimestamp = (timestamp: string) => {
)
}
class TwitterHandler extends ContentHandler {
export class TwitterHandler extends ContentHandler {
constructor() {
super()
this.name = 'Twitter'
}
shouldPreHandle(url: string, dom?: Document): boolean {
return !!TWITTER_BEARER_TOKEN && TWITTER_URL_MATCH.test(url.toString())
}

View File

@ -1,4 +1,4 @@
import { ContentHandler, PreHandleResult } from './index'
import { ContentHandler, PreHandleResult } from './content-handler'
import axios from 'axios'
import _ from 'underscore'
@ -18,7 +18,12 @@ export const getYoutubeVideoId = (url: string) => {
return videoId
}
class YoutubeHandler extends ContentHandler {
export class YoutubeHandler extends ContentHandler {
constructor() {
super()
this.name = 'Youtube'
}
shouldPreHandle(url: string, dom?: Document): boolean {
return YOUTUBE_URL_MATCH.test(url.toString())
}

View File

@ -10579,19 +10579,6 @@ chai@^4.3.4:
pathval "^1.1.1"
type-detect "^4.0.5"
chai@^4.3.6:
version "4.3.6"
resolved "https://registry.yarnpkg.com/chai/-/chai-4.3.6.tgz#ffe4ba2d9fa9d6680cc0b370adae709ec9011e9c"
integrity sha512-bbcp3YfHCUzMOvKqsztczerVgBKSsEijCySNlHHbX3VG1nskvqjz5Rfso1gGwD6w6oOV3eI60pKuMOV5MV7p3Q==
dependencies:
assertion-error "^1.1.0"
check-error "^1.0.2"
deep-eql "^3.0.1"
get-func-name "^2.0.0"
loupe "^2.3.1"
pathval "^1.1.1"
type-detect "^4.0.5"
chalk@^1.0.0, chalk@^1.1.3:
version "1.1.3"
resolved "https://registry.yarnpkg.com/chalk/-/chalk-1.1.3.tgz#a8115c55e4a702fe4d150abd3872822a7e09fc98"
@ -18078,13 +18065,6 @@ loose-envify@^1.0.0, loose-envify@^1.1.0, loose-envify@^1.4.0:
dependencies:
js-tokens "^3.0.0 || ^4.0.0"
loupe@^2.3.1:
version "2.3.4"
resolved "https://registry.yarnpkg.com/loupe/-/loupe-2.3.4.tgz#7e0b9bffc76f148f9be769cb1321d3dcf3cb25f3"
integrity sha512-OvKfgCC2Ndby6aSTREl5aCCPTNIzlDfQZvZxNUrBrihDhL3xcrYegTblhmEiCrg2kKQz4XsFIaemE5BF4ybSaQ==
dependencies:
get-func-name "^2.0.0"
lower-case-first@^1.0.0:
version "1.0.2"
resolved "https://registry.yarnpkg.com/lower-case-first/-/lower-case-first-1.0.2.tgz#e5da7c26f29a7073be02d52bac9980e5922adfa1"