Import content-handler as local dependency
This commit is contained in:
@ -23,12 +23,18 @@ WORKDIR /app
|
|||||||
ENV CHROMIUM_PATH /usr/bin/chromium-browser
|
ENV CHROMIUM_PATH /usr/bin/chromium-browser
|
||||||
ENV LAUNCH_HEADLESS=true
|
ENV LAUNCH_HEADLESS=true
|
||||||
|
|
||||||
COPY . /app/
|
COPY package.json .
|
||||||
WORKDIR app
|
COPY yarn.lock .
|
||||||
|
COPY tsconfig.json .
|
||||||
|
COPY .prettierrc .
|
||||||
|
COPY .eslintrc .
|
||||||
|
|
||||||
|
COPY /packages/content-fetch ./packages/content-fetch
|
||||||
|
COPY /packages/content-handler ./packages/content-handler
|
||||||
|
|
||||||
RUN yarn install --pure-lockfile
|
RUN yarn install --pure-lockfile
|
||||||
|
|
||||||
EXPOSE 8080
|
EXPOSE 8080
|
||||||
|
|
||||||
ENTRYPOINT ["yarn", "start"]
|
CMD ["yarn", "workspace", "@omnivore/content-fetch", "start"]
|
||||||
|
|
||||||
|
|||||||
@ -34,6 +34,7 @@ COPY .prettierrc .
|
|||||||
COPY .eslintrc .
|
COPY .eslintrc .
|
||||||
|
|
||||||
COPY /packages/content-fetch ./packages/content-fetch
|
COPY /packages/content-fetch ./packages/content-fetch
|
||||||
|
COPY /packages/content-handler ./packages/content-handler
|
||||||
|
|
||||||
RUN yarn install --pure-lockfile
|
RUN yarn install --pure-lockfile
|
||||||
|
|
||||||
|
|||||||
@ -4,6 +4,7 @@
|
|||||||
"description": "Service that fetches page content from a URL",
|
"description": "Service that fetches page content from a URL",
|
||||||
"main": "index.js",
|
"main": "index.js",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
|
"@omnivore/content-handler": "file:./../content-handler",
|
||||||
"axios": "^0.27.2",
|
"axios": "^0.27.2",
|
||||||
"dotenv": "^8.2.0",
|
"dotenv": "^8.2.0",
|
||||||
"express": "^4.17.1",
|
"express": "^4.17.1",
|
||||||
@ -11,8 +12,7 @@
|
|||||||
"linkedom": "^0.14.9",
|
"linkedom": "^0.14.9",
|
||||||
"luxon": "^2.3.1",
|
"luxon": "^2.3.1",
|
||||||
"puppeteer-core": "^16.1.0",
|
"puppeteer-core": "^16.1.0",
|
||||||
"underscore": "^1.13.4",
|
"underscore": "^1.13.4"
|
||||||
"@omnivore/content-handler": "1.0.0"
|
|
||||||
},
|
},
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"start": "node app.js",
|
"start": "node app.js",
|
||||||
|
|||||||
@ -29,9 +29,9 @@
|
|||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@google-cloud/functions-framework": "3.1.2",
|
"@google-cloud/functions-framework": "3.1.2",
|
||||||
"@google-cloud/pubsub": "^2.18.4",
|
"@google-cloud/pubsub": "^2.18.4",
|
||||||
|
"@omnivore/content-handler": "file:./../content-handler",
|
||||||
"@sendgrid/client": "^7.6.0",
|
"@sendgrid/client": "^7.6.0",
|
||||||
"@sentry/serverless": "^6.16.1",
|
"@sentry/serverless": "^6.16.1",
|
||||||
"@omnivore/content-handler": "1.0.0",
|
|
||||||
"addressparser": "^1.0.1",
|
"addressparser": "^1.0.1",
|
||||||
"axios": "^0.27.2",
|
"axios": "^0.27.2",
|
||||||
"jsonwebtoken": "^8.5.1",
|
"jsonwebtoken": "^8.5.1",
|
||||||
|
|||||||
1
packages/puppeteer-parse/content-handler/.eslintignore
Normal file
1
packages/puppeteer-parse/content-handler/.eslintignore
Normal file
@ -0,0 +1 @@
|
|||||||
|
node_modules/
|
||||||
6
packages/puppeteer-parse/content-handler/.eslintrc
Normal file
6
packages/puppeteer-parse/content-handler/.eslintrc
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
{
|
||||||
|
"extends": "../../.eslintrc",
|
||||||
|
"parserOptions": {
|
||||||
|
"project": "tsconfig.json"
|
||||||
|
}
|
||||||
|
}
|
||||||
2
packages/puppeteer-parse/content-handler/.gitignore
vendored
Normal file
2
packages/puppeteer-parse/content-handler/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
node_modules
|
||||||
|
/lib
|
||||||
7
packages/puppeteer-parse/content-handler/.npmignore
Normal file
7
packages/puppeteer-parse/content-handler/.npmignore
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
/test/
|
||||||
|
src
|
||||||
|
tsconfig.json
|
||||||
|
.eslintrc
|
||||||
|
.eslintignore
|
||||||
|
.gitignore
|
||||||
|
mocha-config.json
|
||||||
@ -0,0 +1,5 @@
|
|||||||
|
{
|
||||||
|
"extension": ["ts"],
|
||||||
|
"spec": "test/**/*.test.ts",
|
||||||
|
"require": "test/babel-register.js"
|
||||||
|
}
|
||||||
34
packages/puppeteer-parse/content-handler/package.json
Normal file
34
packages/puppeteer-parse/content-handler/package.json
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
{
|
||||||
|
"name": "@omnivore/content-handler",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"description": "A standalone version of content handler to parse and format each type of content",
|
||||||
|
"main": "build/src/index.js",
|
||||||
|
"types": "build/src/index.d.ts",
|
||||||
|
"files": [
|
||||||
|
"build/src"
|
||||||
|
],
|
||||||
|
"license": "Apache-2.0",
|
||||||
|
"scripts": {
|
||||||
|
"test": "yarn mocha -r ts-node/register --config mocha-config.json",
|
||||||
|
"lint": "eslint src --ext ts,js,tsx,jsx",
|
||||||
|
"compile": "tsc",
|
||||||
|
"build": "tsc"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"chai": "^4.3.6",
|
||||||
|
"chai-as-promised": "^7.1.1",
|
||||||
|
"chai-string": "^1.5.0",
|
||||||
|
"eslint-plugin-prettier": "^4.0.0",
|
||||||
|
"mocha": "^10.0.0",
|
||||||
|
"nock": "^13.2.9"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"addressparser": "^1.0.1",
|
||||||
|
"axios": "^0.27.2",
|
||||||
|
"linkedom": "^0.14.16",
|
||||||
|
"luxon": "^3.0.4",
|
||||||
|
"rfc2047": "^4.0.1",
|
||||||
|
"underscore": "^1.13.6",
|
||||||
|
"uuid": "^9.0.0"
|
||||||
|
}
|
||||||
|
}
|
||||||
175
packages/puppeteer-parse/content-handler/src/content-handler.ts
Normal file
175
packages/puppeteer-parse/content-handler/src/content-handler.ts
Normal file
@ -0,0 +1,175 @@
|
|||||||
|
import addressparser from 'addressparser'
|
||||||
|
import rfc2047 from 'rfc2047'
|
||||||
|
import { v4 as uuid } from 'uuid'
|
||||||
|
import { parseHTML } from 'linkedom'
|
||||||
|
import axios from 'axios'
|
||||||
|
|
||||||
|
/**
 * Unsubscribe targets extracted from an unsubscribe header.
 * Both fields are optional; a field is left undefined when no matching
 * target is present.
 */
interface Unsubscribe {
  /** HTTP(S) unsubscribe target, when available. */
  httpUrl?: string
  /** Mail-based unsubscribe target, when available. */
  mailTo?: string
}
|
||||||
|
|
||||||
|
/**
 * Raw fields of an incoming email that the newsletter handlers inspect.
 * All fields are required strings.
 */
export interface NewsletterInput {
  /** Recipient address the email was delivered to. */
  email: string
  /** Sender header, e.g. 'Mike Allen <mike@axios.com>'. */
  from: string
  /** Subject/title of the email. */
  title: string
  /** HTML body of the email. */
  html: string
  /** Post header value; an empty string when absent. */
  postHeader: string
  /** Unsubscribe header value; an empty string when absent. */
  unSubHeader: string
}
|
||||||
|
|
||||||
|
/**
 * Normalized newsletter produced by a handler's `handleNewsletter`.
 */
export interface NewsletterResult {
  /** Recipient address, copied from the input. */
  email: string
  /** Title, copied from the input. */
  title: string
  /** Author name derived from the sender header. */
  author: string
  /** Newsletter URL, or a generated placeholder URL when none was found. */
  url: string
  /** Newsletter content (the email HTML). */
  content: string
  /** Mail-based unsubscribe target, if any. */
  unsubMailTo?: string
  /** HTTP(S) unsubscribe target, if any. */
  unsubHttpUrl?: string
}
|
||||||
|
|
||||||
|
/**
 * Result of a handler's `preHandle` step. Every field is optional; a handler
 * fills in only what it was able to determine.
 */
export interface PreHandleResult {
  /** URL to use for the page. */
  url?: string
  /** Page title. */
  title?: string
  /** Page content. */
  content?: string
  /** Content type of the page. */
  contentType?: string
  /** DOM document for the page. */
  dom?: Document
}
|
||||||
|
|
||||||
|
// Prefix of the placeholder URLs used when a newsletter has no URL of its own.
export const FAKE_URL_PREFIX = 'https://omnivore.app/no_url?q='

// Builds a placeholder URL; a fresh uuid is appended on every call so two
// placeholders never collide.
export const generateUniqueUrl = () => `${FAKE_URL_PREFIX}${uuid()}`
|
||||||
|
|
||||||
|
/**
 * Base class for website and newsletter handlers.
 *
 * Every method has a neutral default (do nothing / return the input), so a
 * subclass only overrides the steps it cares about. Subclasses are expected
 * to set `name` and, when they rely on the default newsletter detection and
 * URL extraction, `senderRegex` and `urlRegex`.
 */
export abstract class ContentHandler {
  protected senderRegex: RegExp
  protected urlRegex: RegExp
  name: string

  protected constructor() {
    // Placeholder values; subclasses overwrite the ones they use.
    this.senderRegex = new RegExp(/NEWSLETTER_SENDER_REGEX/)
    this.urlRegex = new RegExp(/NEWSLETTER_URL_REGEX/)
    this.name = 'Handler name'
  }

  /** Whether `resolve` should be called for this URL. Default: never. */
  shouldResolve(url: string): boolean {
    return false
  }

  /** Resolves a URL to the one that should be fetched. Default: unchanged. */
  async resolve(url: string): Promise<string | undefined> {
    return Promise.resolve(url)
  }

  /** Whether `preHandle` should be called for this page. Default: never. */
  shouldPreHandle(url: string, dom?: Document): boolean {
    return false
  }

  /** Pre-processes a page. Default: echoes back the URL and DOM it was given. */
  async preHandle(url: string, dom?: Document): Promise<PreHandleResult> {
    return Promise.resolve({ url, dom })
  }

  /**
   * Default newsletter detection: the sender must match `senderRegex` and at
   * least one of the post / unsubscribe headers must be non-empty.
   */
  async isNewsletter(input: {
    postHeader: string
    from: string
    unSubHeader: string
    html?: string
  }): Promise<boolean> {
    const re = new RegExp(this.senderRegex)
    return Promise.resolve(
      re.test(input.from) && (!!input.postHeader || !!input.unSubHeader)
    )
  }

  /**
   * Finds the link in the email that points at the online version.
   * Default: no link.
   */
  findNewsletterHeaderHref(dom: Document): string | undefined {
    return undefined
  }

  // Given an HTML blob tries to find a URL to use for
  // a canonical URL.
  async findNewsletterUrl(html: string): Promise<string | undefined> {
    const dom = parseHTML(html).document

    // Ask the handler for its "read online" style link
    const href = this.findNewsletterHeaderHref(dom)
    if (href) {
      // Try to make a HEAD request, so we get the redirected URL, since these
      // will usually be behind tracking url redirects
      try {
        const response = await axios.head(href, { timeout: 5000 })
        return Promise.resolve(
          // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access
          response.request.res.responseUrl as string | undefined
        )
      } catch (e) {
        // Best effort: fall back to the unresolved link
        console.log('error making HEAD request', e)
        return Promise.resolve(href)
      }
    }

    return Promise.resolve(undefined)
  }

  /**
   * Default URL extraction: the first capture group of `urlRegex` matched
   * against the email HTML, or undefined when there is no match.
   */
  async parseNewsletterUrl(
    _postHeader: string,
    html: string
  ): Promise<string | undefined> {
    // get newsletter url from html
    const matches = html.match(this.urlRegex)
    if (matches) {
      return Promise.resolve(matches[1])
    }
    return Promise.resolve(undefined)
  }

  parseAuthor(from: string): string {
    // get author name from email
    // e.g. 'Jackson Harper from Omnivore App <jacksonh@substack.com>'
    // or 'Mike Allen <mike@axios.com>'
    const parsed = addressparser(from)
    if (parsed.length > 0) {
      return parsed[0].name
    }
    // Nothing parseable: fall back to the raw header value
    return from
  }

  parseUnsubscribe(unSubHeader: string): Unsubscribe {
    // parse list-unsubscribe header
    // e.g. List-Unsubscribe: <https://omnivore.com/unsub>, <mailto:unsub@omnivore.com>
    const decoded = rfc2047.decode(unSubHeader)
    return {
      // the <http(s)://...> entry is the web unsubscribe link
      httpUrl: decoded.match(/<(https?:\/\/[^>]*)>/)?.[1],
      // the <mailto:...> entry is the mail unsubscribe address
      mailTo: decoded.match(/<mailto:([^>]*)>/)?.[1],
    }
  }

  /**
   * Converts an incoming newsletter email into a `NewsletterResult`.
   *
   * @throws Error when email, html, title or from is empty
   */
  async handleNewsletter({
    email,
    html,
    postHeader,
    title,
    from,
    unSubHeader,
  }: NewsletterInput): Promise<NewsletterResult> {
    console.log('handleNewsletter', email, postHeader, title, from)

    if (!email || !html || !title || !from) {
      console.log('invalid newsletter email')
      throw new Error('invalid newsletter email')
    }

    // fallback to default url if newsletter url does not exist
    // assign a random uuid to the default url to avoid duplicate url
    const url =
      (await this.parseNewsletterUrl(postHeader, html)) || generateUniqueUrl()
    const author = this.parseAuthor(from)
    const unsubscribe = this.parseUnsubscribe(unSubHeader)

    return {
      email,
      content: html,
      url,
      title,
      author,
      unsubMailTo: unsubscribe.mailTo || '',
      unsubHttpUrl: unsubscribe.httpUrl || '',
    }
  }
}
|
||||||
116
packages/puppeteer-parse/content-handler/src/index.ts
Normal file
116
packages/puppeteer-parse/content-handler/src/index.ts
Normal file
@ -0,0 +1,116 @@
|
|||||||
|
import { AppleNewsHandler } from './websites/apple-news-handler'
|
||||||
|
import { BloombergHandler } from './websites/bloomberg-handler'
|
||||||
|
import { DerstandardHandler } from './websites/derstandard-handler'
|
||||||
|
import { ImageHandler } from './websites/image-handler'
|
||||||
|
import { MediumHandler } from './websites/medium-handler'
|
||||||
|
import { PdfHandler } from './websites/pdf-handler'
|
||||||
|
import { ScrapingBeeHandler } from './websites/scrapingBee-handler'
|
||||||
|
import { TDotCoHandler } from './websites/t-dot-co-handler'
|
||||||
|
import { TwitterHandler } from './websites/twitter-handler'
|
||||||
|
import { YoutubeHandler } from './websites/youtube-handler'
|
||||||
|
import { WikipediaHandler } from './websites/wikipedia-handler'
|
||||||
|
import {
|
||||||
|
ContentHandler,
|
||||||
|
NewsletterInput,
|
||||||
|
NewsletterResult,
|
||||||
|
PreHandleResult,
|
||||||
|
} from './content-handler'
|
||||||
|
import { SubstackHandler } from './newsletters/substack-handler'
|
||||||
|
import { AxiosHandler } from './newsletters/axios-handler'
|
||||||
|
import { GolangHandler } from './newsletters/golang-handler'
|
||||||
|
import { MorningBrewHandler } from './newsletters/morning-brew-handler'
|
||||||
|
import { BloombergNewsletterHandler } from './newsletters/bloomberg-newsletter-handler'
|
||||||
|
import { BeehiivHandler } from './newsletters/beehiiv-handler'
|
||||||
|
import { ConvertkitHandler } from './newsletters/convertkit-handler'
|
||||||
|
import { RevueHandler } from './newsletters/revue-handler'
|
||||||
|
|
||||||
|
/**
 * Validates that a URL is safe to fetch.
 *
 * Returns true for an acceptable URL so the result can be used directly in a
 * condition; every rejection is reported by throwing.
 *
 * @param url absolute URL string to check
 * @returns true when the URL passed every check
 * @throws TypeError when the string is not a parseable URL
 * @throws Error when the protocol is not http(s), or the host is localhost,
 *   a loopback address or a private IPv4 address
 */
const validateUrlString = (url: string): boolean => {
  const u = new URL(url)
  // Make sure the URL is http or https
  if (u.protocol !== 'http:' && u.protocol !== 'https:') {
    throw new Error('Invalid URL protocol check failed')
  }
  // Make sure the domain is not localhost (by name or by loopback address)
  if (
    u.hostname === 'localhost' ||
    u.hostname === '0.0.0.0' ||
    /^127(\.\d{1,3}){3}$/.test(u.hostname)
  ) {
    throw new Error('Invalid URL is localhost')
  }
  // Make sure the domain is not a private IP:
  // 10.0.0.0/8, 172.16.0.0/12 (172.16 - 172.31) and 192.168.0.0/16
  if (/^(10|172\.(1[6-9]|2\d|3[01])|192\.168)\..*/.test(u.hostname)) {
    throw new Error('Invalid URL is private ip')
  }
  return true
}
|
||||||
|
|
||||||
|
// Website handlers. They are consulted in this order and the first handler
// that claims a URL is the one that gets used, so order matters.
const contentHandlers: Array<ContentHandler> = [
  new AppleNewsHandler(),
  new BloombergHandler(),
  new DerstandardHandler(),
  new ImageHandler(),
  new MediumHandler(),
  new PdfHandler(),
  new ScrapingBeeHandler(),
  new TDotCoHandler(),
  new TwitterHandler(),
  new YoutubeHandler(),
  new WikipediaHandler(),
]
|
||||||
|
|
||||||
|
// Newsletter handlers. They are consulted in this order and the first
// handler whose isNewsletter() accepts the email handles it, so each handler
// is registered exactly once.
const newsletterHandlers: ContentHandler[] = [
  new AxiosHandler(),
  new BloombergNewsletterHandler(),
  new GolangHandler(),
  new SubstackHandler(),
  new MorningBrewHandler(),
  new BeehiivHandler(),
  new ConvertkitHandler(),
  new RevueHandler(),
]
|
||||||
|
|
||||||
|
/**
 * Runs the website handlers for a URL before the page is fetched.
 *
 * Step 1: the first handler that wants to resolve the URL gets one attempt
 * to do so; failures are logged and the original URL is kept.
 * Step 2: the first handler that wants to pre-handle the (possibly resolved)
 * URL produces the result.
 *
 * @returns the pre-handle result, or undefined when no handler applies
 */
export const preHandleContent = async (
  url: string,
  dom?: Document
): Promise<PreHandleResult | undefined> => {
  // Before we run the regular handlers we check to see if we need to
  // pre-resolve the URL. TODO: This should probably happen recursively,
  // so URLs can be pre-resolved, handled, pre-resolved, handled, etc.
  const resolver = contentHandlers.find((candidate) =>
    candidate.shouldResolve(url)
  )
  if (resolver) {
    try {
      const resolvedUrl = await resolver.resolve(url)
      if (resolvedUrl && validateUrlString(resolvedUrl)) {
        url = resolvedUrl
      }
    } catch (err) {
      console.log('error resolving url with handler', resolver.name, err)
    }
  }

  // Before we fetch the page we check the handlers, to see if they want
  // to perform a prefetch action that can modify our requests.
  const preHandler = contentHandlers.find((candidate) =>
    candidate.shouldPreHandle(url, dom)
  )
  if (!preHandler) {
    return undefined
  }
  console.log('preHandleContent', preHandler.name, url)
  return preHandler.preHandle(url, dom)
}
|
||||||
|
|
||||||
|
/**
 * Hands an incoming email to the first newsletter handler that recognizes it.
 * Handlers are asked one at a time, in registration order.
 *
 * @returns the handler's result, or undefined when no handler recognizes it
 */
export const handleNewsletter = async (
  input: NewsletterInput
): Promise<NewsletterResult | undefined> => {
  for (const candidate of newsletterHandlers) {
    const recognized = await candidate.isNewsletter(input)
    if (!recognized) {
      continue
    }
    return candidate.handleNewsletter(input)
  }

  return undefined
}
|
||||||
|
|
||||||
|
// CommonJS export of the two entry points, alongside the named exports above.
module.exports = { preHandleContent, handleNewsletter }
|
||||||
@ -0,0 +1,46 @@
|
|||||||
|
import { ContentHandler, PreHandleResult } from '../content-handler'
|
||||||
|
|
||||||
|
/**
 * Handler for Axios newsletters: recognizes the sender, extracts the
 * "View in browser" link and strips header, footer and ad tables.
 */
export class AxiosHandler extends ContentHandler {
  constructor() {
    super()
    this.senderRegex = /<.+@axios.com>/
    this.urlRegex = /View in browser at <a.*>(.*)<\/a>/
    this.name = 'axios'
  }

  /** True when the URL's hostname ends with axios.com. */
  shouldPreHandle(url: string, dom?: Document): boolean {
    const { hostname } = new URL(url)
    return hostname.endsWith(this.name + '.com')
  }

  /**
   * Cleans up the newsletter DOM in place and returns it.
   * The first table with at least 20 characters of text is treated as the
   * content: it is trimmed and turned into a div. Every other nested table
   * (empty, short, or after the content table) is removed.
   */
  async preHandle(url: string, dom: Document): Promise<PreHandleResult> {
    const outerTable = dom.querySelector('table')

    // flips to true once the content table was processed; everything after
    // it is treated as footer
    let contentSeen = false
    outerTable?.querySelectorAll('table').forEach((table) => {
      const text = table.textContent
      if (!text || text.length < 20 || contentSeen) {
        // ads, spacers and the footer
        table.remove()
        return
      }

      // drop the leading rows (the header) and the trailing rows (ads)
      // NOTE(review): the row count is re-read after earlier rows were
      // removed, so the trailing cut-off moves as the loop runs; confirm this
      // is intended before changing it.
      table.querySelectorAll('tr').forEach((row, index) => {
        if (index <= 7 || index >= table.querySelectorAll('tr').length - 2) {
          console.log('removing', row)
          row.remove()
        }
      })

      // swap the table for a div holding the same markup
      const replacement = dom.createElement('div')
      replacement.innerHTML = table.innerHTML
      table.parentNode?.replaceChild(replacement, table)
      contentSeen = true
    })

    return Promise.resolve({ dom })
  }
}
|
||||||
@ -0,0 +1,43 @@
|
|||||||
|
import { ContentHandler } from '../content-handler'
|
||||||
|
import { parseHTML } from 'linkedom'
|
||||||
|
|
||||||
|
/**
 * Handler for beehiiv newsletters, detected from the email HTML rather than
 * from the sender address.
 */
export class BeehiivHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'beehiiv'
  }

  /**
   * Returns the href of the "Read Online" link. When several links carry that
   * exact text the last one decides the result.
   */
  findNewsletterHeaderHref(dom: Document): string | undefined {
    const candidates = Array.from(
      dom.querySelectorAll('table tr td div a[class*="link"]')
    ).filter((anchor) => anchor.textContent === 'Read Online')
    const lastMatch = candidates[candidates.length - 1]
    return lastMatch?.getAttribute('href') || undefined
  }

  /**
   * An email is a beehiiv newsletter when it contains an image served from
   * beehiiv.net and a usable "Read Online" link.
   */
  async isNewsletter(input: {
    postHeader: string
    from: string
    unSubHeader: string
    html: string
  }): Promise<boolean> {
    const dom = parseHTML(input.html).document
    const hasBeehiivImage =
      dom.querySelectorAll('img[src*="beehiiv.net"]').length > 0
    if (!hasBeehiivImage) {
      return false
    }
    return !!this.findNewsletterHeaderHref(dom)
  }

  /** The post header is ignored; the URL is taken from the HTML. */
  async parseNewsletterUrl(
    postHeader: string,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}
|
||||||
@ -0,0 +1,37 @@
|
|||||||
|
import { ContentHandler, PreHandleResult } from '../content-handler'
|
||||||
|
|
||||||
|
/**
 * Handler for Bloomberg newsletters: recognizes the sender, extracts the
 * "view in browser" link and strips header and footer sections.
 */
export class BloombergNewsletterHandler extends ContentHandler {
  constructor() {
    super()
    this.senderRegex = /<.+@mail.bloomberg.*.com>/
    this.urlRegex = /<a class="view-in-browser__url" href=["']([^"']*)["']/
    this.name = 'bloomberg'
  }

  /**
   * True when the URL's hostname ends with bloomberg.com, or when the page
   * has a logo image whose alt text is "bloomberg" (case-insensitive).
   * `dom` is optional, as in the base class; without it only the URL counts.
   */
  shouldPreHandle(url: string, dom?: Document): boolean {
    const host = this.name + '.com'
    // check if url ends with bloomberg.com
    return (
      new URL(url).hostname.endsWith(host) ||
      dom?.querySelector('.logo-image')?.getAttribute('alt')?.toLowerCase() ===
        this.name
    )
  }

  /** Removes header and footer sections from the DOM in place. */
  async preHandle(url: string, dom: Document): Promise<PreHandleResult> {
    const body = dom.querySelector('.wrapper')

    // this removes header
    body?.querySelector('.sailthru-variables')?.remove()
    body?.querySelector('.preview-text')?.remove()
    body?.querySelector('.logo-wrapper')?.remove()
    body?.querySelector('.by-the-number-wrapper')?.remove()
    // this removes footer
    body?.querySelector('.quote-box-wrapper')?.remove()
    body?.querySelector('.header-wrapper')?.remove()
    body?.querySelector('.component-wrapper')?.remove()
    body?.querySelector('.footer')?.remove()

    return Promise.resolve({ dom })
  }
}
|
||||||
@ -0,0 +1,41 @@
|
|||||||
|
import { ContentHandler } from '../content-handler'
|
||||||
|
import { parseHTML } from 'linkedom'
|
||||||
|
|
||||||
|
/**
 * Handler for ConvertKit newsletters, detected from the email HTML rather
 * than from the sender address.
 */
export class ConvertkitHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'convertkit'
  }

  /**
   * Returns the href of the "View this email in your browser" link. When
   * several links carry that exact text the last one decides the result.
   */
  findNewsletterHeaderHref(dom: Document): string | undefined {
    const candidates = Array.from(
      dom.querySelectorAll('table tr td a')
    ).filter(
      (anchor) => anchor.textContent === 'View this email in your browser'
    )
    const lastMatch = candidates[candidates.length - 1]
    return lastMatch?.getAttribute('href') || undefined
  }

  /**
   * An email is a ConvertKit newsletter when it contains an image served
   * from convertkit.com or convertkit-mail.com.
   */
  async isNewsletter(input: {
    postHeader: string
    from: string
    unSubHeader: string
    html: string
  }): Promise<boolean> {
    const images = parseHTML(input.html).document.querySelectorAll(
      'img[src*="convertkit.com"], img[src*="convertkit-mail.com"]'
    )
    return Promise.resolve(images.length > 0)
  }

  /** The post header is ignored; the URL is taken from the HTML. */
  async parseNewsletterUrl(
    postHeader: string,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}
|
||||||
@ -0,0 +1,27 @@
|
|||||||
|
import { ContentHandler, PreHandleResult } from '../content-handler'
|
||||||
|
|
||||||
|
/**
 * Handler for Golang Weekly newsletters: recognizes the sender, extracts the
 * "Read on the Web" link and strips the subscribe bar and masthead.
 */
export class GolangHandler extends ContentHandler {
  constructor() {
    super()
    this.senderRegex = /<.+@golangweekly.com>/
    this.urlRegex = /<a href=["']([^"']*)["'].*>Read on the Web<\/a>/
    this.name = 'golangweekly'
  }

  /** True when the URL's hostname ends with golangweekly.com. */
  shouldPreHandle(url: string, dom?: Document): boolean {
    const { hostname } = new URL(url)
    return hostname.endsWith(this.name + '.com')
  }

  /** Removes the "Subscribe" bar and the title block from the DOM in place. */
  async preHandle(url: string, dom: Document): Promise<PreHandleResult> {
    const page = dom.querySelector('body')
    // first the "Subscribe" button, then the title
    for (const selector of ['.el-splitbar', '.el-masthead']) {
      page?.querySelector(selector)?.remove()
    }

    return Promise.resolve({ dom })
  }
}
|
||||||
@ -0,0 +1,35 @@
|
|||||||
|
import { ContentHandler, PreHandleResult } from '../content-handler'
|
||||||
|
|
||||||
|
/**
 * Handler for Morning Brew newsletters: recognizes the sender, extracts the
 * "View Online" link and simplifies the markets table.
 */
export class MorningBrewHandler extends ContentHandler {
  constructor() {
    super()
    this.senderRegex = /Morning Brew <crew@morningbrew.com>/
    this.urlRegex = /<a.* href=["']([^"']*)["'].*>View Online<\/a>/
    this.name = 'morningbrew'
  }

  /** True when the URL's hostname ends with morningbrew.com. */
  shouldPreHandle(url: string, dom?: Document): boolean {
    const { hostname } = new URL(url)
    return hostname.endsWith(this.name + '.com')
  }

  /**
   * Rewrites the market-info tables in place: the nested "bubble" table is
   * replaced by its text element and the enclosing table gets a dedicated
   * class so the cell widths can be retained.
   */
  async preHandle(url: string, dom: Document): Promise<PreHandleResult> {
    const arrowCells = Array.from(dom.querySelectorAll('.markets-arrow-cell'))
    for (const cell of arrowCells) {
      const marketsTable = cell.closest('table')
      if (!marketsTable) {
        continue
      }

      // replace the nested table with the text
      const bubble = marketsTable.querySelector('.markets-bubble')
      const bubbleText = bubble?.querySelector('.markets-table-text')
      if (bubble && bubbleText) {
        bubble.parentNode?.replaceChild(bubbleText, bubble)
      }

      // set custom class for the table
      marketsTable.className = 'morning-brew-markets'
    }

    return Promise.resolve({ dom })
  }
}
|
||||||
@ -0,0 +1,46 @@
|
|||||||
|
import { ContentHandler } from '../content-handler'
|
||||||
|
import { parseHTML } from 'linkedom'
|
||||||
|
|
||||||
|
/**
 * Handler for Revue newsletters, detected from the email HTML rather than
 * from the sender address.
 */
export class RevueHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'revue'
  }

  /**
   * Returns the href of the "View online" link. When several links carry
   * that exact text the last one decides the result.
   */
  findNewsletterHeaderHref(dom: Document): string | undefined {
    const candidates = Array.from(
      dom.querySelectorAll('table tr td a[target="_blank"]')
    ).filter((anchor) => anchor.textContent === 'View online')
    const lastMatch = candidates[candidates.length - 1]
    return lastMatch?.getAttribute('href') || undefined
  }

  /**
   * An email is a Revue newsletter when it contains an image served from
   * getrevue.co or revue.email and a usable "View online" link.
   */
  async isNewsletter(input: {
    postHeader: string
    from: string
    unSubHeader: string
    html: string
  }): Promise<boolean> {
    const dom = parseHTML(input.html).document
    const revueImages = dom.querySelectorAll(
      'img[src*="getrevue.co"], img[src*="revue.email"]'
    )
    if (revueImages.length === 0) {
      return false
    }
    return !!this.findNewsletterHeaderHref(dom)
  }

  /** The post header is ignored; the URL is taken from the HTML. */
  async parseNewsletterUrl(
    postHeader: string,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}
|
||||||
@ -0,0 +1,90 @@
|
|||||||
|
import addressparser from 'addressparser'
|
||||||
|
import { ContentHandler, PreHandleResult } from '../content-handler'
|
||||||
|
import { parseHTML } from 'linkedom'
|
||||||
|
|
||||||
|
/**
 * Handler for Substack newsletters: detects them from the post header or the
 * email markup, finds the post URL and strips header/footer sections.
 */
export class SubstackHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'substack'
  }

  /**
   * True when the URL's hostname ends with substack.com, or when the first
   * image in the email body is hosted at substack.com.
   * `dom` is optional, as in the base class; without it only the URL counts.
   */
  shouldPreHandle(url: string, dom?: Document): boolean {
    const host = this.name + '.com'
    // check if url ends with substack.com
    // or has a profile image hosted at substack.com
    return (
      new URL(url).hostname.endsWith(host) ||
      !!dom
        ?.querySelector('.email-body img')
        ?.getAttribute('src')
        ?.includes(host)
    )
  }

  /**
   * Removes header, avatar, meta button and footer from the DOM in place.
   *
   * @returns a result object carrying the cleaned DOM in its `dom` field
   */
  async preHandle(url: string, dom: Document): Promise<PreHandleResult> {
    const body = dom.querySelector('.email-body-container')

    // this removes header and profile avatar
    body?.querySelector('.header')?.remove()
    body?.querySelector('.preamble')?.remove()
    body?.querySelector('.meta-author-wrap')?.remove()
    // this removes meta button
    body?.querySelector('.post-meta')?.remove()
    // this removes footer
    body?.querySelector('.post-cta')?.remove()
    body?.querySelector('.container-border')?.remove()
    body?.querySelector('.footer')?.remove()

    // wrap the document: callers read the `dom` field of the result
    return Promise.resolve({ dom })
  }

  /** Returns the href of the first link inside an h1, if any. */
  findNewsletterHeaderHref(dom: Document): string | undefined {
    // Substack header links
    const postLink = dom.querySelector('h1 a ')
    if (postLink) {
      return postLink.getAttribute('href') || undefined
    }

    return undefined
  }

  /**
   * An email is a Substack newsletter when it has a post header, a table
   * whose class ends with "post-meta", or a header link together with one of
   * the Substack icons.
   */
  async isNewsletter({
    postHeader,
    html,
  }: {
    postHeader: string
    from: string
    unSubHeader: string
    html: string
  }): Promise<boolean> {
    if (postHeader) {
      return true
    }
    const dom = parseHTML(html).document
    // substack newsletter emails have tables with a *post-meta class
    if (dom.querySelector('table[class$="post-meta"]')) {
      return true
    }
    // If the article has a header link, and substack icons its probably a newsletter
    const href = this.findNewsletterHeaderHref(dom)
    const heartIcon = dom.querySelector(
      'table tbody td span a img[src*="HeartIcon"]'
    )
    const recommendIcon = dom.querySelector(
      'table tbody td span a img[src*="RecommendIconRounded"]'
    )
    return !!(href && (heartIcon || recommendIcon))
  }

  /**
   * Takes the URL from the post header when it can be parsed, otherwise
   * looks for it in the HTML.
   */
  async parseNewsletterUrl(
    postHeader: string,
    html: string
  ): Promise<string | undefined> {
    // raw SubStack newsletter url is like <https://hongbo130.substack.com/p/tldr>
    // we need to get the real url from the raw url
    if (postHeader) {
      // parse once and reuse the result
      const parsed = addressparser(postHeader)
      if (parsed.length > 0) {
        return Promise.resolve(parsed[0].name)
      }
    }
    return this.findNewsletterUrl(html)
  }
}
|
||||||
@ -0,0 +1,31 @@
|
|||||||
|
import axios from 'axios'
|
||||||
|
import { parseHTML } from 'linkedom'
|
||||||
|
import { ContentHandler, PreHandleResult } from '../content-handler'
|
||||||
|
|
||||||
|
/**
 * Handler for apple.news links: fetches the Apple News page and extracts the
 * URL of the original article from it.
 */
export class AppleNewsHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Apple News'
  }

  /** True when the URL's hostname is exactly apple.news. */
  shouldPreHandle(url: string, dom?: Document): boolean {
    return new URL(url).hostname === 'apple.news'
  }

  /**
   * Downloads the page and returns the href of the element wrapping the
   * "click here" span. The passed-in document is not used.
   *
   * @returns `{ url }` with the article URL, or `{ url: undefined }` when the
   *   link is missing
   */
  async preHandle(url: string, document?: Document): Promise<PreHandleResult> {
    const userAgent =
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
    const { data } = await axios.get(url, {
      headers: { 'User-Agent': userAgent },
    })
    const page = parseHTML(data as string).document
    const clickHere = page.querySelector('span.click-here')
    const target = clickHere?.parentElement?.getAttribute('href')
    // make sure it's a valid URL by wrapping in new URL
    return { url: target ? new URL(target).href : undefined }
  }
}
|
||||||
@ -0,0 +1,41 @@
|
|||||||
|
import axios from 'axios'
|
||||||
|
import { parseHTML } from 'linkedom'
|
||||||
|
import { ContentHandler, PreHandleResult } from '../content-handler'
|
||||||
|
|
||||||
|
/**
 * Pre-handler for Bloomberg pages. The page source is fetched through the
 * ScrapingBee API rather than requested directly.
 */
export class BloombergHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Bloomberg'
  }

  /** Matches http(s) URLs on a `bloomberg.*` host, with optional `www.`. */
  shouldPreHandle(url: string, dom?: Document): boolean {
    const bloombergPattern =
      /https?:\/\/(www\.)?bloomberg.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&/=]*)/
    return bloombergPattern.test(url.toString())
  }

  /**
   * Requests the page source from ScrapingBee (API key read from
   * `SCRAPINGBEE_API_KEY`) and returns the document title together with the
   * inner HTML of `<body>`.
   *
   * Failures are logged and rethrown unchanged.
   */
  async preHandle(url: string, document?: Document): Promise<PreHandleResult> {
    console.log('prehandling bloomberg url', url)

    const params = {
      api_key: process.env.SCRAPINGBEE_API_KEY,
      url,
      return_page_source: true,
      block_ads: true,
      block_resources: false,
    }

    try {
      const response = await axios.get('https://app.scrapingbee.com/api/v1', {
        params,
      })
      const page = parseHTML(response.data).document
      const body = page.querySelector('body')

      return { title: page.title, content: body?.innerHTML, url }
    } catch (error) {
      console.error('error prehandling bloomberg url', error)
      throw error
    }
  }
}
|
||||||
@ -0,0 +1,34 @@
|
|||||||
|
import { ContentHandler, PreHandleResult } from '../content-handler'
|
||||||
|
import axios from 'axios'
|
||||||
|
import { parseHTML } from 'linkedom'
|
||||||
|
|
||||||
|
/**
 * Pre-handler for `www.derstandard.at`: requests the article with a consent
 * cookie and lifts the headline out of the body.
 */
export class DerstandardHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Derstandard'
  }

  /** Only the exact hostname `www.derstandard.at` is handled. */
  shouldPreHandle(url: string, dom?: Document): boolean {
    return new URL(url).hostname === 'www.derstandard.at'
  }

  /**
   * Downloads the article, removes the `.article-title` element from the
   * document and returns its text as the title next to the remaining body
   * markup.
   */
  async preHandle(url: string, document?: Document): Promise<PreHandleResult> {
    // the cookie gives consent up front so that the article itself is served
    const consentHeaders = {
      cookie:
        'DSGVO_ZUSAGE_V1=true; consentUUID=2bacb9c1-1e80-4be0-9f7b-ee987cf4e7b0_6',
    }
    const html = (await axios.get(url, { headers: consentHeaders }))
      .data as string

    const page = parseHTML(html).document
    const heading = page.querySelector('.article-title')
    // detach the headline so it is not repeated inside the content
    heading?.remove()

    return {
      title: heading?.textContent || undefined,
      content: page.body.outerHTML,
    }
  }
}
|
||||||
@ -0,0 +1,32 @@
|
|||||||
|
import { ContentHandler, PreHandleResult } from '../content-handler'
|
||||||
|
|
||||||
|
/**
 * Pre-handler for direct image links (jpg, jpeg, png, webp): instead of
 * fetching anything it synthesizes a minimal HTML page that embeds the image.
 */
export class ImageHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Image'
  }

  /**
   * Escapes the characters that are significant in HTML text and in quoted
   * attribute values, so caller-supplied strings cannot break out of the
   * generated markup.
   */
  private static escapeHtml(value: string): string {
    return value
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#x27;')
  }

  /** True when the URL contains an http(s) link to a supported image type. */
  shouldPreHandle(url: string, dom?: Document): boolean {
    const IMAGE_URL_PATTERN = /(https?:\/\/.*\.(?:jpg|jpeg|png|webp))/i
    return IMAGE_URL_PATTERN.test(url.toString())
  }

  /**
   * Builds the wrapper page for the image.
   *
   * @param url image URL supplied by the caller
   * @returns `title` (the last `/`-separated segment of the URL, or `'Image'`
   *   when that segment is empty) and `content` (the generated HTML). The
   *   returned title is left unescaped; only its copies inside the HTML are
   *   escaped.
   */
  async preHandle(url: string, document?: Document): Promise<PreHandleResult> {
    const title = url.toString().split('/').pop() || 'Image'

    // the URL is untrusted input: escape it before placing it in the markup
    const safeTitle = ImageHandler.escapeHtml(title)
    const safeUrl = ImageHandler.escapeHtml(url.toString())

    const content = `
      <html>
        <head>
          <title>${safeTitle}</title>
          <meta property="og:image" content="${safeUrl}" />
          <meta property="og:title" content="${safeTitle}" />
        </head>
        <body>
          <div>
            <img src="${safeUrl}" alt="${safeTitle}">
          </div>
        </body>
      </html>`

    return Promise.resolve({ title, content })
  }
}
|
||||||
@ -0,0 +1,26 @@
|
|||||||
|
import { ContentHandler, PreHandleResult } from '../content-handler'
|
||||||
|
|
||||||
|
/**
 * Pre-handler for Medium links: strips the `source` query parameter from the
 * URL before the page is fetched.
 */
export class MediumHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Medium'
  }

  /** Handles every URL whose hostname ends with `medium.com`. */
  shouldPreHandle(url: string, dom?: Document): boolean {
    const { hostname } = new URL(url)
    return hostname.endsWith('medium.com')
  }

  /**
   * Returns the URL with its `source` search parameter removed.
   * A URL that cannot be parsed is logged and the error rethrown.
   */
  async preHandle(url: string, document?: Document): Promise<PreHandleResult> {
    console.log('prehandling medium url', url)

    try {
      const cleaned = new URL(url)
      cleaned.searchParams.delete('source')
      return { url: cleaned.toString() }
    } catch (error) {
      console.error('error prehandling medium url', error)
      throw error
    }
  }
}
|
||||||
@ -0,0 +1,18 @@
|
|||||||
|
import { ContentHandler, PreHandleResult } from '../content-handler'
|
||||||
|
|
||||||
|
/**
 * Pre-handler for links to PDF files: it only marks the content type so the
 * caller can treat the URL as a PDF.
 */
export class PdfHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'PDF'
  }

  /**
   * True when the path component of the URL ends with `.pdf`.
   * `URL.pathname` never includes the query string, so a query after the
   * file name does not affect the check.
   */
  shouldPreHandle(url: string, dom?: Document): boolean {
    const { pathname } = new URL(url)
    return pathname.endsWith('.pdf')
  }

  /** Always reports the `application/pdf` content type; nothing is fetched. */
  async preHandle(_url: string, document?: Document): Promise<PreHandleResult> {
    const result: PreHandleResult = { contentType: 'application/pdf' }
    return result
  }
}
|
||||||
@ -0,0 +1,38 @@
|
|||||||
|
import { ContentHandler, PreHandleResult } from '../content-handler'
|
||||||
|
import axios from 'axios'
|
||||||
|
import { parseHTML } from 'linkedom'
|
||||||
|
|
||||||
|
/**
 * Pre-handler that downloads pages through the ScrapingBee API for a fixed
 * list of hosts.
 */
export class ScrapingBeeHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'ScrapingBee'
  }

  /** True when the hostname ends with one of the listed host suffixes. */
  shouldPreHandle(url: string, dom?: Document): boolean {
    const { hostname } = new URL(url)
    const suffixes = ['nytimes.com', 'news.google.com']

    return suffixes.some((suffix) => hostname.endsWith(suffix))
  }

  /**
   * Requests the page source from ScrapingBee (API key read from
   * `SCRAPINGBEE_API_KEY`). The response body is returned unmodified as
   * `content`; it is parsed only to read the document title.
   *
   * Failures are logged and rethrown unchanged.
   */
  async preHandle(url: string, document?: Document): Promise<PreHandleResult> {
    console.log('prehandling url with scrapingbee', url)

    const params = {
      api_key: process.env.SCRAPINGBEE_API_KEY,
      url,
      return_page_source: true,
      block_ads: true,
      block_resources: false,
    }

    try {
      const response = await axios.get('https://app.scrapingbee.com/api/v1', {
        params,
      })
      const { title } = parseHTML(response.data).document

      return { title, content: response.data as string, url }
    } catch (error) {
      console.error('error prehandling url w/scrapingbee', error)
      throw error
    }
  }
}
|
||||||
@ -0,0 +1,26 @@
|
|||||||
|
import { ContentHandler } from '../content-handler'
|
||||||
|
import axios from 'axios'
|
||||||
|
|
||||||
|
/**
 * Resolver for `t.co` short links: reads the redirect target from the
 * `Location` header without following the redirect.
 */
export class TDotCoHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 't.co'
  }

  /** True for `https://t.co/...` URLs, with an optional `www.` prefix. */
  shouldResolve(url: string): boolean {
    const shortLinkPattern = /^https:\/\/(?:www\.)?t\.co\/.*$/
    return shortLinkPattern.test(url)
  }

  /**
   * Requests the short link with redirects disabled and every status code
   * accepted, then returns the normalized `Location` header.
   *
   * @returns the target URL, or `undefined` when the request fails or the
   *   header does not hold a valid URL (the error is logged)
   */
  async resolve(url: string) {
    try {
      const res = await axios.get(url, {
        maxRedirects: 0,
        validateStatus: null,
      })
      return new URL(res.headers.location).href
    } catch (err) {
      console.log('err with t.co url', err)
      return undefined
    }
  }
}
|
||||||
@ -0,0 +1,167 @@
|
|||||||
|
import { ContentHandler, PreHandleResult } from '../content-handler'
|
||||||
|
import axios from 'axios'
|
||||||
|
import { DateTime } from 'luxon'
|
||||||
|
import _ from 'underscore'
|
||||||
|
|
||||||
|
const TWITTER_BEARER_TOKEN = process.env.TWITTER_BEARER_TOKEN
|
||||||
|
const TWITTER_URL_MATCH =
|
||||||
|
/twitter\.com\/(?:#!\/)?(\w+)\/status(?:es)?\/(\d+)(?:\/.*)?/
|
||||||
|
|
||||||
|
/**
 * Builds the query-string fragment that lists the tweet, expansion, user and
 * media fields to request. Every group, including the first, is prefixed
 * with `&`.
 */
const getTweetFields = () => {
  const tweetFields = [
    'attachments',
    'author_id',
    'conversation_id',
    'created_at',
    'entities',
    'geo',
    'in_reply_to_user_id',
    'lang',
    'possibly_sensitive',
    'public_metrics',
    'referenced_tweets',
    'source',
    'withheld',
  ]
  const expansions = ['author_id', 'attachments.media_keys']
  const userFields = [
    'created_at',
    'description',
    'entities',
    'location',
    'pinned_tweet_id',
    'profile_image_url',
    'protected',
    'public_metrics',
    'url',
    'verified',
    'withheld',
  ]
  const mediaFields = [
    'duration_ms',
    'height',
    'preview_image_url',
    'url',
    'media_key',
    'public_metrics',
    'width',
  ]

  return (
    '&tweet.fields=' +
    tweetFields.join(',') +
    '&expansions=' +
    expansions.join(',') +
    '&user.fields=' +
    userFields.join(',') +
    '&media.fields=' +
    mediaFields.join(',')
  )
}
|
||||||
|
|
||||||
|
/**
 * Fetches a single tweet from the Twitter v2 API with all fields from
 * `getTweetFields` requested.
 *
 * @param id tweet id
 * @returns the axios response of the API call
 * @throws Error when `TWITTER_BEARER_TOKEN` is not set
 */
const getTweetById = async (id: string) => {
  const endpoint = `https://api.twitter.com/2/tweets/${id}?${getTweetFields()}`
  const apiUrl = new URL(endpoint)

  if (!TWITTER_BEARER_TOKEN) {
    throw new Error('No Twitter bearer token found')
  }

  const headers = {
    Authorization: `Bearer ${TWITTER_BEARER_TOKEN}`,
    // NOTE(review): this is sent as an HTTP header named `redirect`; it looks
    // like a fetch() option that ended up in the headers - confirm it is needed
    redirect: 'follow',
  }

  return axios.get(apiUrl.toString(), { headers })
}
|
||||||
|
|
||||||
|
/** Page title for a tweet: the author's display name followed by " on Twitter". */
const titleForAuthor = (author: { name: string }) =>
  `${author.name} on Twitter`
|
||||||
|
|
||||||
|
/**
 * Extracts the numeric tweet id from a status URL.
 *
 * @returns the second capture group of `TWITTER_URL_MATCH`, or `undefined`
 *   when the URL does not match
 */
const tweetIdFromStatusUrl = (url: string): string | undefined => {
  const groups = TWITTER_URL_MATCH.exec(url.toString())
  if (groups === null) {
    return undefined
  }
  return groups[2]
}
|
||||||
|
|
||||||
|
/**
 * Formats a timestamp string for display using luxon's `DATETIME_FULL`
 * preset. The string is parsed with the built-in `Date` constructor first.
 */
const formatTimestamp = (timestamp: string) => {
  const parsed = new Date(timestamp)
  const dateTime = DateTime.fromJSDate(parsed)
  return dateTime.toLocaleString(DateTime.DATETIME_FULL)
}
|
||||||
|
|
||||||
|
/**
 * Pre-handler for tweet status URLs: loads the tweet through the Twitter API
 * and renders it as a small standalone HTML document.
 */
export class TwitterHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Twitter'
  }

  /** Requires a configured bearer token and a URL matching `TWITTER_URL_MATCH`. */
  shouldPreHandle(url: string, dom?: Document): boolean {
    return !!TWITTER_BEARER_TOKEN && TWITTER_URL_MATCH.test(url.toString())
  }

  /**
   * Fetches the tweet referenced by `url` and builds HTML for it: the tweet
   * text with its short links expanded, attached media, and an attribution
   * line linking to the author and the tweet.
   *
   * @param url tweet status URL
   * @returns the generated `content`, the original `url`, and an
   *   HTML-escaped `title`
   * @throws Error when no tweet id can be extracted from the URL or the API
   *   response contains no user; errors from the API request propagate
   */
  async preHandle(url: string, document?: Document): Promise<PreHandleResult> {
    console.log('prehandling twitter url', url)

    const tweetId = tweetIdFromStatusUrl(url)
    if (!tweetId) {
      throw new Error('could not find tweet id in url')
    }
    const tweetData = (await getTweetById(tweetId)).data as {
      data: {
        author_id: string
        text: string
        entities?: {
          urls?: Array<{
            url: string
            expanded_url: string
            display_url: string
          }>
        }
        created_at: string
      }
      includes: {
        users: Array<{
          id: string
          name: string
          profile_image_url: string
          username: string
        }>
        media?: Array<{
          preview_image_url: string
          type: string
          url: string
        }>
      }
    }

    const authorId = tweetData.data.author_id
    // Compare ids instead of assigning them. The previous `u.id = authorId`
    // overwrote every user's id and always selected the first user; the
    // first user is kept as a fallback so existing results do not regress.
    const author =
      tweetData.includes.users.find((u) => u.id === authorId) ??
      tweetData.includes.users[0]
    if (!author) {
      throw new Error('could not find tweet author in response')
    }

    // escape html entities in title
    const title = _.escape(titleForAuthor(author))
    const authorImage = author.profile_image_url.replace('_normal', '_400x400')

    let text = tweetData.data.text
    for (const urlObj of tweetData.data.entities?.urls ?? []) {
      const anchor = `<a href="${_.escape(urlObj.expanded_url)}">${_.escape(
        urlObj.display_url
      )}</a>`
      // a replacer function keeps `$` sequences inside URLs from being
      // interpreted as replacement patterns
      text = text.replace(urlObj.url, () => anchor)
    }

    const front = `
      <div>
        <p>${text}</p>
    `

    // photos link to themselves; other media link back to the tweet and
    // show their preview image
    const includesHtml = (tweetData.includes.media ?? [])
      .map((m) => {
        const isPhoto = m.type === 'photo'
        const linkUrl = isPhoto ? m.url : url
        const previewUrl = isPhoto ? m.url : m.preview_image_url
        // attribute values are quoted and escaped so URLs containing spaces,
        // quotes or `>` cannot break the markup
        return `<a class="media-link" href="${_.escape(linkUrl)}">
          <picture>
            <img class="tweet-img" src="${_.escape(previewUrl)}" />
          </picture>
        </a>`
      })
      .join('\n')

    const back = `
        — <a href="https://twitter.com/${_.escape(author.username)}">${_.escape(
      author.username
    )}</a> ${_.escape(author.name)} <a href="${_.escape(
      url
    )}">${formatTimestamp(tweetData.data.created_at)}</a>
      </div>
    `
    const content = `
      <head>
        <meta property="og:image" content="${_.escape(authorImage)}" />
        <meta property="og:image:secure_url" content="${_.escape(
          authorImage
        )}" />
        <meta property="og:title" content="${title}" />
        <meta property="og:description" content="${_.escape(
          tweetData.data.text
        )}" />
      </head>
      <body>
        ${front}
        ${includesHtml}
        ${back}
      </body>`

    return { content, url, title }
  }
}
|
||||||
@ -0,0 +1,20 @@
|
|||||||
|
import { ContentHandler, PreHandleResult } from '../content-handler'
|
||||||
|
|
||||||
|
/**
 * Pre-handler for Wikipedia pages: trims page chrome out of the DOM that was
 * already loaded by the caller.
 */
export class WikipediaHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'wikipedia'
  }

  /** Handles every URL whose hostname ends with `wikipedia.org`. */
  shouldPreHandle(url: string, dom?: Document): boolean {
    const { hostname } = new URL(url)
    return hostname.endsWith('wikipedia.org')
  }

  /**
   * Mutates `dom` in place and hands the same document back.
   *
   * @param dom parsed page; required here because nothing is fetched
   */
  async preHandle(url: string, dom: Document): Promise<PreHandleResult> {
    // drop the [edit] anchors next to section headings
    const editLinks = Array.from(dom.querySelectorAll('.mw-editsection'))
    for (const link of editLinks) {
      link.remove()
    }

    // drop the sidebar (only the first `.infobox` element is removed)
    const infobox = dom.querySelector('.infobox')
    infobox?.remove()

    return { dom }
  }
}
|
||||||
@ -0,0 +1,76 @@
|
|||||||
|
import { ContentHandler, PreHandleResult } from '../content-handler'
|
||||||
|
import axios from 'axios'
|
||||||
|
import _ from 'underscore'
|
||||||
|
|
||||||
|
// Matches youtube.com / youtu.be links; capture group 5 holds the video id
// when it is part of the path (short links, /embed/, /v/).
const YOUTUBE_URL_MATCH =
  /^((?:https?:)?\/\/)?((?:www|m)\.)?((?:youtube\.com|youtu.be))(\/(?:[\w-]+\?v=|embed\/|v\/)?)([\w-]+)(\S+)?$/

/**
 * Extracts the video id from a YouTube URL.
 *
 * The `v` query parameter wins when it is present and non-empty; otherwise
 * the id is taken from the path via `YOUTUBE_URL_MATCH`.
 *
 * @param url absolute URL (`new URL` throws on anything else)
 * @returns the video id, or `undefined` when none can be found
 */
export const getYoutubeVideoId = (url: string) => {
  const fromQuery = new URL(url).searchParams.get('v')
  if (fromQuery) {
    return fromQuery
  }

  const groups = YOUTUBE_URL_MATCH.exec(url.toString())
  if (groups !== null && groups.length >= 6 && groups[5]) {
    return groups[5]
  }
  return undefined
}
|
||||||
|
|
||||||
|
/**
 * Pre-handler for YouTube links: looks the video up through YouTube's oEmbed
 * endpoint and renders a page that embeds the player.
 */
export class YoutubeHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Youtube'
  }

  /** True when the URL matches `YOUTUBE_URL_MATCH`. */
  shouldPreHandle(url: string, dom?: Document): boolean {
    return YOUTUBE_URL_MATCH.test(url.toString())
  }

  /**
   * Builds an HTML page with an embedded player, a link back to the video
   * and the author attribution.
   *
   * @param url YouTube URL supplied by the caller
   * @returns `{}` when no video id can be extracted; otherwise the generated
   *   `content` with the fixed title `'Youtube Content'`. Errors from the
   *   oEmbed request propagate.
   */
  async preHandle(url: string, document?: Document): Promise<PreHandleResult> {
    const videoId = getYoutubeVideoId(url)
    if (!videoId) {
      return {}
    }

    const oembedUrl =
      `https://www.youtube.com/oembed?format=json&url=` +
      encodeURIComponent(`https://www.youtube.com/watch?v=${videoId}`)
    const oembed = (await axios.get(oembedUrl)).data as {
      title: string
      width: number
      height: number
      thumbnail_url: string
      author_name: string
      author_url: string
    }

    // Every value interpolated into the markup is escaped: `url` comes from
    // the caller and the rest from a remote response.
    const title = _.escape(oembed.title)
    const authorName = _.escape(oembed.author_name)
    const thumbnail = _.escape(oembed.thumbnail_url)
    const authorUrl = _.escape(oembed.author_url)
    const pageUrl = _.escape(url)
    const embedId = encodeURIComponent(videoId)

    // Scale the player to a fixed height. Fall back to 16:9 when the oEmbed
    // dimensions are missing or zero, which would otherwise give NaN/Infinity.
    const height = 350
    const ratio = oembed.width / oembed.height
    const width =
      Number.isFinite(ratio) && ratio > 0 ? height * ratio : (height * 16) / 9

    const content = `
      <html>
        <head><title>${title}</title>
          <meta property="og:image" content="${thumbnail}" />
          <meta property="og:image:secure_url" content="${thumbnail}" />
          <meta property="og:title" content="${title}" />
          <meta property="og:description" content="" />
          <meta property="og:article:author" content="${authorName}" />
        </head>
        <body>
          <iframe width="${width}" height="${height}" src="https://www.youtube.com/embed/${embedId}" title="${title}" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
          <p><a href="${pageUrl}" target="_blank">${title}</a></p>
          <p itemscope="" itemprop="author" itemtype="http://schema.org/Person">By <a href="${authorUrl}" target="_blank">${authorName}</a></p>
        </body>
      </html>`

    console.log('got video id', videoId)

    // NOTE(review): the fixed title is kept from the original code even
    // though the escaped video title is available - confirm this is intended
    return { content, title: 'Youtube Content' }
  }
}
|
||||||
@ -0,0 +1,10 @@
|
|||||||
|
import { AppleNewsHandler } from '../src/websites/apple-news-handler'
|
||||||
|
|
||||||
|
// Smoke test: calls the Apple News pre-handler with a real share link and
// logs the result. It makes no assertion beyond the call not rejecting.
describe('open a simple web page', () => {
  it('should return a response', async () => {
    const handler = new AppleNewsHandler()
    const shareLink = 'https://apple.news/AxjzaZaPvSn23b67LhXI5EQ'

    const response = await handler.preHandle(shareLink)

    console.log('response', response)
  })
})
|
||||||
@ -0,0 +1,3 @@
|
|||||||
|
// Installs the Babel require hook so that TypeScript and JSX sources can be
// loaded directly.
const { default: register } = require('@babel/register')

const extensions = ['.ts', '.tsx', '.js', '.jsx']
register({ extensions })
|
||||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
191
packages/puppeteer-parse/content-handler/test/newsletter.test.ts
Normal file
191
packages/puppeteer-parse/content-handler/test/newsletter.test.ts
Normal file
@ -0,0 +1,191 @@
|
|||||||
|
import 'mocha'
|
||||||
|
import * as chai from 'chai'
|
||||||
|
import { expect } from 'chai'
|
||||||
|
import chaiAsPromised from 'chai-as-promised'
|
||||||
|
import chaiString from 'chai-string'
|
||||||
|
import { SubstackHandler } from '../src/newsletters/substack-handler'
|
||||||
|
import { AxiosHandler } from '../src/newsletters/axios-handler'
|
||||||
|
import { BloombergNewsletterHandler } from '../src/newsletters/bloomberg-newsletter-handler'
|
||||||
|
import { GolangHandler } from '../src/newsletters/golang-handler'
|
||||||
|
import { MorningBrewHandler } from '../src/newsletters/morning-brew-handler'
|
||||||
|
import nock from 'nock'
|
||||||
|
import { generateUniqueUrl } from '../src/content-handler'
|
||||||
|
import fs from 'fs'
|
||||||
|
import { BeehiivHandler } from '../src/newsletters/beehiiv-handler'
|
||||||
|
|
||||||
|
chai.use(chaiAsPromised)
|
||||||
|
chai.use(chaiString)
|
||||||
|
|
||||||
|
/** Reads a fixture file synchronously and returns its contents as UTF-8 text. */
const load = (path: string): string =>
  fs.readFileSync(path, { encoding: 'utf8' })
|
||||||
|
|
||||||
|
describe('Newsletter email test', () => {
|
||||||
|
// parseNewsletterUrl(postHeader, html): one case per newsletter provider.
describe('#getNewsletterUrl()', () => {
  it('returns url when email is from SubStack', async () => {
    // Substack is the only case here that resolves from the post header
    const postHeader = '<https://hongbo130.substack.com/p/tldr>'
    const parsed = new SubstackHandler().parseNewsletterUrl(postHeader, '')

    await expect(parsed).to.eventually.equal(
      'https://hongbo130.substack.com/p/tldr'
    )
  })

  it('returns url when email is from Axios', async () => {
    const url = 'https://axios.com/blog/the-best-way-to-build-a-web-app'
    const html = `View in browser at <a>${url}</a>`
    const parsed = new AxiosHandler().parseNewsletterUrl('', html)

    await expect(parsed).to.eventually.equal(url)
  })

  it('returns url when email is from Bloomberg', async () => {
    const url = 'https://www.bloomberg.com/news/google-is-now-a-partner'
    const html = `
      <a class="view-in-browser__url" href="${url}">
        View in browser
      </a>
    `
    const parsed = new BloombergNewsletterHandler().parseNewsletterUrl(
      '',
      html
    )

    await expect(parsed).to.eventually.equal(url)
  })

  it('returns url when email is from Golang Weekly', async () => {
    const url = 'https://www.golangweekly.com/first'
    const html = `
      <a href="${url}" style="text-decoration: none">Read on the Web</a>
    `
    const parsed = new GolangHandler().parseNewsletterUrl('', html)

    await expect(parsed).to.eventually.equal(url)
  })

  it('returns url when email is from Morning Brew', async () => {
    const url = 'https://www.morningbrew.com/daily/issues/first'
    const html = `
      <a style="color: #000000; text-decoration: none;" target="_blank" rel="noopener" href="${url}">View Online</a>
    `
    const parsed = new MorningBrewHandler().parseNewsletterUrl('', html)

    await expect(parsed).to.eventually.equal(url)
  })
})
|
||||||
|
|
||||||
|
// parseAuthor(from): the display name part of the From header is the author.
describe('get author from email address', () => {
  it('returns author when email is from Substack', () => {
    const from = 'Jackson Harper from Omnivore App <jacksonh@substack.com>'
    // Use the Substack handler here: the previous version instantiated
    // AxiosHandler, so this case never exercised SubstackHandler.
    expect(new SubstackHandler().parseAuthor(from)).to.equal(
      'Jackson Harper from Omnivore App'
    )
  })

  it('returns author when email is from Axios', () => {
    const from = 'Mike Allen <mike@axios.com>'
    expect(new AxiosHandler().parseAuthor(from)).to.equal('Mike Allen')
  })
})
|
||||||
|
|
||||||
|
// isNewsletter() on saved email fixtures; only the HTML body is populated.
describe('isProbablyNewsletter', () => {
  // Builds the handler input for a fixture file, leaving all headers empty.
  const inputFor = (fixturePath: string) => ({
    html: load(fixturePath),
    postHeader: '',
    from: '',
    unSubHeader: '',
  })

  it('returns true for substack newsletter', async () => {
    const input = inputFor('./test/data/substack-forwarded-newsletter.html')
    await expect(new SubstackHandler().isNewsletter(input)).to.eventually.be
      .true
  })

  it('returns true for private forwarded substack newsletter', async () => {
    const input = inputFor(
      './test/data/substack-private-forwarded-newsletter.html'
    )
    await expect(new SubstackHandler().isNewsletter(input)).to.eventually.be
      .true
  })

  it('returns false for substack welcome email', async () => {
    const input = inputFor('./test/data/substack-forwarded-welcome-email.html')
    await expect(new SubstackHandler().isNewsletter(input)).to.eventually.be
      .false
  })

  it('returns true for beehiiv.com newsletter', async () => {
    const input = inputFor('./test/data/beehiiv-newsletter.html')
    await expect(new BeehiivHandler().isNewsletter(input)).to.eventually.be
      .true
  })
})
|
||||||
|
|
||||||
|
// findNewsletterUrl(html): follows the tracking link found in the email and
// expects the redirect target. All HTTP traffic is stubbed with nock.
//
// The suite callback is synchronous on purpose: Mocha registers tests while
// the `describe` callback runs and does not wait for a promise returned from
// it, so the callback must not be `async`.
describe('findNewsletterUrl', () => {
  it('gets the URL from the header if it is a substack newsletter', async () => {
    nock('https://email.mg2.substack.com')
      .head(
        '/c/eJxNkk2TojAQhn-N3KTyQfg4cGDGchdnYcsZx9K5UCE0EMVAkTiKv36iHnarupNUd7rfVJ4W3EDTj1M89No496Uw0wCxgovuwBgYnbOGsZBVjDHzKPWYU8VehUMWOlIX9Qhw4rKLzXgGZziXnRTcyF7dK0iIGMVOG_OS1aTmKPRDilgVhTQUPCQIcE0x-MFTmJ8rCUpA3KtuenR2urg1ZtAzmszI0tq_Z7m66y-ilQo0uAqMTQ7WRX8auJKg56blZg7WB-iHDuYEBzO6NP0R1IwuYFphQbbTjnTH9NBfs80nym4Zyj8uUvyKbtUyGr5eUz9fNDQ7JCxfJDo9dW1lY9lmj_JNivPbGmf2Pt_lN9tDit9b-WeTetni85Z9pDpVOd7L1E_Vy7egayNO23ZP34eSeLJeux1b0rer_xaZ7ykS78nuSjMY-nL98rparNZNcv07JCjN06_EkTFBxBqOUMACErnELUNMSxTUjLDQZwzcqa4bRjCfeejUEFefS224OLr2S5wxPtij7lVrs80d2CNseRV2P52VNFMBipcdVE-U5jkRD7hFAwpGOylVwU2Mfc9qBh7DoR89yVnWXhgQFHnIsbpVb6tU_B-hH_2yzWY'
      )
      .reply(302, undefined, {
        Location:
          'https://newsletter.slowchinese.net/p/companies-that-eat-people-217',
      })
      .get('/p/companies-that-eat-people-217')
      .reply(200, '')
    const html = load('./test/data/substack-forwarded-newsletter.html')
    const url = await new SubstackHandler().findNewsletterUrl(html)
    // Not sure if the redirects from substack expire, this test could eventually fail
    expect(url).to.startWith(
      'https://newsletter.slowchinese.net/p/companies-that-eat-people-217'
    )
  }).timeout(10000)

  it('gets the URL from the header if it is a beehiiv newsletter', async () => {
    nock('https://u23463625.ct.sendgrid.net')
      .head(
        '/ss/c/AX1lEgEQaxtvFxLaVo0GBo_geajNrlI1TGeIcmMViR3pL3fEDZnbbkoeKcaY62QZk0KPFudUiUXc_uMLerV4nA/3k5/3TFZmreTR0qKSCgowABnVg/h30/zzLik7UXd1H_n4oyd5W8Xu639AYQQB2UXz-CsssSnno'
      )
      .reply(302, undefined, {
        Location: 'https://www.milkroad.com/p/talked-guy-spent-30m-beeple',
      })
      .get('/p/talked-guy-spent-30m-beeple')
      .reply(200, '')
    const html = load('./test/data/beehiiv-newsletter.html')
    const url = await new BeehiivHandler().findNewsletterUrl(html)
    expect(url).to.startWith(
      'https://www.milkroad.com/p/talked-guy-spent-30m-beeple'
    )
  })

  it('returns undefined if it is not a newsletter', async () => {
    const html = load('./test/data/substack-forwarded-welcome-email.html')
    const url = await new SubstackHandler().findNewsletterUrl(html)
    expect(url).to.be.undefined
  })
})
|
||||||
|
|
||||||
|
// Two consecutive calls to generateUniqueUrl must not yield the same value.
describe('generateUniqueUrl', () => {
  it('generates a unique URL', () => {
    const first = generateUniqueUrl()
    const second = generateUniqueUrl()

    expect(first).to.not.eql(second)
  })
})
|
||||||
|
})
|
||||||
@ -0,0 +1,25 @@
|
|||||||
|
import { expect } from 'chai'
|
||||||
|
import 'mocha'
|
||||||
|
import { getYoutubeVideoId } from '../src/websites/youtube-handler'
|
||||||
|
|
||||||
|
// getYoutubeVideoId: the id comes from the `v` query parameter or, for short
// links, from the path.
describe('getYoutubeVideoId', () => {
  it('should parse video id out of a URL', async () => {
    // [expected id, input URL]
    const cases: Array<[string, string]> = [
      ['BnSUk0je6oo', 'https://www.youtube.com/watch?v=BnSUk0je6oo&t=269s'],
      [
        'vFD2gu007dc',
        'https://www.youtube.com/watch?v=vFD2gu007dc&list=RDvFD2gu007dc&start_radio=1',
      ],
      ['vFD2gu007dc', 'https://youtu.be/vFD2gu007dc'],
      ['BMFVCnbRaV4', 'https://youtube.com/watch?v=BMFVCnbRaV4&feature=share'],
      ['cg9b4RC87LI', 'https://youtu.be/cg9b4RC87LI?t=116'],
    ]

    for (const [expectedId, videoUrl] of cases) {
      expect(expectedId).to.eq(getYoutubeVideoId(videoUrl))
    }
  })
})
|
||||||
10
packages/puppeteer-parse/content-handler/tsconfig.json
Normal file
10
packages/puppeteer-parse/content-handler/tsconfig.json
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
{
|
||||||
|
"extends": "@tsconfig/node14/tsconfig.json",
|
||||||
|
"compilerOptions": {
|
||||||
|
"rootDir": ".",
|
||||||
|
"declaration": true,
|
||||||
|
"outDir": "build",
|
||||||
|
"lib": ["dom"]
|
||||||
|
},
|
||||||
|
"include": ["src"]
|
||||||
|
}
|
||||||
@ -6,6 +6,7 @@
|
|||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@google-cloud/logging-winston": "^5.1.1",
|
"@google-cloud/logging-winston": "^5.1.1",
|
||||||
"@google-cloud/storage": "^5.18.1",
|
"@google-cloud/storage": "^5.18.1",
|
||||||
|
"@omnivore/content-handler": "file:./../content-handler",
|
||||||
"@sentry/serverless": "^6.13.3",
|
"@sentry/serverless": "^6.13.3",
|
||||||
"axios": "^0.27.2",
|
"axios": "^0.27.2",
|
||||||
"chrome-aws-lambda": "^10.1.0",
|
"chrome-aws-lambda": "^10.1.0",
|
||||||
|
|||||||
11
yarn.lock
11
yarn.lock
@ -4312,6 +4312,17 @@
|
|||||||
dependencies:
|
dependencies:
|
||||||
"@octokit/openapi-types" "^9.5.0"
|
"@octokit/openapi-types" "^9.5.0"
|
||||||
|
|
||||||
|
"@omnivore/content-handler@file:./packages/content-handler":
|
||||||
|
version "1.0.0"
|
||||||
|
dependencies:
|
||||||
|
addressparser "^1.0.1"
|
||||||
|
axios "^0.27.2"
|
||||||
|
linkedom "^0.14.16"
|
||||||
|
luxon "^3.0.4"
|
||||||
|
rfc2047 "^4.0.1"
|
||||||
|
underscore "^1.13.6"
|
||||||
|
uuid "^9.0.0"
|
||||||
|
|
||||||
"@opentelemetry/api-metrics@0.27.0":
|
"@opentelemetry/api-metrics@0.27.0":
|
||||||
version "0.27.0"
|
version "0.27.0"
|
||||||
resolved "https://registry.yarnpkg.com/@opentelemetry/api-metrics/-/api-metrics-0.27.0.tgz#d8eca344ed1155f3ea8a8133ade827b4bb90efbf"
|
resolved "https://registry.yarnpkg.com/@opentelemetry/api-metrics/-/api-metrics-0.27.0.tgz#d8eca344ed1155f3ea8a8133ade827b4bb90efbf"
|
||||||
|
|||||||
Reference in New Issue
Block a user