This commit is contained in:
Hongbo Wu
2022-09-29 16:08:38 +08:00
parent 6deb62d983
commit 5fdb8b337d
10 changed files with 61 additions and 36 deletions

View File

@ -14,7 +14,7 @@ export class AppleNewsHandler extends ContentHandler {
const response = await axios.get(url, {
headers: { 'User-Agent': MOBILE_USER_AGENT },
})
const data = response.data
const data = response.data as string
const dom = parseHTML(data).document
// make sure it's a valid URL by wrapping in new URL
const href = dom

View File

@ -5,7 +5,7 @@ import { parseHTML } from 'linkedom'
class BloombergHandler extends ContentHandler {
/**
 * Reports whether `url` contains a Bloomberg address.
 *
 * The pattern is not anchored, so a match anywhere inside `url` counts.
 *
 * @param url - address to test
 * @param _dom - unused by this handler
 * @returns `true` when the pattern matches `url`
 */
shouldPreHandle(url: string, _dom: Document): boolean {
  // The dot after "bloomberg" is escaped so that it only matches a literal
  // "." before the TLD; unescaped it would accept any character there
  // (e.g. "bloombergXcom").
  const BLOOMBERG_URL_MATCH =
    /https?:\/\/(www\.)?bloomberg\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&/=]*)/
  return BLOOMBERG_URL_MATCH.test(url)
}

View File

@ -15,7 +15,7 @@ class DerstandardHandler extends ContentHandler {
cookie: `DSGVO_ZUSAGE_V1=true; consentUUID=2bacb9c1-1e80-4be0-9f7b-ee987cf4e7b0_6`,
},
})
const content = response.data
const content = response.data as string
const dom = parseHTML(content).document
const titleElement = dom.querySelector('.article-title')

View File

@ -7,7 +7,7 @@ class ImageHandler extends ContentHandler {
}
async preHandle(url: string, _document: Document): Promise<PreHandleResult> {
const title = url.toString().split('/').pop()
const title = url.toString().split('/').pop() || 'Image'
const content = `
<html>
<head>
@ -22,6 +22,6 @@ class ImageHandler extends ContentHandler {
</body>
</html>`
return { title, content }
return Promise.resolve({ title, content })
}
}

View File

@ -12,7 +12,7 @@ class MediumHandler extends ContentHandler {
try {
const res = new URL(url)
res.searchParams.delete('source')
return { url: res.toString() }
return Promise.resolve({ url: res.toString() })
} catch (error) {
console.error('error prehandling medium url', error)
throw error

View File

@ -8,6 +8,6 @@ class PdfHandler extends ContentHandler {
}
/**
 * Produces the pre-handle result for PDF links.
 *
 * Neither argument is read; the result carries only the
 * `application/pdf` content type.
 *
 * @param _url - unused
 * @param _document - unused
 * @returns a promise resolving to `{ contentType: 'application/pdf' }`
 */
async preHandle(_url: string, _document: Document): Promise<PreHandleResult> {
  return Promise.resolve({ contentType: 'application/pdf' })
}
}

View File

@ -24,7 +24,7 @@ class ScrapingBeeHandler extends ContentHandler {
},
})
const dom = parseHTML(response.data).document
return { title: dom.title, content: response.data, url: url }
return { title: dom.title, content: response.data as string, url: url }
} catch (error) {
console.error('error prehandling url w/scrapingbee', error)
throw error

View File

@ -8,7 +8,7 @@ class TDotCoHandler extends ContentHandler {
}
async resolve(url: string) {
return await axios
return axios
.get(url, { maxRedirects: 0, validateStatus: null })
.then((res) => {
return new URL(res.headers.location).href

View File

@ -25,7 +25,11 @@ const getTweetById = async (id: string) => {
const BASE_ENDPOINT = 'https://api.twitter.com/2/tweets/'
const apiUrl = new URL(BASE_ENDPOINT + id + '?' + getTweetFields())
return await axios.get(apiUrl.toString(), {
if (!TWITTER_BEARER_TOKEN) {
throw new Error('No Twitter bearer token found')
}
return axios.get(apiUrl.toString(), {
headers: {
Authorization: `Bearer ${TWITTER_BEARER_TOKEN}`,
redirect: 'follow',
@ -33,7 +37,7 @@ const getTweetById = async (id: string) => {
})
}
/**
 * Builds the display title for a tweet from its author's name.
 *
 * The result is NOT HTML-escaped; escape it before embedding in markup.
 *
 * @param author - object carrying the author's display name
 * @returns the string `<name> on Twitter`
 */
const titleForAuthor = (author: { name: string }): string => {
  return `${author.name} on Twitter`
}
@ -53,20 +57,6 @@ class TwitterHandler extends ContentHandler {
return !!TWITTER_BEARER_TOKEN && TWITTER_URL_MATCH.test(url.toString())
}
// version of the handler that uses the oembed API
// This isn't great as it doesn't work well with our
// readability API. But could potentially give a more consistent
// look to the tweets
// prehandle: async (url, env) => {
// const oeTweet = await embeddedTweet(url)
// const dom = new JSDOM(oeTweet.data.html);
// const bq = dom.window.document.querySelector('blockquote')
// console.log('blockquote:', bq);
// const title = titleForTweet(oeTweet)
// return { title, content: '<div>' + bq.innerHTML + '</div>', url: oeTweet.data.url };
// }
async preHandle(url: string, _document: Document): Promise<PreHandleResult> {
console.log('prehandling twitter url', url)
@ -74,18 +64,48 @@ class TwitterHandler extends ContentHandler {
if (!tweetId) {
throw new Error('could not find tweet id in url')
}
const tweetData = (await getTweetById(tweetId)).data
const tweetData = (await getTweetById(tweetId)).data as {
data: {
author_id: string
text: string
entities: {
urls: [
{
url: string
expanded_url: string
display_url: string
}
]
}
created_at: string
}
includes: {
users: [
{
id: string
name: string
profile_image_url: string
username: string
}
]
media: [
{
preview_image_url: string
type: string
url: string
}
]
}
}
const authorId = tweetData.data.author_id
// Compare with `===`: a single `=` assigns authorId to each user's id and,
// for any non-empty authorId, makes the predicate truthy for every user, so
// the first entry was always selected regardless of who wrote the tweet.
const author = tweetData.includes.users.find((u) => u.id === authorId)
if (!author) {
  // Fail with a clear message instead of a TypeError on `author.name` below.
  throw new Error('could not find tweet author in response')
}
// escape html entities in title
const title = _.escape(titleForAuthor(author))
const authorImage = author.profile_image_url.replace('_normal', '_400x400')
let text = tweetData.data.text
if (tweetData.data.entities && tweetData.data.entities.urls) {
for (let urlObj of tweetData.data.entities.urls) {
for (const urlObj of tweetData.data.entities.urls) {
text = text.replace(
urlObj.url,
`<a href="${urlObj.expanded_url}">${urlObj.display_url}</a>`
@ -98,10 +118,10 @@ class TwitterHandler extends ContentHandler {
<p>${text}</p>
`
var includesHtml = ''
let includesHtml = ''
if (tweetData.includes.media) {
includesHtml = tweetData.includes.media
.map((m: any) => {
.map((m) => {
const linkUrl = m.type == 'photo' ? m.url : url
const previewUrl = m.type == 'photo' ? m.url : m.preview_image_url
const mediaOpen = `<a class="media-link" href=${linkUrl}>

View File

@ -1,9 +1,7 @@
import { PreHandleResult } from '../index'
import { ContentHandler, PreHandleResult } from '../index'
import axios from 'axios'
import _ from 'underscore'
const { ContentHandler } = require('../index')
const YOUTUBE_URL_MATCH =
/^((?:https?:)?\/\/)?((?:www|m)\.)?((?:youtube\.com|youtu.be))(\/(?:[\w-]+\?v=|embed\/|v\/)?)([\w-]+)(\S+)?$/
@ -34,7 +32,14 @@ class YoutubeHandler extends ContentHandler {
const oembedUrl =
`https://www.youtube.com/oembed?format=json&url=` +
encodeURIComponent(`https://www.youtube.com/watch?v=${videoId}`)
const oembed = (await axios.get(oembedUrl.toString())).data
const oembed = (await axios.get(oembedUrl.toString())).data as {
title: string
width: number
height: number
thumbnail_url: string
author_name: string
author_url: string
}
// escape html entities in title
const title = _.escape(oembed.title)
const ratio = oembed.width / oembed.height