Merge pull request #4221 from omnivore-app/fix/content-fetch-error
improve content-fetch
@@ -27,6 +27,7 @@ export const bulkAction = async (data: BulkActionData) => {
   for (let offset = 0; offset < count; offset += batchSize) {
     const searchArgs = {
       size: batchSize,
       includePending: true,
+      query: `(${query}) AND updated:*..${now}`, // only process items that have not been updated
     }

@@ -25,7 +25,7 @@ export const libraryItemRepository = appDataSource
     .andWhere('md5(original_url) = md5(:url)', { url })

   if (forUpdate) {
-    qb.setLock('pessimistic_write')
+    qb.setLock('pessimistic_read')
   }

   return qb.getOne()
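A note on the lock change: TypeORM's 'pessimistic_write' issues SELECT ... FOR UPDATE, while 'pessimistic_read' maps to SELECT ... FOR SHARE on Postgres, so concurrent readers no longer queue behind one another and only writers are blocked. A minimal sketch of the pattern, assuming a TypeORM DataSource and a LibraryItem entity at an illustrative path (pessimistic locks only take effect inside a transaction):

import { DataSource } from 'typeorm'
import { LibraryItem } from './entity/library_item' // assumption: entity path varies

const findByUrlLocked = (ds: DataSource, userId: string, url: string) =>
  ds.transaction((tx) =>
    tx
      .createQueryBuilder(LibraryItem, 'library_item')
      .where('library_item.user_id = :userId', { userId })
      .andWhere('md5(original_url) = md5(:url)', { url })
      // FOR SHARE: blocks concurrent writers, not concurrent readers
      .setLock('pessimistic_read')
      .getOne()
  )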
@@ -760,9 +760,10 @@ export const bulkActionResolver = authorized<
     },
   })

-  const batchSize = 100
+  const batchSize = 20
   const searchArgs = {
     query,
+    includePending: true,
     size: 0,
   }
   const count = await countLibraryItems(searchArgs, uid)
@@ -778,13 +779,13 @@ export const bulkActionResolver = authorized<
       action,
       count,
     })
-    // if there are less than 100 items, update them synchronously
+    // if there are less than batchSize items, update them synchronously
     await batchUpdateLibraryItems(action, searchArgs, uid, labelIds, args)

     return { success: true }
   }

-  // if there are more than 100 items, update them asynchronously
+  // if there are more than batchSize items, update them asynchronously
   const data = {
     userId: uid,
     action,
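The resolver now sizes the work up front (size: 0, presumably because only the count is needed) and picks a synchronous or queued path around batchSize. A sketch of the offset-paging loop the queued path uses, with processBatch as a hypothetical stand-in for the real search-and-update call:

// hypothetical stand-in for the real search-and-update call
declare function processBatch(
  args: { size: number; includePending: boolean; query: string },
  offset: number
): Promise<void>

const runBulkAction = async (count: number, query: string, now: string) => {
  const batchSize = 20
  for (let offset = 0; offset < count; offset += batchSize) {
    const searchArgs = {
      size: batchSize,
      includePending: true,
      // updated items fall out of the query, so re-runs skip them
      query: `(${query}) AND updated:*..${now}`,
    }
    await processBatch(searchArgs, offset)
  }
}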
@@ -46,7 +46,7 @@ import { analytics } from './utils/analytics'
 import { corsConfig } from './utils/corsConfig'
 import { getClientFromUserAgent } from './utils/helpers'
 import { buildLogger, buildLoggerTransport, logger } from './utils/logger'
-import { apiLimiter, authLimiter } from './utils/rate_limit'
+import { apiHourLimiter, apiLimiter, authLimiter } from './utils/rate_limit'
 import { shortcutsRouter } from './routers/shortcuts_router'

 const PORT = process.env.PORT || 4000
@@ -68,7 +68,7 @@ export const createApp = (): Express => {
   app.set('trust proxy', env.server.trustProxy)

   // Apply the rate limiting middleware to API calls only
-  app.use('/api/', apiLimiter)
+  app.use('/api/', apiLimiter, apiHourLimiter)

   // set client info in the request context
   app.use(httpContext.middleware)
@@ -8,11 +8,11 @@ import {
   PageType,
 } from '../generated/graphql'
 import { createPubSubClient, PubsubClient } from '../pubsub'
-import { Merge } from '../util'
+import { redisDataSource } from '../redis_data_source'
 import { enqueueParseRequest } from '../utils/createTask'
 import { cleanUrl, generateSlug } from '../utils/helpers'
 import { logger } from '../utils/logger'
-import { countBySavedAt, createOrUpdateLibraryItem } from './library_item'
+import { createOrUpdateLibraryItem } from './library_item'

 interface PageSaveRequest {
   user: User
@@ -34,13 +34,47 @@ const SAVING_CONTENT = 'Your link is being saved...'

 const isPrivateIP = privateIpLib.default

+const recentSavedItemKey = (userId: string) => `recent-saved-item:${userId}`
+
+const addRecentSavedItem = async (userId: string) => {
+  const redisClient = redisDataSource.redisClient
+
+  if (redisClient) {
+    const key = recentSavedItemKey(userId)
+    try {
+      // add now to the sorted set for rate limiting
+      await redisClient.zadd(key, Date.now(), Date.now())
+    } catch (error) {
+      logger.error('error adding recently saved item in redis', {
+        key,
+        error,
+      })
+    }
+  }
+}
+
 // 5 items saved in the last minute: use low queue
 // default: use normal queue
 const getPriorityByRateLimit = async (
   userId: string
-): Promise<'low' | 'high'> => {
-  const count = await countBySavedAt(userId, new Date(Date.now() - 60 * 1000))
-  return count >= 5 ? 'low' : 'high'
+): Promise<'low' | 'high' | undefined> => {
+  const redisClient = redisDataSource.redisClient
+  if (redisClient) {
+    const oneMinuteAgo = Date.now() - 60 * 1000
+    const key = recentSavedItemKey(userId)
+
+    try {
+      // Remove items older than one minute
+      await redisClient.zremrangebyscore(key, '-inf', oneMinuteAgo)
+
+      // Count items in the last minute
+      const count = await redisClient.zcard(key)
+
+      return count >= 5 ? 'low' : 'high'
+    } catch (error) {
+      logger.error('Failed to get priority by rate limit', { userId, error })
+    }
+  }
 }

 export const validateUrl = (url: string): URL => {
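addRecentSavedItem and getPriorityByRateLimit together form a classic Redis sliding-window counter: every save ZADDs a timestamp into a per-user sorted set, and the read side ZREMRANGEBYSCOREs everything older than a minute before ZCARD counts what is left. A standalone sketch of the pattern, assuming an ioredis client (names and the Redis URL are illustrative):

import Redis from 'ioredis'

const redis = new Redis(process.env.REDIS_URL ?? 'redis://localhost:6379')

const key = (userId: string) => `recent-saved-item:${userId}`

// write side: score and member are both the event timestamp
const recordSave = (userId: string) =>
  redis.zadd(key(userId), Date.now(), Date.now())

// read side: trim the window, then count the survivors
const savesInLastMinute = async (userId: string): Promise<number> => {
  await redis.zremrangebyscore(key(userId), '-inf', Date.now() - 60_000)
  return redis.zcard(key(userId))
}

const priorityFor = async (userId: string): Promise<'low' | 'high'> =>
  (await savesInLastMinute(userId)) >= 5 ? 'low' : 'high'

One wrinkle worth noting: using Date.now() as the member means two saves in the same millisecond collapse into a single entry and slightly undercount; a unique member (timestamp plus a random suffix) would avoid that.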
@@ -118,8 +152,12 @@ export const createPageSaveRequest = async ({
     pubsub
   )

+  // add to recent saved item
+  await addRecentSavedItem(userId)
+
+  // get priority by checking rate limit if not specified
   priority = priority || (await getPriorityByRateLimit(userId))
   logger.debug('priority', { priority })

   // enqueue task to parse item
   await enqueueParseRequest({
@@ -1192,7 +1192,7 @@ export const findLibraryItemsByPrefix = async (
   )
 }

-export const countBySavedAt = async (
+export const countByCreatedAt = async (
   userId: string,
   startDate = new Date(0),
   endDate = new Date()
@@ -1202,7 +1202,7 @@ export const countBySavedAt = async (
     tx
       .createQueryBuilder(LibraryItem, 'library_item')
       .where('library_item.user_id = :userId', { userId })
-      .andWhere('library_item.saved_at between :startDate and :endDate', {
+      .andWhere('library_item.created_at between :startDate and :endDate', {
         startDate,
         endDate,
       })
@@ -1256,7 +1256,7 @@ export const batchUpdateLibraryItems = async (
     const queryBuilder = getQueryBuilder(userId, em)

     if (forUpdate) {
-      queryBuilder.setLock('pessimistic_write')
+      queryBuilder.setLock('pessimistic_read')
     }

     const libraryItems = await queryBuilder
@@ -27,7 +27,7 @@ const configs: Partial<Options> = {
 export const apiLimiter = rateLimit({
   ...configs,
   max: async (req) => {
-    // 100 RPM for an authenticated request, 15 for a non-authenticated request
+    // 60 RPM for authenticated request, 15 for non-authenticated request
     const token = getTokenByRequest(req)
     try {
       const claims = await getClaimsByToken(token)
@@ -43,6 +43,26 @@ export const apiLimiter = rateLimit({
   store: getStore('api-rate-limit'),
 })

+export const apiHourLimiter = rateLimit({
+  ...configs,
+  windowMs: 60 * 60 * 1000, // 1 hour
+  max: async (req) => {
+    // 600 for authenticated request, 150 for non-authenticated request
+    const token = getTokenByRequest(req)
+    try {
+      const claims = await getClaimsByToken(token)
+      return claims ? 600 : 150
+    } catch (e) {
+      console.log('non-authenticated request')
+      return 150
+    }
+  },
+  keyGenerator: (req) => {
+    return getTokenByRequest(req) || req.ip
+  },
+  store: getStore('api-hour-rate-limit'),
+})
+
 // 5 RPM for auth requests
 export const authLimiter = rateLimit({
   ...configs,
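apiHourLimiter layers a second, longer window on top of the per-minute apiLimiter; express-rate-limit middlewares run independently, so a request must clear both windows (see the app.use('/api/', apiLimiter, apiHourLimiter) hunk above). A minimal sketch of the stacking, with fixed limits standing in for the token-aware max in the real code:

import express from 'express'
import rateLimit from 'express-rate-limit'

const perMinute = rateLimit({ windowMs: 60 * 1000, max: 60 })
const perHour = rateLimit({ windowMs: 60 * 60 * 1000, max: 600 })

const app = express()
// a client that stays under 60/minute can still trip the 600/hour ceiling
app.use('/api/', perMinute, perHour)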
@@ -23,7 +23,6 @@ COPY .prettierrc .
 COPY .eslintrc .

 COPY /packages/content-fetch/package.json ./packages/content-fetch/package.json
-COPY /packages/readabilityjs/package.json ./packages/readabilityjs/package.json
 COPY /packages/content-handler/package.json ./packages/content-handler/package.json
 COPY /packages/puppeteer-parse/package.json ./packages/puppeteer-parse/package.json
 COPY /packages/utils/package.json ./packages/utils/package.json
@@ -33,7 +32,6 @@ RUN yarn install --pure-lockfile
 ADD /packages/content-fetch ./packages/content-fetch
 ADD /packages/content-handler ./packages/content-handler
 ADD /packages/puppeteer-parse ./packages/puppeteer-parse
-ADD /packages/readabilityjs ./packages/readabilityjs
 ADD /packages/utils ./packages/utils
 RUN yarn workspace @omnivore/utils build
 RUN yarn workspace @omnivore/content-handler build
@@ -27,7 +27,6 @@ COPY .prettierrc .
 COPY .eslintrc .

 COPY /packages/content-fetch/package.json ./packages/content-fetch/package.json
-COPY /packages/readabilityjs/package.json ./packages/readabilityjs/package.json
 COPY /packages/content-handler/package.json ./packages/content-handler/package.json
 COPY /packages/puppeteer-parse/package.json ./packages/puppeteer-parse/package.json
 COPY /packages/utils/package.json ./packages/utils/package.json
@@ -37,7 +36,6 @@ RUN yarn install --pure-lockfile
 ADD /packages/content-handler ./packages/content-handler
 ADD /packages/puppeteer-parse ./packages/puppeteer-parse
 ADD /packages/content-fetch ./packages/content-fetch
-ADD /packages/readabilityjs ./packages/readabilityjs
 ADD /packages/utils ./packages/utils
 RUN yarn workspace @omnivore/utils build
 RUN yarn workspace @omnivore/content-handler build
@@ -77,7 +77,10 @@ export const queueSavePageJob = async (
     data: job.data,
     opts: getOpts(job),
   }))
-  console.log('queue save page jobs:', JSON.stringify(jobs, null, 2))
+  console.log(
+    'queue save page jobs:',
+    jobs.map((job) => job.data.finalUrl)
+  )

   const queue = new Queue(QUEUE_NAME, {
     connection: redisDataSource.queueRedisClient,
@@ -129,6 +129,52 @@ const getCachedFetchResult = async (
   return fetchResult
 }

+const failureRedisKey = (domain: string) => `fetch-failure:${domain}`
+
+const isDomainBlocked = async (
+  redisDataSource: RedisDataSource,
+  domain: string
+) => {
+  const blockedDomains = ['localhost', 'weibo.com']
+  if (blockedDomains.includes(domain)) {
+    return true
+  }
+
+  const key = failureRedisKey(domain)
+  const redisClient = redisDataSource.cacheClient
+  try {
+    const result = await redisClient.get(key)
+    // if the domain has failed to fetch more than certain times, block it
+    const maxFailures = parseInt(process.env.MAX_FEED_FETCH_FAILURES ?? '10')
+    if (result && parseInt(result) > maxFailures) {
+      console.info(`domain is blocked: ${domain}`)
+      return true
+    }
+  } catch (error) {
+    console.error('Failed to check domain block status', { domain, error })
+  }
+
+  return false
+}
+
+const incrementContentFetchFailure = async (
+  redisDataSource: RedisDataSource,
+  domain: string
+) => {
+  const redisClient = redisDataSource.cacheClient
+  const key = failureRedisKey(domain)
+  try {
+    const result = await redisClient.incr(key)
+    // expire the key in 1 day
+    await redisClient.expire(key, 24 * 60 * 60)
+
+    return result
+  } catch (error) {
+    console.error('Failed to increment failure in redis', { domain, error })
+    return null
+  }
+}
+
 export const contentFetchRequestHandler: RequestHandler = async (req, res) => {
   const functionStartTime = Date.now()

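failureRedisKey, isDomainBlocked and incrementContentFetchFailure amount to a self-expiring deny list: each failed fetch INCRs a per-domain counter and refreshes its one-day TTL, and once the counter passes MAX_FEED_FETCH_FAILURES the domain is skipped outright. A condensed sketch of the counter, assuming an ioredis client (names and the Redis URL are illustrative):

import Redis from 'ioredis'

const redis = new Redis(process.env.REDIS_URL ?? 'redis://localhost:6379')

const recordFailure = async (domain: string): Promise<number> => {
  const k = `fetch-failure:${domain}`
  const failures = await redis.incr(k)
  await redis.expire(k, 24 * 60 * 60) // TTL refreshed on every failure
  return failures
}

const shouldSkip = async (domain: string, maxFailures = 10): Promise<boolean> =>
  parseInt((await redis.get(`fetch-failure:${domain}`)) ?? '0', 10) > maxFailures

Because expire runs on every increment, a domain that keeps failing never ages out of the list; only a full day without failures clears it. Wrapping INCR and EXPIRE in a MULTI would also make the pair atomic.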
@@ -200,8 +246,22 @@ export const contentFetchRequestHandler: RequestHandler = async (req, res) => {
     url
   )

-  fetchResult = await fetchContent(url, locale, timezone)
-  console.log('content has been fetched')
+  const domain = new URL(url).hostname
+  const isBlocked = await isDomainBlocked(redisDataSource, domain)
+  if (isBlocked) {
+    console.log('domain is blocked', domain)
+
+    return res.sendStatus(200)
+  }
+
+  try {
+    fetchResult = await fetchContent(url, locale, timezone)
+    console.log('content has been fetched')
+  } catch (error) {
+    await incrementContentFetchFailure(redisDataSource, domain)
+
+    throw error
+  }

   if (fetchResult.content && !NO_CACHE_URLS.includes(url)) {
     const cacheResult = await cacheFetchResult(
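Note that the blocked-domain branch returns res.sendStatus(200) rather than an error: content-fetch appears to be driven by a job queue (queueSavePageJob above creates a BullMQ Queue), so a 2xx acknowledges the job and a permanently blocked domain is dropped instead of being retried.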
@@ -9,20 +9,12 @@
   ],
   "dependencies": {
     "@omnivore/content-handler": "1.0.0",
-    "@omnivore/readability": "1.0.0",
-    "axios": "^1.4.0",
-    "crypto": "^1.0.1",
-    "dompurify": "^2.4.1",
-    "linkedom": "^0.14.9",
     "puppeteer-core": "^22.12.1",
     "puppeteer-extra": "^3.3.6",
     "puppeteer-extra-plugin-adblocker": "^2.13.6",
-    "puppeteer-extra-plugin-stealth": "^2.11.2",
-    "urlsafe-base64": "^1.0.0"
+    "puppeteer-extra-plugin-stealth": "^2.11.2"
   },
   "devDependencies": {
-    "@types/dompurify": "^3.0.5",
-    "@types/urlsafe-base64": "^1.0.31",
     "chai": "^4.3.6",
     "mocha": "^10.0.0"
   },
@@ -1,8 +1,5 @@
 /* eslint-disable @typescript-eslint/no-unsafe-member-access */
 /* eslint-disable @typescript-eslint/no-unsafe-assignment */
 import { preHandleContent } from '@omnivore/content-handler'
-import axios from 'axios'
-import { parseHTML } from 'linkedom'
-import path from 'path'
 import { BrowserContext, Page, Protocol } from 'puppeteer-core'
 import { getBrowser } from './browser'
@@ -16,22 +13,6 @@ const ALLOWED_CONTENT_TYPES = [
   'application/pdf',
 ]

-const fetchContentWithScrapingBee = async (url: string) => {
-  const response = await axios.get('https://app.scrapingbee.com/api/v1', {
-    params: {
-      api_key: process.env.SCRAPINGBEE_API_KEY,
-      url: url,
-      render_js: 'false',
-      premium_proxy: 'true',
-      country_code: 'us',
-    },
-    timeout: 10_000,
-  })
-
-  const dom = parseHTML(response.data).document
-  return { title: dom.title, domContent: dom.documentElement.outerHTML, url }
-}
-
 const enableJavascriptForUrl = (url: string) => {
   try {
     const u = new URL(url)
@@ -60,39 +41,28 @@ export const fetchContent = async (
   }
   console.log(`content-fetch request`, logRecord)

-  let page: Page | undefined,
-    title: string | undefined,
+  let title: string | undefined,
     content: string | undefined,
     contentType: string | undefined,
     context: BrowserContext | undefined

   try {
     url = getUrl(url)
-    if (!url) {
-      throw new Error('Valid URL to parse not specified')
-    }

     // pre handle url with custom handlers
     try {
       const result = await preHandleContent(url)
-      if (result && result.url) {
-        validateUrlString(url)
-        url = result.url
-      }
-      if (result && result.title) {
-        title = result.title
-      }
-      if (result && result.content) {
-        content = result.content
-      }
-      if (result && result.contentType) {
-        contentType = result.contentType
+      if (result?.url) {
+        url = getUrl(result.url)
       }
+      title = result?.title
+      content = result?.content
+      contentType = result?.contentType
     } catch (e) {
-      console.info('error with handler: ', e)
+      console.error('error with handler: ', e)
     }

-    if ((!content || !title) && contentType !== 'application/pdf') {
+    if (contentType !== 'application/pdf' && (!content || !title)) {
       const result = await retrievePage(
         url,
         logRecord,
@@ -100,59 +70,27 @@ export const fetchContent = async (
         locale,
         timezone
       )
-      if (result && result.context) {
-        context = result.context
-      }
-      if (result && result.page) {
-        page = result.page
-      }
-      if (result && result.finalUrl) {
-        url = result.finalUrl
-      }
-      if (result && result.contentType) {
-        contentType = result.contentType
-      }
-    }
+      context = result.context
+      url = result.finalUrl
+      contentType = result.contentType

-    if (contentType !== 'application/pdf') {
-      if (page && (!content || !title)) {
-        const result = await retrieveHtml(page, logRecord)
-        if (result.isBlocked) {
-          const sbResult = await fetchContentWithScrapingBee(url)
-          title = sbResult.title
-          content = sbResult.domContent
-        } else {
-          title = result.title
-          content = result.domContent
-        }
-      } else {
-        console.info('using prefetched content and title')
+      if (contentType !== 'application/pdf') {
+        const page = result.page
+        if (page) {
+          const result = await retrieveHtml(page, logRecord)
+          title = result.title
+          content = result.domContent
+        } else {
+          console.info('using prefetched content and title')
+          title = result.title
+          content = result.domContent
+        }
       }
     }
   } catch (e) {
     console.error(`Error while retrieving page ${url}`, e)

-    // fallback to scrapingbee for non pdf content
-    if (url && contentType !== 'application/pdf') {
-      console.info('fallback to scrapingbee', url)
-
-      const sbResult = await fetchContentWithScrapingBee(url)
-
-      return {
-        finalUrl: url,
-        title: sbResult.title,
-        content: sbResult.domContent,
-        contentType,
-      }
-    }
-
     throw e
   } finally {
     // close browser context if it was created
     if (context) {
-      console.info('closing page...', url)
+      console.info('closing context...', url)
       await context.close()
-      console.info('page closed', url)
+      console.info('context closed', url)
     }

   console.info(`content-fetch result`, logRecord)
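With the ScrapingBee paths gone, fetchContent's cleanup hinges on a single resource, the browser context, released in finally on both the success and the throw path. A reduced sketch of that lifecycle, assuming puppeteer-core ^22 (createBrowserContext replaced createIncognitoBrowserContext in v22; the executable path is illustrative):

import puppeteer, { BrowserContext } from 'puppeteer-core'

const withBrowserContext = async <T>(
  fn: (ctx: BrowserContext) => Promise<T>
): Promise<T> => {
  const browser = await puppeteer.launch({
    executablePath: '/usr/bin/chromium', // assumption: depends on the image
  })
  let context: BrowserContext | undefined
  try {
    context = await browser.createBrowserContext()
    return await fn(context)
  } finally {
    // runs whether fn resolved or threw, like the finally block above
    if (context) {
      await context.close()
    }
    await browser.close()
  }
}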
@@ -519,8 +457,12 @@ async function retrieveHtml(page: Page, logRecord: Record<string, any>) {

     throw e
   }

   if (domContent === 'IS_BLOCKED') {
-    return { isBlocked: true }
+    logRecord.blockedByClient = true
+
+    throw new Error('Page is blocked')
   }

   return { domContent, title }
 }
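This last hunk closes the loop on the fallback removal: a page that reports IS_BLOCKED used to be rerouted to ScrapingBee, but it now throws, and the error propagates up through fetchContent to the request handler, where the catch increments the domain's failure counter and, after enough failures, lands the domain on the block list.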
packages/puppeteer-parse/src/readability.d.ts (vendored): 173 lines removed
@@ -1,173 +0,0 @@
-// Type definitions for non-npm package mozilla-readability 0.2
-// Project: https://github.com/mozilla/readability
-// Definitions by: Charles Vandevoorde <https://github.com/charlesvdv>, Alex Wendland <https://github.com/awendland>
-// Definitions: https://github.com/DefinitelyTyped/DefinitelyTyped
-// TypeScript Version: 2.2
-
-declare module '@omnivore/readability' {
-  /**
-   * A standalone version of the readability library used for Firefox Reader View.
-   *
-   * Note that isProbablyReaderable() was moved into a separate file in https://github.com/mozilla/readability/commit/2620542dd1e8380220d82afa97a2c283ae636e40
-   * and therefore is no longer part of the Readability class.
-   */
-  class Readability {
-    /**
-     * ## Usage on the web
-     *
-     * To parse a document, you must create a new Readability object from a
-     * DOM document object, and then call parse(). Here's an example:
-     *
-     * ```js
-     * var article = new Readability(document).parse();
-     * ```
-     *
-     * If you're using Readability on the web, you will likely be able to
-     * use a document reference from elsewhere (e.g. fetched via XMLHttpRequest,
-     * in a same-origin <iframe> you have access to, etc.).
-     *
-     * ## Usage from node.js
-     *
-     * In node.js, you won't generally have a DOM document object. To obtain one, you can use external
-     * libraries like [jsdom](https://github.com/tmpvar/jsdom). While this repository contains a parser of
-     * its own (`JSDOMParser`), that is restricted to reading XML-compatible markup and therefore we do
-     * not recommend it for general use.
-     *
-     * If you're using `jsdom` to create a DOM object, you should ensure that the page doesn't run (page)
-     * scripts (avoid fetching remote resources etc.) as well as passing it the page's URI as the `url`
-     * property of the `options` object you pass the `JSDOM` constructor.
-     *
-     * ```js
-     * var JSDOM = require('jsdom').JSDOM;
-     * var doc = new JSDOM("<body>Here's a bunch of text</body>", {
-     *   url: "https://www.example.com/the-page-i-got-the-source-from",
-     * });
-     * let reader = new Readability(doc.window.document);
-     * let article = reader.parse();
-     * ```
-     */
-    constructor(doc: Document, options?: Readability.Options)
-
-    /**
-     * Runs readability.
-     *
-     * ## Workflow:
-     *
-     * 1. Prep the document by removing script tags, css, etc.
-     * 2. Build readability's DOM tree.
-     * 3. Grab the article content from the current dom tree.
-     * 4. Replace the current DOM tree with the new one.
-     * 5. Read peacefully.
-     *
-     * ## Additional notes:
-     *
-     * Readability's parse() works by modifying the DOM. This removes some
-     * elements in the web page. You could avoid this by passing the clone
-     * of the document object while creating a Readability object.
-     *
-     * ```js
-     * var documentClone = document.cloneNode(true);
-     * var article = new Readability(documentClone).parse();
-     * ```
-     *
-     * The response will be null if the processing failed (https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L2038)
-     */
-    async parse(): Promise<Readability.ParseResult | null>
-  }
-
-  namespace Readability {
-    interface Options {
-      /**
-       * Control whether log messages are sent to the console
-       */
-      debug?: boolean
-
-      /**
-       * Set a maximum size on the documents that will be processed. This size is
-       * checked before any parsing operations occur. If the number of elements in
-       * the document exceeds this threshold then an Error will be thrown.
-       *
-       * See implementation details at https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L2019
-       */
-      maxElemsToParse?: number
-
-      nbTopCandidates?: number
-
-      /**
-       * Minimum number of characters in the extracted textContent in order to
-       * consider the article correctly identified. If the threshold is not met then
-       * the extraction process will automatically run again with different flags.
-       *
-       * See implementation details at https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L1208
-       *
-       * Changed from wordThreshold in https://github.com/mozilla/readability/commit/3ff9a166fb27928f222c4c0722e730eda412658a
-       */
-      charThreshold?: number
-
-      /**
-       * parse() removes the class="" attribute from every element in the given
-       * subtree, except those that match CLASSES_TO_PRESERVE and
-       * the classesToPreserve array from the options object.
-       */
-      classesToPreserve?: string[]
-
-      /**
-       * By default Readability will strip all classes from the HTML elements in the
-       * processed article. By setting this to `true` the classes will be retained.
-       *
-       * This is a blanket alternative to `classesToPreserve`.
-       *
-       * Added in https://github.com/mozilla/readability/commit/2982216913af2c66b0690e88606b03116553ad92
-       */
-      keepClasses?: boolean
-
-      url?: string
-
-      /**
-       * Function that converts a regular image url into imageproxy url
-       * @param url string
-       */
-      createImageProxyUrl?: (
-        url: string,
-        width?: number,
-        height?: number
-      ) => string
-
-      /**
-       * By default, Readability will clean all tables from the HTML elements in the
-       * processed article. But newsletters in emails use tables to display their content.
-       * By setting this to `true`, these tables will be retained.
-       */
-      keepTables?: boolean
-
-      ignoreLinkDensity?: boolean
-    }
-
-    interface ParseResult {
-      /** Article title */
-      title: string
-      /** Author metadata */
-      byline?: string | null
-      /** Content direction */
-      dir?: string | null
-      /** HTML string of processed article content */
-      content: string
-      /** non-HTML version of `content` */
-      textContent: string
-      /** Length of an article, in characters */
-      length: number
-      /** Article description, or short excerpt from the content */
-      excerpt: string
-      /** Article site name */
-      siteName?: string | null
-      /** Article site icon */
-      siteIcon?: string | null
-      /** Article preview image */
-      previewImage?: string | null
-      /** Article published date */
-      publishedDate?: Date | null
-
-      language?: string | null
-    }
-  }
-
-  export { Readability }
-}
yarn.lock: 24 lines removed
@@ -7909,13 +7909,6 @@
   dependencies:
     "@types/trusted-types" "*"

-"@types/dompurify@^3.0.5":
-  version "3.0.5"
-  resolved "https://registry.yarnpkg.com/@types/dompurify/-/dompurify-3.0.5.tgz#02069a2fcb89a163bacf1a788f73cb415dd75cb7"
-  integrity sha512-1Wg0g3BtQF7sSb27fJQAKck1HECM6zV1EB66j8JH9i3LCjYabJa0FSdiSgsD5K/RbrsR0SiraKacLB+T8ZVYAg==
-  dependencies:
-    "@types/trusted-types" "*"
-
 "@types/duplexify@^3.6.0":
   version "3.6.0"
   resolved "https://registry.yarnpkg.com/@types/duplexify/-/duplexify-3.6.0.tgz#dfc82b64bd3a2168f5bd26444af165bf0237dcd8"
@@ -8850,13 +8843,6 @@
   dependencies:
     "@types/node" "*"

-"@types/urlsafe-base64@^1.0.31":
-  version "1.0.31"
-  resolved "https://registry.yarnpkg.com/@types/urlsafe-base64/-/urlsafe-base64-1.0.31.tgz#716fde3a64a558a32f7f464bc31b43da3e5e87fd"
-  integrity sha512-f+l5StyboXmVsjTXLUkNHu8FptkdJ6iGl7z7nKBQVQwvSDulzi+Ov5Rtuq6c43jHYjSHax7Mplr1ldweOdFohg==
-  dependencies:
-    "@types/node" "*"
-
 "@types/uuid@^8.3.0", "@types/uuid@^8.3.1", "@types/uuid@^8.3.4":
   version "8.3.4"
   resolved "https://registry.yarnpkg.com/@types/uuid/-/uuid-8.3.4.tgz#bd86a43617df0594787d38b735f55c805becf1bc"
@@ -13235,11 +13221,6 @@ crypto-random-string@^2.0.0:
   resolved "https://registry.yarnpkg.com/crypto-random-string/-/crypto-random-string-2.0.0.tgz#ef2a7a966ec11083388369baa02ebead229b30d5"
   integrity sha512-v1plID3y9r/lPhviJ1wrXpLeyUIGAZ2SHNYTEapm7/8A9nLPoyvVp3RK/EPFqn5kEznyWgYZNsRtYYIWbuG8KA==

-crypto@^1.0.1:
-  version "1.0.1"
-  resolved "https://registry.yarnpkg.com/crypto/-/crypto-1.0.1.tgz#2af1b7cad8175d24c8a1b0778255794a21803037"
-  integrity sha512-VxBKmeNcqQdiUQUW2Tzq0t377b54N2bMtXO/qiLa+6eRRmmC4qT3D4OnTGoT/U6O9aklQ/jTwbOtRMTTY8G0Ig==
-
 css-loader@^3.6.0:
   version "3.6.0"
   resolved "https://registry.yarnpkg.com/css-loader/-/css-loader-3.6.0.tgz#2e4b2c7e6e2d27f8c8f28f61bffcd2e6c91ef645"
@@ -14179,11 +14160,6 @@ dompurify@^2.0.17:
   resolved "https://registry.yarnpkg.com/dompurify/-/dompurify-2.3.8.tgz#224fe9ae57d7ebd9a1ae1ac18c1c1ca3f532226f"
   integrity sha512-eVhaWoVibIzqdGYjwsBWodIQIaXFSB+cKDf4cfxLMsK0xiud6SE+/WCVx/Xw/UwQsa4cS3T2eITcdtmTg2UKcw==

-dompurify@^2.4.1:
-  version "2.4.1"
-  resolved "https://registry.yarnpkg.com/dompurify/-/dompurify-2.4.1.tgz#f9cb1a275fde9af6f2d0a2644ef648dd6847b631"
-  integrity sha512-ewwFzHzrrneRjxzmK6oVz/rZn9VWspGFRDb4/rRtIsM1n36t9AKma/ye8syCpcw+XJ25kOK/hOG7t1j2I2yBqA==
-
 dompurify@^2.4.3:
   version "2.4.3"
   resolved "https://registry.yarnpkg.com/dompurify/-/dompurify-2.4.3.tgz#f4133af0e6a50297fc8874e2eaedc13a3c308c03"