Files
omnivore/packages/content-handler/src/websites/scrapingBee-handler.ts
2022-10-07 16:57:01 +08:00

39 lines
1.1 KiB
TypeScript

import { ContentHandler, PreHandleResult } from '../content-handler'
import axios from 'axios'
import { parseHTML } from 'linkedom'
/**
 * Content handler that fetches a page's HTML through the ScrapingBee API
 * for a fixed set of sites (see {@link ScrapingBeeHandler.shouldPreHandle}).
 */
export class ScrapingBeeHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'ScrapingBee'
  }

  /**
   * Decides whether this handler should pre-handle the given URL.
   *
   * A URL qualifies when its hostname is one of the supported domains or a
   * subdomain of one (e.g. `www.nytimes.com`). The match is done on a label
   * boundary, so look-alike hosts such as `evilnytimes.com` do not qualify.
   *
   * @param url - Absolute URL of the page to be handled.
   * @returns `true` when the URL's host is a supported domain or subdomain.
   * @throws TypeError if `url` cannot be parsed by the `URL` constructor.
   */
  shouldPreHandle(url: string): boolean {
    const { hostname } = new URL(url)
    const supportedDomains = ['nytimes.com', 'news.google.com']
    return supportedDomains.some(
      // Require an exact match or a '.'-separated suffix; a bare endsWith()
      // would also accept unrelated hosts that merely share trailing characters.
      (domain) => hostname === domain || hostname.endsWith(`.${domain}`)
    )
  }

  /**
   * Fetches the page through ScrapingBee and returns its title and raw HTML.
   *
   * The API key is read from the `SCRAPINGBEE_API_KEY` environment variable.
   *
   * @param url - Absolute URL of the page to fetch.
   * @returns The document title parsed from the response, the response body
   *   as `content`, and the original `url`.
   * @throws Rethrows any error raised by the request or by HTML parsing,
   *   after logging it.
   */
  async preHandle(url: string): Promise<PreHandleResult> {
    console.log('prehandling url with scrapingbee', url)
    try {
      const response = await axios.get('https://app.scrapingbee.com/api/v1', {
        params: {
          api_key: process.env.SCRAPINGBEE_API_KEY,
          url: url,
          return_page_source: true,
          block_ads: true,
          block_resources: false,
        },
      })
      const html = response.data as string
      // The DOM is parsed only to extract the title; the HTML itself is
      // returned unmodified as the content.
      const dom = parseHTML(html).document
      return { title: dom.title, content: html, url: url }
    } catch (error) {
      // NOTE(review): axios errors usually carry the request config, which
      // here includes the api_key query param — confirm logs are not exposed.
      console.error('error prehandling url w/scrapingbee', error)
      throw error
    }
  }
}