bypass cloudflare captcha
This commit is contained in:
@ -9,26 +9,7 @@ RUN apt-get update && apt-get install -y \
|
||||
yarn \
|
||||
g++ \
|
||||
make \
|
||||
python3 \
|
||||
libasound2 \
|
||||
libatk-bridge2.0-0 \
|
||||
libatk1.0-0 \
|
||||
libatspi2.0-0 \
|
||||
libcups2 \
|
||||
libdbus-1-3 \
|
||||
libdrm2 \
|
||||
libgbm1 \
|
||||
libgtk-3-0 \
|
||||
libnspr4 \
|
||||
libnss3 \
|
||||
libu2f-udev \
|
||||
libvulkan1 \
|
||||
libwayland-client0 \
|
||||
libxcomposite1 \
|
||||
libxdamage1 \
|
||||
libxfixes3 \
|
||||
libxkbcommon0 \
|
||||
libxrandr2
|
||||
python3
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@ -13,8 +13,6 @@
|
||||
"axios": "^1.4.0",
|
||||
"crypto": "^1.0.1",
|
||||
"dompurify": "^2.4.1",
|
||||
"fingerprint-generator": "^2.1.52",
|
||||
"fingerprint-injector": "^2.1.52",
|
||||
"linkedom": "^0.14.9",
|
||||
"puppeteer-core": "^22.12.1",
|
||||
"puppeteer-extra": "^3.3.6",
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
import { Browser } from 'puppeteer-core'
|
||||
import { Browser, Target } from 'puppeteer-core'
|
||||
import puppeteer from 'puppeteer-extra'
|
||||
import AdblockerPlugin from 'puppeteer-extra-plugin-adblocker'
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth'
|
||||
@ -26,12 +26,9 @@ export const getBrowser = async (): Promise<Browser> => {
|
||||
'--autoplay-policy=user-gesture-required',
|
||||
'--disable-component-update',
|
||||
'--disable-domain-reliability',
|
||||
'--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process',
|
||||
'--disable-print-preview',
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-site-isolation-trials',
|
||||
'--disable-speech-api',
|
||||
'--disk-cache-size=33554432',
|
||||
'--enable-features=SharedArrayBuffer',
|
||||
'--hide-scrollbars',
|
||||
'--mute-audio',
|
||||
@ -39,15 +36,12 @@ export const getBrowser = async (): Promise<Browser> => {
|
||||
'--no-pings',
|
||||
'--no-sandbox',
|
||||
'--no-zygote',
|
||||
'--window-size=1920,1080',
|
||||
'--disable-extensions',
|
||||
'--disable-dev-shm-usage',
|
||||
'--no-first-run',
|
||||
'--disable-background-networking',
|
||||
'--disable-gpu',
|
||||
'--disable-software-rasterizer',
|
||||
'--use-gl=angle',
|
||||
'--use-angle=swiftshader',
|
||||
],
|
||||
defaultViewport: {
|
||||
deviceScaleFactor: 1,
|
||||
@ -61,6 +55,7 @@ export const getBrowser = async (): Promise<Browser> => {
|
||||
headless: process.env.LAUNCH_HEADLESS === 'true',
|
||||
timeout: 10_000, // 10 seconds
|
||||
dumpio: true, // show console logs in the terminal
|
||||
targetFilter: (target: Target) => target.type() !== 'other',
|
||||
})) as Browser
|
||||
|
||||
const version = await browserInstance.version()
|
||||
|
||||
@ -2,17 +2,11 @@
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
|
||||
import { preHandleContent } from '@omnivore/content-handler'
|
||||
import axios from 'axios'
|
||||
import { newInjectedPage } from 'fingerprint-injector'
|
||||
import { parseHTML } from 'linkedom'
|
||||
import path from 'path'
|
||||
import { Page, Protocol } from 'puppeteer-core'
|
||||
import { getBrowser } from './browser'
|
||||
|
||||
const DESKTOP_USER_AGENT =
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
|
||||
const NON_BOT_DESKTOP_USER_AGENT =
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
|
||||
const NON_BOT_HOSTS = ['bloomberg.com', 'forbes.com']
|
||||
const NON_SCRIPT_HOSTS = ['medium.com', 'fastcompany.com', 'fortelabs.com']
|
||||
|
||||
const ALLOWED_CONTENT_TYPES = [
|
||||
@ -21,21 +15,6 @@ const ALLOWED_CONTENT_TYPES = [
|
||||
'text/plain',
|
||||
'application/pdf',
|
||||
]
|
||||
const REQUEST_TIMEOUT = 30000
|
||||
|
||||
const userAgentForUrl = (url: string) => {
|
||||
try {
|
||||
const u = new URL(url)
|
||||
for (const host of NON_BOT_HOSTS) {
|
||||
if (u.hostname.endsWith(host)) {
|
||||
return NON_BOT_DESKTOP_USER_AGENT
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
console.log('error getting user agent for url', url, e)
|
||||
}
|
||||
return DESKTOP_USER_AGENT
|
||||
}
|
||||
|
||||
const fetchContentWithScrapingBee = async (url: string) => {
|
||||
const response = await axios.get('https://app.scrapingbee.com/api/v1', {
|
||||
@ -46,7 +25,7 @@ const fetchContentWithScrapingBee = async (url: string) => {
|
||||
premium_proxy: 'true',
|
||||
country_code: 'us',
|
||||
},
|
||||
timeout: REQUEST_TIMEOUT,
|
||||
timeout: 10_000,
|
||||
})
|
||||
|
||||
const dom = parseHTML(response.data).document
|
||||
@ -238,15 +217,7 @@ async function retrievePage(
|
||||
}
|
||||
|
||||
const browser = await getBrowser()
|
||||
|
||||
const page = (await newInjectedPage(browser, {
|
||||
fingerprintOptions: {
|
||||
devices: ['desktop'],
|
||||
operatingSystems: ['linux'],
|
||||
browsers: ['chrome'],
|
||||
locales: ['en-US'],
|
||||
},
|
||||
})) as Page
|
||||
const page = await browser.newPage()
|
||||
|
||||
// Puppeteer fails during download of PDF files,
|
||||
// so record the failure and use those items
|
||||
@ -365,6 +336,8 @@ async function retrievePage(
|
||||
throw new Error('No response from page')
|
||||
}
|
||||
|
||||
await page.waitForSelector('body')
|
||||
|
||||
const finalUrl = response.url()
|
||||
const contentType = response.headers()['content-type']
|
||||
|
||||
|
||||
Reference in New Issue
Block a user