bypass cloudflare captcha

This commit is contained in:
Hongbo Wu
2024-07-10 14:43:47 +08:00
parent 73e180f43d
commit 75338f5927
5 changed files with 9 additions and 166 deletions

View File

@ -9,26 +9,7 @@ RUN apt-get update && apt-get install -y \
yarn \
g++ \
make \
python3 \
libasound2 \
libatk-bridge2.0-0 \
libatk1.0-0 \
libatspi2.0-0 \
libcups2 \
libdbus-1-3 \
libdrm2 \
libgbm1 \
libgtk-3-0 \
libnspr4 \
libnss3 \
libu2f-udev \
libvulkan1 \
libwayland-client0 \
libxcomposite1 \
libxdamage1 \
libxfixes3 \
libxkbcommon0 \
libxrandr2
python3
WORKDIR /app

View File

@ -13,8 +13,6 @@
"axios": "^1.4.0",
"crypto": "^1.0.1",
"dompurify": "^2.4.1",
"fingerprint-generator": "^2.1.52",
"fingerprint-injector": "^2.1.52",
"linkedom": "^0.14.9",
"puppeteer-core": "^22.12.1",
"puppeteer-extra": "^3.3.6",

View File

@ -1,4 +1,4 @@
import { Browser } from 'puppeteer-core'
import { Browser, Target } from 'puppeteer-core'
import puppeteer from 'puppeteer-extra'
import AdblockerPlugin from 'puppeteer-extra-plugin-adblocker'
import StealthPlugin from 'puppeteer-extra-plugin-stealth'
@ -26,12 +26,9 @@ export const getBrowser = async (): Promise<Browser> => {
'--autoplay-policy=user-gesture-required',
'--disable-component-update',
'--disable-domain-reliability',
'--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process',
'--disable-print-preview',
'--disable-setuid-sandbox',
'--disable-site-isolation-trials',
'--disable-speech-api',
'--disk-cache-size=33554432',
'--enable-features=SharedArrayBuffer',
'--hide-scrollbars',
'--mute-audio',
@ -39,15 +36,12 @@ export const getBrowser = async (): Promise<Browser> => {
'--no-pings',
'--no-sandbox',
'--no-zygote',
'--window-size=1920,1080',
'--disable-extensions',
'--disable-dev-shm-usage',
'--no-first-run',
'--disable-background-networking',
'--disable-gpu',
'--disable-software-rasterizer',
'--use-gl=angle',
'--use-angle=swiftshader',
],
defaultViewport: {
deviceScaleFactor: 1,
@ -61,6 +55,7 @@ export const getBrowser = async (): Promise<Browser> => {
headless: process.env.LAUNCH_HEADLESS === 'true',
timeout: 10_000, // 10 seconds
dumpio: true, // show console logs in the terminal
targetFilter: (target: Target) => target.type() !== 'other',
})) as Browser
const version = await browserInstance.version()

View File

@ -2,17 +2,11 @@
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
import { preHandleContent } from '@omnivore/content-handler'
import axios from 'axios'
import { newInjectedPage } from 'fingerprint-injector'
import { parseHTML } from 'linkedom'
import path from 'path'
import { Page, Protocol } from 'puppeteer-core'
import { getBrowser } from './browser'
const DESKTOP_USER_AGENT =
'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
const NON_BOT_DESKTOP_USER_AGENT =
'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
const NON_BOT_HOSTS = ['bloomberg.com', 'forbes.com']
const NON_SCRIPT_HOSTS = ['medium.com', 'fastcompany.com', 'fortelabs.com']
const ALLOWED_CONTENT_TYPES = [
@ -21,21 +15,6 @@ const ALLOWED_CONTENT_TYPES = [
'text/plain',
'application/pdf',
]
const REQUEST_TIMEOUT = 30000
/**
 * Picks the User-Agent header value to use when fetching `url`.
 *
 * Returns NON_BOT_DESKTOP_USER_AGENT when the URL's hostname is one of
 * NON_BOT_HOSTS or a subdomain of one, and DESKTOP_USER_AGENT otherwise.
 * A URL that cannot be parsed is logged and falls back to DESKTOP_USER_AGENT.
 *
 * @param url - URL of the page about to be fetched
 * @returns the user agent string to send
 */
const userAgentForUrl = (url: string) => {
  try {
    const { hostname } = new URL(url)
    // Match on a label boundary: a bare endsWith() would also treat
    // unrelated domains such as "notforbes.com" as "forbes.com".
    const isNonBotHost = NON_BOT_HOSTS.some(
      (host) => hostname === host || hostname.endsWith(`.${host}`)
    )
    if (isNonBotHost) {
      return NON_BOT_DESKTOP_USER_AGENT
    }
  } catch (e) {
    console.log('error getting user agent for url', url, e)
  }

  return DESKTOP_USER_AGENT
}
const fetchContentWithScrapingBee = async (url: string) => {
const response = await axios.get('https://app.scrapingbee.com/api/v1', {
@ -46,7 +25,7 @@ const fetchContentWithScrapingBee = async (url: string) => {
premium_proxy: 'true',
country_code: 'us',
},
timeout: REQUEST_TIMEOUT,
timeout: 10_000,
})
const dom = parseHTML(response.data).document
@ -238,15 +217,7 @@ async function retrievePage(
}
const browser = await getBrowser()
const page = (await newInjectedPage(browser, {
fingerprintOptions: {
devices: ['desktop'],
operatingSystems: ['linux'],
browsers: ['chrome'],
locales: ['en-US'],
},
})) as Page
const page = await browser.newPage()
// Puppeteer fails when downloading PDF files,
// so record the failure and use those items
@ -365,6 +336,8 @@ async function retrievePage(
throw new Error('No response from page')
}
await page.waitForSelector('body')
const finalUrl = response.url()
const contentType = response.headers()['content-type']