Improving Self-Hosting and Removing 3rd Party dependencies. (#4513)

* fix: Library Header layout shift

* Bump GitHub Actions versions.

* Self-Hosting Changes

* Fix Minio Environment Variable

* Just make pdfs successful, due to lack of PDFHandler

* Fix issue where flag was set wrong

* Added an NGINX Example file

* Add some documentation for self-hosting via Docker Compose

* Make some adjustments to Puppeteer due to failing sites.

* adjust timings

* Add start of Mail Service

* Fix Docker Files

* More email service stuff

* Add Guide to use Zapier for Email-Importing.

* Ensure that if no env is provided it uses the old email settings

* Add some instructions for self-hosted email

* Add SNS Endpoints for Mail Watcher

* Add steps and functionality for using SES and SNS for email

* Uncomment a few jobs.

* Added option for Firefox for parser. Was having issues with Chromium on Docker.

* Add missing space.

Co-authored-by: Russ Taylor <729694+russtaylor@users.noreply.github.com>

* Fix some wording on the Guide

* update browser extension to handle self-hosted instances

* add slight documentation to options page

* Fix MV

* Do raw handlers for Medium

* Fix images in Medium

* Update self-hosting/GUIDE.md

Co-authored-by: Mike Baker <1426795+mbaker3@users.noreply.github.com>

* Update Guide with other variables

* Add The Verge to JS-less handlers

* Update regex and image-proxy

* Update self-hosting/nginx/nginx.conf

Co-authored-by: Mike Baker <1426795+mbaker3@users.noreply.github.com>

* Update regex and image-proxy

* Update self-hosting/docker-compose/docker-compose.yml

Co-authored-by: Mike Baker <1426795+mbaker3@users.noreply.github.com>

* Fix Minio for Export

* Merge to main

* Update GUIDE with newer NGINX

* Update nginx config to include api/save route

* Enable Native PDF View for PDFS

* Enable Native PDF View for PDFS

* feat:lover packages test

* feat:working build

* feat:alpine build

* docs:api dockerfile docs

* Write a PDF.js wrapper to replace pspdfkit

* Revert changes for replication, set settings to have default mode

* build folder got removed due to gitignore on pdf

* Add Box shadow to pdf pages

* Add Toggle for Progress in PDFS, enabled native viewer toggle

* Update node version to LTS

* Update node version to LTS

* Fix Linting issues

* Fix Linting issues

* Make env variable nullable

* Add touchend listener for mobile

* Make changes to PDF for mobile

* fix(android): change serverUrl to selfhosted first

* feat:2 stage alpine content fetch

* feat:separated start script

* fix:changed to node 22

* Add back youtube functionality and add guide

* trigger build

* Fix cache issue on YouTube

* Allow empty AWS_S3_ENDPOINT

* Allow empty AWS_S3_ENDPOINT

* Add GHCR for all images

* Add GHCR For self hosting.

* Add GHCR For self hosting.

* Test prebuilt.

* Test prebuilt

* Test prebuilt...

* Fix web image

* Remove Web Image (For now)

* Move docker-compose to images

* Move docker-compose files to correct locations

* Remove the need for ARGS

* Update packages, and Typescript versions

* Fix

* Fix issues with build on Web

* Correct push

* Fix Linting issues

* Fix Trace import

* Add missing types

* Fix Tasks

* Add information into guide about self-build

* Fix issues with PDF Viewer

---------

Co-authored-by: keumky2 <keumky2@woowahan.com>
Co-authored-by: William Theaker <wtheaker@nvidia.com>
Co-authored-by: Russ Taylor <729694+russtaylor@users.noreply.github.com>
Co-authored-by: David Adams <david@dadams2.com>
Co-authored-by: Mike Baker <1426795+mbaker3@users.noreply.github.com>
Co-authored-by: m1xxos <66390094+m1xxos@users.noreply.github.com>
Co-authored-by: Adil <mr.adil777@gmail.com>
This commit is contained in:
Tom Rogers
2025-01-27 13:33:16 +01:00
committed by GitHub
parent 4cd5f2f02a
commit 4e582fb55d
339 changed files with 14859 additions and 11964 deletions

View File

@ -4,6 +4,9 @@
"project": "tsconfig.json"
},
"rules": {
"@typescript-eslint/no-unsafe-call": ["warn"],
"@typescript-eslint/no-unsafe-member-access": ["warn"],
"@typescript-eslint/no-unsafe-argument": ["warn"],
"@typescript-eslint/no-floating-promises": [
"error",
{

View File

@ -9,7 +9,7 @@
],
"dependencies": {
"@omnivore/content-handler": "1.0.0",
"puppeteer-core": "^22.12.1",
"puppeteer-core": "^23.6.1",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-adblocker": "^2.13.6",
"puppeteer-extra-plugin-stealth": "^2.11.2"

View File

@ -3,8 +3,10 @@ import puppeteer from 'puppeteer-extra'
import AdblockerPlugin from 'puppeteer-extra-plugin-adblocker'
import StealthPlugin from 'puppeteer-extra-plugin-stealth'
puppeteer.use(StealthPlugin())
puppeteer.use(AdblockerPlugin({ blockTrackers: true }))
if (process.env['USE_FIREFOX'] != 'true') {
puppeteer.use(StealthPlugin())
puppeteer.use(AdblockerPlugin({ blockTrackers: true }))
}
let browserInstance: Browser | null = null
@ -51,15 +53,22 @@ export const getBrowser = async (): Promise<Browser> => {
isMobile: false,
width: 1920,
},
executablePath: process.env.CHROMIUM_PATH,
ignoreHTTPSErrors: true,
executablePath:
process.env.USE_FIREFOX == 'true'
? process.env.FIREFOX_PATH
: process.env.CHROMIUM_PATH,
// run in shell mode if headless
headless: process.env.LAUNCH_HEADLESS === 'true' ? 'shell' : false,
timeout: 10_000, // 10 seconds
dumpio: true, // show console logs in the terminal
headless: true,
browser: process.env['USE_FIREFOX'] == 'true' ? 'firefox' : 'chrome',
product: process.env['USE_FIREFOX'] == 'true' ? 'firefox' : 'chrome',
timeout: 30000,
dumpio: true,
// filter out targets
targetFilter: (target: Target) =>
target.type() !== 'other' || !!target.url(),
})) as Browser
})) as unknown as Browser
const version = await browserInstance.version()
console.log('Browser started', version)

View File

@ -144,6 +144,52 @@ function getUrl(urlStr: string) {
return parsed.href
}
/**
 * Waits until the DOM of `page` has stopped mutating.
 *
 * A MutationObserver is installed in the page on `document.body` (falling
 * back to the document element when there is no body) and watches attribute
 * and child-list changes across the whole subtree. The returned promise
 * resolves as soon as `debounceMs` elapse without any mutation, or after
 * `timeoutMs` at the latest, whichever comes first. A page that never mutates
 * therefore resolves after one quiet window instead of the full timeout.
 *
 * @param page - Puppeteer page whose DOM should be observed.
 * @param timeoutMs - Upper bound, in milliseconds, on the total wait.
 * @param debounceMs - Length, in milliseconds, of the mutation-free window
 *   that counts as "settled".
 * @returns A promise that resolves (it does not reject on timeout) once the
 *   DOM is considered settled.
 */
const waitForDOMToSettle = (
  page: Page,
  timeoutMs = 5000,
  debounceMs = 1000
): Promise<void> =>
  page.evaluate(
    (timeoutMs: number, debounceMs: number) => {
      // This callback is handed to page.evaluate and runs in the browser, so
      // every helper it needs is defined inside it.
      const debounce = (func: (...args: unknown[]) => void, ms = 1000) => {
        // Browser and Node timers have different handle types; derive it.
        let pending: ReturnType<typeof setTimeout> | undefined
        console.log(`Debouncing in ${ms}`)
        return {
          // (Re)starts the quiet window; `func` fires if it is not restarted.
          trigger: (...args: unknown[]) => {
            console.log('in debounce, clearing timeout again')
            clearTimeout(pending)
            pending = setTimeout(() => {
              func(...args)
            }, ms)
          },
          // Drops a pending invocation, if any.
          cancel: () => {
            clearTimeout(pending)
          },
        }
      }

      return new Promise<void>((resolve) => {
        // Single teardown path so neither timer can outlive the promise.
        const finish = () => {
          observer.disconnect()
          clearTimeout(mainTimeout)
          quietWindow.cancel()
          resolve()
        }

        const mainTimeout = setTimeout(() => {
          console.log(
            'Timed out whilst waiting for DOM to settle. Using what we have.'
          )
          finish()
        }, timeoutMs)

        const quietWindow = debounce(finish, debounceMs)

        const observer = new MutationObserver(() => {
          quietWindow.trigger()
        })

        // Non-HTML documents may have no <body>; observing null would throw.
        const root = document.body ?? document.documentElement
        observer.observe(root, {
          attributes: true,
          childList: true,
          subtree: true,
        })

        // Arm the quiet window right away: without this a DOM that never
        // mutates would only resolve through the hard timeout.
        quietWindow.trigger()
      })
    },
    timeoutMs,
    debounceMs
  )
async function retrievePage(
url: string,
logRecord: Record<string, any>,
@ -177,105 +223,128 @@ async function retrievePage(
}
// set timezone for the page
if (timezone) {
await page.emulateTimezone(timezone)
}
const client = await page.createCDPSession()
const downloadPath = path.resolve('./download_dir/')
await client.send('Page.setDownloadBehavior', {
behavior: 'allow',
downloadPath,
})
// intercept request when response headers was received
await client.send('Network.setRequestInterception', {
patterns: [
{
urlPattern: '*',
resourceType: 'Document',
interceptionStage: 'HeadersReceived',
},
],
})
client.on(
'Network.requestIntercepted',
(e: Protocol.Network.RequestInterceptedEvent) => {
;(async () => {
const headers = e.responseHeaders || {}
const [contentType] = (
headers['content-type'] ||
headers['Content-Type'] ||
''
)
.toLowerCase()
.split(';')
const obj: Protocol.Network.ContinueInterceptedRequestRequest = {
interceptionId: e.interceptionId,
}
if (
e.responseStatusCode &&
e.responseStatusCode >= 200 &&
e.responseStatusCode < 300
) {
// We only check content-type on success responses
// as it doesn't matter what the content type is for things
// like redirects
if (contentType && !ALLOWED_CONTENT_TYPES.includes(contentType)) {
obj['errorReason'] = 'BlockedByClient'
}
}
try {
await client.send('Network.continueInterceptedRequest', obj)
} catch {
// ignore
}
})()
if (process.env['USE_FIREFOX'] !== 'true') {
if (timezone) {
await page.emulateTimezone(timezone)
}
)
const client = await page.createCDPSession()
const downloadPath = path.resolve('./download_dir/')
await client.send('Page.setDownloadBehavior', {
behavior: 'allow',
downloadPath,
})
// intercept request when response headers was received
await client.send('Network.setRequestInterception', {
patterns: [
{
urlPattern: '*',
resourceType: 'Document',
interceptionStage: 'HeadersReceived',
},
],
})
client.on(
'Network.requestIntercepted',
(e: Protocol.Network.RequestInterceptedEvent) => {
;(async () => {
const headers = e.responseHeaders || {}
const [contentType] = (
headers['content-type'] ||
headers['Content-Type'] ||
''
)
.toLowerCase()
.split(';')
const obj: Protocol.Network.ContinueInterceptedRequestRequest = {
interceptionId: e.interceptionId,
}
if (
e.responseStatusCode &&
e.responseStatusCode >= 200 &&
e.responseStatusCode < 300
) {
// We only check content-type on success responses
// as it doesn't matter what the content type is for things
// like redirects
if (contentType && !ALLOWED_CONTENT_TYPES.includes(contentType)) {
obj['errorReason'] = 'BlockedByClient'
}
}
try {
await client.send('Network.continueInterceptedRequest', obj)
} catch {
// ignore
}
})()
}
)
}
/*
* Disallow MathJax from running in Puppeteer and modifying the document,
* we shall instead run it in our frontend application to transform any
* mathjax content when present.
*/
await page.setRequestInterception(true)
let requestCount = 0
const failedRequests = new Set()
page.removeAllListeners('request')
page.on('request', (request) => {
;(async () => {
if (request.resourceType() === 'font') {
if (request.isInterceptResolutionHandled()) return
// since request.resourceType() is not Firefox-compatible, detect font files by URL instead.
if (request.url().toLowerCase().includes('.woff2')) {
// Disallow fonts from loading
return request.abort()
}
if (requestCount++ > 100) {
return request.abort()
}
if (
request.resourceType() === 'script' &&
request.url().toLowerCase().indexOf('mathjax') > -1
) {
if (failedRequests.has(request.url())) {
return request.abort()
}
if (request.url().toLowerCase().indexOf('mathjax') > -1) {
return request.abort()
}
await request.continue()
})()
})
await page.setRequestInterception(true)
page.on('response', (response) => {
if (!response.ok()) {
console.log('Failed request', response.url())
failedRequests.add(response.url())
}
if (response.headers()['content-type'] === 'application/pdf') {
lastPdfUrl = response.url()
}
})
console.log('Trying to load page, for 30 seconds')
const response = await page.goto(url, {
timeout: 30 * 1000,
waitUntil: ['networkidle0'],
waitUntil: ['load'],
})
console.log('Waited for content to load, waiting for DOM to settle.')
await waitForDOMToSettle(page)
// Just wait for a few seconds to allow the dom to resolve.
// await new Promise((r) => setTimeout(r, 2500))
if (!response) {
throw new Error('No response from page')
}