Improving Self-Hosting and Removing 3rd Party dependencies. (#4513)
* fix: Library Header layout shift * Bump Github Actions versions. * Self-Hosting Changes * Fix Minio Environment Variable * Just make pdfs successful, due to lack of PDFHandler * Fix issue where flag was set wrong * Added an NGINX Example file * Add some documentation for self-hosting via Docker Compose * Make some adjustments to Puppeteer due to failing sites. * adjust timings * Add start of Mail Service * Fix Docker Files * More email service stuff * Add Guide to use Zapier for Email-Importing. * Ensure that if no env is provided it uses the old email settings * Add some instructions for self-hosted email * Add SNS Endpoints for Mail Watcher * Add steps and functionality for using SES and SNS for email * Uncomment a few jobs. * Added option for Firefox for parser. Was having issues with Chromium on Docker. * Add missing space. Co-authored-by: Russ Taylor <729694+russtaylor@users.noreply.github.com> * Fix some wording on the Guide * update browser extension to handle self-hosted instances * add slight documentation to options page * Fix MV * Do raw handlers for Medium * Fix images in Medium * Update self-hosting/GUIDE.md Co-authored-by: Mike Baker <1426795+mbaker3@users.noreply.github.com> * Update Guide with other variables * Add The Verge to JS-less handlers * Update regex and image-proxy * Update self-hosting/nginx/nginx.conf Co-authored-by: Mike Baker <1426795+mbaker3@users.noreply.github.com> * Update regex and image-proxy * Update self-hosting/docker-compose/docker-compose.yml Co-authored-by: Mike Baker <1426795+mbaker3@users.noreply.github.com> * Fix Minio for Export * Merge to main * Update GUIDE with newer NGINX * Update nginx config to include api/save route * Enable Native PDF View for PDFS * Enable Native PDF View for PDFS * feat:lover packages test * feat:working build * feat:alpine build * docs:api dockerfile docs * Write a PDF.js wrapper to replace pspdfkit * Revert changes for replication, set settings to have default mode * build folder got 
removed due to gitignore on pdf * Add Box shadow to pdf pages * Add Toggle for Progress in PDFS, enabled native viewer toggle * Update node version to LTS * Update node version to LTS * Fix Linting issues * Fix Linting issues * Make env variable nullable * Add touchend listener for mobile * Make changes to PDF for mobile * fix(android): change serverUrl to selfhosted first * feat:2 stage alpine content fetch * feat:separated start script * fix:changed to node 22 * Add back youtube functionality and add guide * trigger build * Fix cache issue on YouTube * Allow empty AWS_S3_ENDPOINT * Allow empty AWS_S3_ENDPOINT * Add GCHR for all images * Add GCHR For self hosting. * Add GCHR For self hosting. * Test prebuilt. * Test prebuilt * Test prebuilt... * Fix web image * Remove Web Image (For now) * Move docker-compose to images * Move docker-compose files to correct locations * Remove the need for ARGS * Update packages, and Typescript versions * Fix * Fix issues with build on Web * Correct push * Fix Linting issues * Fix Trace import * Add missing types * Fix Tasks * Add information into guide about self-build * Fix issues with PDF Viewer --------- Co-authored-by: keumky2 <keumky2@woowahan.com> Co-authored-by: William Theaker <wtheaker@nvidia.com> Co-authored-by: Russ Taylor <729694+russtaylor@users.noreply.github.com> Co-authored-by: David Adams <david@dadams2.com> Co-authored-by: Mike Baker <1426795+mbaker3@users.noreply.github.com> Co-authored-by: m1xxos <66390094+m1xxos@users.noreply.github.com> Co-authored-by: Adil <mr.adil777@gmail.com>
This commit is contained in:
@ -4,6 +4,9 @@
|
||||
"project": "tsconfig.json"
|
||||
},
|
||||
"rules": {
|
||||
"@typescript-eslint/no-unsafe-call": ["warn"],
|
||||
"@typescript-eslint/no-unsafe-member-access": ["warn"],
|
||||
"@typescript-eslint/no-unsafe-argument": ["warn"],
|
||||
"@typescript-eslint/no-floating-promises": [
|
||||
"error",
|
||||
{
|
||||
|
||||
@ -9,7 +9,7 @@
|
||||
],
|
||||
"dependencies": {
|
||||
"@omnivore/content-handler": "1.0.0",
|
||||
"puppeteer-core": "^22.12.1",
|
||||
"puppeteer-core": "^23.6.1",
|
||||
"puppeteer-extra": "^3.3.6",
|
||||
"puppeteer-extra-plugin-adblocker": "^2.13.6",
|
||||
"puppeteer-extra-plugin-stealth": "^2.11.2"
|
||||
|
||||
@ -3,8 +3,10 @@ import puppeteer from 'puppeteer-extra'
|
||||
import AdblockerPlugin from 'puppeteer-extra-plugin-adblocker'
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth'
|
||||
|
||||
puppeteer.use(StealthPlugin())
|
||||
puppeteer.use(AdblockerPlugin({ blockTrackers: true }))
|
||||
if (process.env['USE_FIREFOX'] != 'true') {
|
||||
puppeteer.use(StealthPlugin())
|
||||
puppeteer.use(AdblockerPlugin({ blockTrackers: true }))
|
||||
}
|
||||
|
||||
let browserInstance: Browser | null = null
|
||||
|
||||
@ -51,15 +53,22 @@ export const getBrowser = async (): Promise<Browser> => {
|
||||
isMobile: false,
|
||||
width: 1920,
|
||||
},
|
||||
executablePath: process.env.CHROMIUM_PATH,
|
||||
ignoreHTTPSErrors: true,
|
||||
executablePath:
|
||||
process.env.USE_FIREFOX == 'true'
|
||||
? process.env.FIREFOX_PATH
|
||||
: process.env.CHROMIUM_PATH,
|
||||
// run in shell mode if headless
|
||||
headless: process.env.LAUNCH_HEADLESS === 'true' ? 'shell' : false,
|
||||
timeout: 10_000, // 10 seconds
|
||||
dumpio: true, // show console logs in the terminal
|
||||
headless: true,
|
||||
browser: process.env['USE_FIREFOX'] == 'true' ? 'firefox' : 'chrome',
|
||||
product: process.env['USE_FIREFOX'] == 'true' ? 'firefox' : 'chrome',
|
||||
timeout: 30000,
|
||||
dumpio: true,
|
||||
|
||||
// filter out targets
|
||||
targetFilter: (target: Target) =>
|
||||
target.type() !== 'other' || !!target.url(),
|
||||
})) as Browser
|
||||
})) as unknown as Browser
|
||||
|
||||
const version = await browserInstance.version()
|
||||
console.log('Browser started', version)
|
||||
|
||||
@ -144,6 +144,52 @@ function getUrl(urlStr: string) {
|
||||
return parsed.href
|
||||
}
|
||||
|
||||
/**
 * Waits for the DOM of `page` to stop changing.
 *
 * The callback passed to `page.evaluate` installs a MutationObserver that
 * watches attribute and child-list changes on the body and its whole subtree.
 * Every observed mutation (re)starts a debounce timer; the returned promise
 * resolves once `debounceMs` elapse without a further mutation. The debounce
 * is only started by a mutation, so when nothing ever mutates the promise
 * resolves through the overall timeout instead.
 *
 * @param page - page whose DOM is observed
 * @param timeoutMs - upper bound on the wait; when it elapses the observer is
 *   disconnected and the promise resolves with whatever the DOM contains
 * @param debounceMs - quiet period after the last mutation that counts as
 *   "settled"
 * @returns the result of `page.evaluate`, resolving with no value; the timeout
 *   path resolves rather than rejects
 */
const waitForDOMToSettle = (page: Page, timeoutMs = 5000, debounceMs = 1000) =>
  page.evaluate(
    (timeoutMs, debounceMs) => {
      // Delays `func` until `ms` have passed since the most recent call.
      const debounce = (func: (...args: unknown[]) => void, ms = 1000) => {
        // Typed via setTimeout itself: this callback is evaluated in the page,
        // where the handle is not a NodeJS.Timeout.
        let pending: ReturnType<typeof setTimeout> | undefined
        console.log(`Debouncing in ${ms}`)
        return (...args: unknown[]) => {
          console.log('in debounce, clearing timeout again')
          clearTimeout(pending)
          pending = setTimeout(() => {
            // Plain call: arrow functions have no `this` of their own to forward.
            func(...args)
          }, ms)
        }
      }

      return new Promise<void>((resolve) => {
        // Safety net: give up waiting after timeoutMs and use the DOM as-is.
        const mainTimeout = setTimeout(() => {
          observer.disconnect()
          console.log(
            'Timed out whilst waiting for DOM to settle. Using what we have.'
          )
          resolve()
        }, timeoutMs)

        // Runs once mutations have been quiet for debounceMs.
        const debouncedResolve = debounce(() => {
          observer.disconnect()
          clearTimeout(mainTimeout)
          resolve()
        }, debounceMs)

        const observer = new MutationObserver(() => {
          debouncedResolve()
        })

        const config = {
          attributes: true,
          childList: true,
          subtree: true,
        }

        // Fall back to the root element so a document without a <body>
        // does not make observe() throw.
        observer.observe(document.body || document.documentElement, config)
      })
    },
    timeoutMs,
    debounceMs
  )
|
||||
|
||||
async function retrievePage(
|
||||
url: string,
|
||||
logRecord: Record<string, any>,
|
||||
@ -177,105 +223,128 @@ async function retrievePage(
|
||||
}
|
||||
|
||||
// set timezone for the page
|
||||
if (timezone) {
|
||||
await page.emulateTimezone(timezone)
|
||||
}
|
||||
|
||||
const client = await page.createCDPSession()
|
||||
|
||||
const downloadPath = path.resolve('./download_dir/')
|
||||
await client.send('Page.setDownloadBehavior', {
|
||||
behavior: 'allow',
|
||||
downloadPath,
|
||||
})
|
||||
|
||||
// intercept request when response headers was received
|
||||
await client.send('Network.setRequestInterception', {
|
||||
patterns: [
|
||||
{
|
||||
urlPattern: '*',
|
||||
resourceType: 'Document',
|
||||
interceptionStage: 'HeadersReceived',
|
||||
},
|
||||
],
|
||||
})
|
||||
|
||||
client.on(
|
||||
'Network.requestIntercepted',
|
||||
(e: Protocol.Network.RequestInterceptedEvent) => {
|
||||
;(async () => {
|
||||
const headers = e.responseHeaders || {}
|
||||
|
||||
const [contentType] = (
|
||||
headers['content-type'] ||
|
||||
headers['Content-Type'] ||
|
||||
''
|
||||
)
|
||||
.toLowerCase()
|
||||
.split(';')
|
||||
const obj: Protocol.Network.ContinueInterceptedRequestRequest = {
|
||||
interceptionId: e.interceptionId,
|
||||
}
|
||||
|
||||
if (
|
||||
e.responseStatusCode &&
|
||||
e.responseStatusCode >= 200 &&
|
||||
e.responseStatusCode < 300
|
||||
) {
|
||||
// We only check content-type on success responses
|
||||
// as it doesn't matter what the content type is for things
|
||||
// like redirects
|
||||
if (contentType && !ALLOWED_CONTENT_TYPES.includes(contentType)) {
|
||||
obj['errorReason'] = 'BlockedByClient'
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
await client.send('Network.continueInterceptedRequest', obj)
|
||||
} catch {
|
||||
// ignore
|
||||
}
|
||||
})()
|
||||
if (process.env['USE_FIREFOX'] !== 'true') {
|
||||
if (timezone) {
|
||||
await page.emulateTimezone(timezone)
|
||||
}
|
||||
)
|
||||
|
||||
const client = await page.createCDPSession()
|
||||
|
||||
const downloadPath = path.resolve('./download_dir/')
|
||||
await client.send('Page.setDownloadBehavior', {
|
||||
behavior: 'allow',
|
||||
downloadPath,
|
||||
})
|
||||
|
||||
// intercept requests once the response headers have been received
|
||||
await client.send('Network.setRequestInterception', {
|
||||
patterns: [
|
||||
{
|
||||
urlPattern: '*',
|
||||
resourceType: 'Document',
|
||||
interceptionStage: 'HeadersReceived',
|
||||
},
|
||||
],
|
||||
})
|
||||
|
||||
client.on(
|
||||
'Network.requestIntercepted',
|
||||
(e: Protocol.Network.RequestInterceptedEvent) => {
|
||||
;(async () => {
|
||||
const headers = e.responseHeaders || {}
|
||||
|
||||
const [contentType] = (
|
||||
headers['content-type'] ||
|
||||
headers['Content-Type'] ||
|
||||
''
|
||||
)
|
||||
.toLowerCase()
|
||||
.split(';')
|
||||
const obj: Protocol.Network.ContinueInterceptedRequestRequest = {
|
||||
interceptionId: e.interceptionId,
|
||||
}
|
||||
|
||||
if (
|
||||
e.responseStatusCode &&
|
||||
e.responseStatusCode >= 200 &&
|
||||
e.responseStatusCode < 300
|
||||
) {
|
||||
// We only check content-type on success responses
|
||||
// as it doesn't matter what the content type is for things
|
||||
// like redirects
|
||||
if (contentType && !ALLOWED_CONTENT_TYPES.includes(contentType)) {
|
||||
obj['errorReason'] = 'BlockedByClient'
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
await client.send('Network.continueInterceptedRequest', obj)
|
||||
} catch {
|
||||
// ignore
|
||||
}
|
||||
})()
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
/*
|
||||
* Disallow MathJax from running in Puppeteer and modifying the document,
|
||||
* we shall instead run it in our frontend application to transform any
|
||||
* mathjax content when present.
|
||||
*/
|
||||
await page.setRequestInterception(true)
|
||||
|
||||
let requestCount = 0
|
||||
const failedRequests = new Set()
|
||||
page.removeAllListeners('request')
|
||||
page.on('request', (request) => {
|
||||
;(async () => {
|
||||
if (request.resourceType() === 'font') {
|
||||
if (request.isInterceptResolutionHandled()) return
|
||||
// since .resourceType() is not FF compatible, look for font files.
|
||||
if (request.url().toLowerCase().includes('.woff2')) {
|
||||
// Disallow fonts from loading
|
||||
return request.abort()
|
||||
}
|
||||
|
||||
if (requestCount++ > 100) {
|
||||
return request.abort()
|
||||
}
|
||||
if (
|
||||
request.resourceType() === 'script' &&
|
||||
request.url().toLowerCase().indexOf('mathjax') > -1
|
||||
) {
|
||||
|
||||
if (failedRequests.has(request.url())) {
|
||||
return request.abort()
|
||||
}
|
||||
|
||||
if (request.url().toLowerCase().indexOf('mathjax') > -1) {
|
||||
return request.abort()
|
||||
}
|
||||
|
||||
await request.continue()
|
||||
})()
|
||||
})
|
||||
await page.setRequestInterception(true)
|
||||
|
||||
page.on('response', (response) => {
|
||||
if (!response.ok()) {
|
||||
console.log('Failed request', response.url())
|
||||
failedRequests.add(response.url())
|
||||
}
|
||||
|
||||
if (response.headers()['content-type'] === 'application/pdf') {
|
||||
lastPdfUrl = response.url()
|
||||
}
|
||||
})
|
||||
|
||||
console.log('Trying to load page, for 30 seconds')
|
||||
|
||||
const response = await page.goto(url, {
|
||||
timeout: 30 * 1000,
|
||||
waitUntil: ['networkidle0'],
|
||||
waitUntil: ['load'],
|
||||
})
|
||||
|
||||
console.log('Waited for content to load, waiting for DOM to settle.')
|
||||
await waitForDOMToSettle(page)
|
||||
// Just wait for a few seconds to allow the dom to resolve.
|
||||
// await new Promise((r) => setTimeout(r, 2500))
|
||||
|
||||
if (!response) {
|
||||
throw new Error('No response from page')
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user