From 9606cd6b28f6df90237b13a90da82c0424dd04cc Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Thu, 12 May 2022 16:32:22 +0800 Subject: [PATCH 1/7] Remove chrome-aws-lambda dependencies --- packages/content-fetch/fetch-content.js | 29 ++- packages/content-fetch/package.json | 5 - packages/puppeteer-parse/index.js | 31 ++- packages/puppeteer-parse/package.json | 2 - yarn.lock | 290 +++--------------------- 5 files changed, 83 insertions(+), 274 deletions(-) diff --git a/packages/content-fetch/fetch-content.js b/packages/content-fetch/fetch-content.js index 38c19fcf1..fea99d59b 100644 --- a/packages/content-fetch/fetch-content.js +++ b/packages/content-fetch/fetch-content.js @@ -6,7 +6,6 @@ require('dotenv').config(); const Url = require('url'); const puppeteer = require('puppeteer-extra'); -const chromium = require('chrome-aws-lambda'); const axios = require('axios'); const jwt = require('jsonwebtoken'); const { promisify } = require('util'); @@ -50,10 +49,34 @@ const userAgentForUrl = (url) => { // launch Puppeteer const getBrowserPromise = (async () => { return puppeteer.launch({ - args: chromium.args, + args: [ + '--allow-running-insecure-content', + '--autoplay-policy=user-gesture-required', + '--disable-component-update', + '--disable-domain-reliability', + '--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process', + '--disable-print-preview', + '--disable-setuid-sandbox', + '--disable-site-isolation-trials', + '--disable-speech-api', + '--disable-web-security', + '--disk-cache-size=33554432', + '--enable-features=SharedArrayBuffer', + '--hide-scrollbars', + '--ignore-gpu-blocklist', + '--in-process-gpu', + '--mute-audio', + '--no-default-browser-check', + '--no-pings', + '--no-sandbox', + '--no-zygote', + '--use-gl=swiftshader', + '--window-size=1920,1080', // https://source.chromium.org/search?q=lang:cpp+symbol:kWindowSize&ss=chromium + process.env.LAUNCH_HEADLESS ? 
'--single-process' : '--start-maximized', + ], defaultViewport: { height: 1080, width: 1920 }, executablePath: process.env.CHROMIUM_PATH , - headless: true, // process.env.LAUNCH_HEADLESS ? true : false, + headless: !!process.env.LAUNCH_HEADLESS, timeout: 0, userDataDir: '/tmp/puppeteer', }); diff --git a/packages/content-fetch/package.json b/packages/content-fetch/package.json index 7524176a8..aea394edc 100644 --- a/packages/content-fetch/package.json +++ b/packages/content-fetch/package.json @@ -4,18 +4,13 @@ "description": "Service that fetches page content from a URL", "main": "index.js", "dependencies": { - "@cliqz/adblocker-puppeteer": "^1.23.7", - "ad-block-js": "^0.0.2", "axios": "^0.26.0", "dotenv": "^8.2.0", - "chrome-aws-lambda": "^10.1.0", "express": "^4.17.1", - "jsdom": "^19.0.0", "jsonwebtoken": "^8.5.1", "luxon": "^2.3.1", "puppeteer-core": "^13.7.0", "puppeteer-extra": "^3.2.3", - "puppeteer-extra-plugin-adblocker": "^2.12.0", "puppeteer-extra-plugin-stealth": "^2.9.0" }, "scripts": { diff --git a/packages/puppeteer-parse/index.js b/packages/puppeteer-parse/index.js index 56a64afc2..692d0af53 100644 --- a/packages/puppeteer-parse/index.js +++ b/packages/puppeteer-parse/index.js @@ -5,7 +5,6 @@ /* eslint-disable @typescript-eslint/no-require-imports */ require('dotenv').config(); const Url = require('url'); -const chromium = require('chrome-aws-lambda'); const axios = require('axios'); const jwt = require('jsonwebtoken'); const { promisify } = require('util'); @@ -125,10 +124,34 @@ const userAgentForUrl = (url) => { // launch Puppeteer const getBrowserPromise = (async () => { return puppeteer.launch({ - args: chromium.args, + args: [ + '--allow-running-insecure-content', + '--autoplay-policy=user-gesture-required', + '--disable-component-update', + '--disable-domain-reliability', + '--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process', + '--disable-print-preview', + '--disable-setuid-sandbox', + 
'--disable-site-isolation-trials', + '--disable-speech-api', + '--disable-web-security', + '--disk-cache-size=33554432', + '--enable-features=SharedArrayBuffer', + '--hide-scrollbars', + '--ignore-gpu-blocklist', + '--in-process-gpu', + '--mute-audio', + '--no-default-browser-check', + '--no-pings', + '--no-sandbox', + '--no-zygote', + '--use-gl=swiftshader', + '--window-size=1920,1080', // https://source.chromium.org/search?q=lang:cpp+symbol:kWindowSize&ss=chromium + process.env.LAUNCH_HEADLESS ? '--single-process' : '--start-maximized', + ], defaultViewport: { height: 1080, width: 1920 }, - executablePath: process.env.CHROMIUM_PATH || (await chromium.executablePath), - headless: process.env.LAUNCH_HEADLESS ? true : chromium.headless, + executablePath: process.env.CHROMIUM_PATH, + headless: !!process.env.LAUNCH_HEADLESS, timeout: 0, userDataDir: '/tmp/puppeteer', }); diff --git a/packages/puppeteer-parse/package.json b/packages/puppeteer-parse/package.json index e37537415..3c778b9a6 100644 --- a/packages/puppeteer-parse/package.json +++ b/packages/puppeteer-parse/package.json @@ -8,9 +8,7 @@ "@google-cloud/storage": "^5.18.1", "@sentry/serverless": "^6.13.3", "axios": "^0.26.0", - "chrome-aws-lambda": "^10.1.0", "dotenv": "^8.2.0", - "jsdom": "^19.0.0", "jsonwebtoken": "^8.5.1", "luxon": "^2.3.1", "puppeteer-core": "^13.7.0", diff --git a/yarn.lock b/yarn.lock index 3eee80b2b..e01b2f966 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2017,50 +2017,6 @@ commander "^4.1.0" microtime "^3.0.0" -"@cliqz/adblocker-content@^1.22.6", "@cliqz/adblocker-content@^1.23.7": - version "1.23.7" - resolved "https://registry.yarnpkg.com/@cliqz/adblocker-content/-/adblocker-content-1.23.7.tgz#21a1035e479d8f4dc4e7ecc2500cf8a4149eaace" - integrity sha512-tgCFcWhBty+WI3ObYBLDk56i1G7nScd5TBAOXlQf2EIbVJ+DC+G0YE91YHhQpzfy4ySI/wRGyEHrKzTRgmkuKQ== - dependencies: - "@cliqz/adblocker-extended-selectors" "^1.23.7" - -"@cliqz/adblocker-extended-selectors@^1.23.7": - version "1.23.7" - resolved 
"https://registry.yarnpkg.com/@cliqz/adblocker-extended-selectors/-/adblocker-extended-selectors-1.23.7.tgz#4782829c97b7ed229789a14d4e39b2339825077d" - integrity sha512-3y1eeHHZS3qvvv3dDZDjVVr6g03GeUBbDExCW/GOiHdo+qKYmM2K40EaLN+Upzc2ppuum9UKvzsHxG8Hs9R/kQ== - -"@cliqz/adblocker-puppeteer@1.22.6": - version "1.22.6" - resolved "https://registry.yarnpkg.com/@cliqz/adblocker-puppeteer/-/adblocker-puppeteer-1.22.6.tgz#6b311c74ab8f144ab71f2c259b9e49fa76243e9b" - integrity sha512-Yg+64gsBfG8NKIJTKRg+sgK8G32W/z4qNEoMGdGJc7mdKVCn+y93WklDMO3pCy64u9jqUVS/Rd7z/Z96dX3K8Q== - dependencies: - "@cliqz/adblocker" "^1.22.6" - "@cliqz/adblocker-content" "^1.22.6" - tldts-experimental "^5.6.21" - -"@cliqz/adblocker-puppeteer@^1.23.7": - version "1.23.7" - resolved "https://registry.yarnpkg.com/@cliqz/adblocker-puppeteer/-/adblocker-puppeteer-1.23.7.tgz#18367ddefa603d9abf03a72e6e4ecb584430df1d" - integrity sha512-n6xPyRHApAirFVFowK5ngxTOF8ilV/QTNxzHjhmEtJXxiLD55Q4UrwgH7mj22wDhSkOKWUjY6CeCaUdYYb9JVA== - dependencies: - "@cliqz/adblocker" "^1.23.7" - "@cliqz/adblocker-content" "^1.23.7" - tldts-experimental "^5.6.21" - -"@cliqz/adblocker@^1.22.6", "@cliqz/adblocker@^1.23.7": - version "1.23.7" - resolved "https://registry.yarnpkg.com/@cliqz/adblocker/-/adblocker-1.23.7.tgz#d439fe8b62d2789c274b5e124cddb930de25f59a" - integrity sha512-HUvC7CcmNbcIftcRhFeuQhHSpyNdOVdXazXDL0+avvWR0cxlI9zcC8yHMRKjxeY6nsiOXdyEFby+V5KgJvfwaQ== - dependencies: - "@cliqz/adblocker-content" "^1.23.7" - "@cliqz/adblocker-extended-selectors" "^1.23.7" - "@remusao/guess-url-type" "^1.1.2" - "@remusao/small" "^1.1.2" - "@remusao/smaz" "^1.7.1" - "@types/chrome" "^0.0.180" - "@types/firefox-webext-browser" "^94.0.0" - tldts-experimental "^5.6.21" - "@cnakazawa/watch@^1.0.3": version "1.0.4" resolved "https://registry.yarnpkg.com/@cnakazawa/watch/-/watch-1.0.4.tgz#f864ae85004d0fcab6f50be9141c4da368d1656a" @@ -4571,6 +4527,22 @@ "@radix-ui/react-use-callback-ref" "0.1.0" "@radix-ui/react-use-layout-effect" "0.1.0" 
+"@radix-ui/react-checkbox@^0.1.5": + version "0.1.5" + resolved "https://registry.yarnpkg.com/@radix-ui/react-checkbox/-/react-checkbox-0.1.5.tgz#3a6bd54ba1720c8e5c03852acf460e35dfbe9da3" + integrity sha512-M8Y4dSXsKSbF+FryG5VvZKr/1MukMVG7swq9p5s7wYb8Rvn0UM0rQ5w8BWmSWSV4BL/gbJdhwVCznwXXlgZRZg== + dependencies: + "@babel/runtime" "^7.13.10" + "@radix-ui/primitive" "0.1.0" + "@radix-ui/react-compose-refs" "0.1.0" + "@radix-ui/react-context" "0.1.1" + "@radix-ui/react-label" "0.1.5" + "@radix-ui/react-presence" "0.1.2" + "@radix-ui/react-primitive" "0.1.4" + "@radix-ui/react-use-controllable-state" "0.1.0" + "@radix-ui/react-use-previous" "0.1.1" + "@radix-ui/react-use-size" "0.1.1" + "@radix-ui/react-collection@0.1.4": version "0.1.4" resolved "https://registry.yarnpkg.com/@radix-ui/react-collection/-/react-collection-0.1.4.tgz#734061ffd5bb93e88889d49b87391a73a63824c9" @@ -4700,6 +4672,17 @@ "@babel/runtime" "^7.13.10" "@radix-ui/react-use-layout-effect" "0.1.0" +"@radix-ui/react-label@0.1.5": + version "0.1.5" + resolved "https://registry.yarnpkg.com/@radix-ui/react-label/-/react-label-0.1.5.tgz#12cd965bfc983e0148121d4c99fb8e27a917c45c" + integrity sha512-Au9+n4/DhvjR0IHhvZ1LPdx/OW+3CGDie30ZyCkbSHIuLp4/CV4oPPGBwJ1vY99Jog3zyQhsGww9MXj8O9Aj/A== + dependencies: + "@babel/runtime" "^7.13.10" + "@radix-ui/react-compose-refs" "0.1.0" + "@radix-ui/react-context" "0.1.1" + "@radix-ui/react-id" "0.1.5" + "@radix-ui/react-primitive" "0.1.4" + "@radix-ui/react-menu@0.1.6": version "0.1.6" resolved "https://registry.yarnpkg.com/@radix-ui/react-menu/-/react-menu-0.1.6.tgz#7f9521a10f6a9cd819b33b33d5ed9538d79b2e75" @@ -4976,41 +4959,6 @@ dependencies: "@babel/runtime" "^7.13.10" -"@remusao/guess-url-type@^1.1.2": - version "1.2.1" - resolved "https://registry.yarnpkg.com/@remusao/guess-url-type/-/guess-url-type-1.2.1.tgz#b3e7c32abdf98d0fb4f93cc67cad580b5fe4ba57" - integrity sha512-rbOqre2jW8STjheOsOaQHLgYBaBZ9Owbdt8NO7WvNZftJlaG3y/K9oOkl8ZUpuFBisIhmBuMEW6c+YrQl5inRA== - 
-"@remusao/small@^1.1.2": - version "1.2.1" - resolved "https://registry.yarnpkg.com/@remusao/small/-/small-1.2.1.tgz#63bfe4548832289f94ac868a0c305970c9a0e5f9" - integrity sha512-7MjoGt0TJMVw1GPKgWq6SJPws1SLsUXQRa43Umht+nkyw2jnpy3WpiLNqGdwo5rHr5Wp9B2W/Pm5RQp656UJdw== - -"@remusao/smaz-compress@^1.9.1": - version "1.9.1" - resolved "https://registry.yarnpkg.com/@remusao/smaz-compress/-/smaz-compress-1.9.1.tgz#fc75eaf9bcac2d58bc4c3d518183a7cb9612d275" - integrity sha512-E2f48TwloQu3r6BdLOGF2aczeH7bJ/32oJGqvzT9SKur0cuUnLcZ7ZXP874E2fwmdE+cXzfC7bKzp79cDnmeyw== - dependencies: - "@remusao/trie" "^1.4.1" - -"@remusao/smaz-decompress@^1.9.1": - version "1.9.1" - resolved "https://registry.yarnpkg.com/@remusao/smaz-decompress/-/smaz-decompress-1.9.1.tgz#8094f997e8fb591a678cda9cf08c209c825eba5b" - integrity sha512-TfjKKprYe3n47od8auhvJ/Ikj9kQTbDTe71ynKlxslrvvUhlIV3VQSuwYuMWMbdz1fIs0H/fxCN1Z8/H3km6/A== - -"@remusao/smaz@^1.7.1": - version "1.9.1" - resolved "https://registry.yarnpkg.com/@remusao/smaz/-/smaz-1.9.1.tgz#a2b9b045385f81e1615a68d932b7cc8b04c9db8d" - integrity sha512-e6BLuP8oaXCZ9+v46Is4ilAZ/Vq6YLgmBP204Ixgk1qTjXmqvFYG7+AS7v9nsZdGOy96r9DWGFbbDVgMxwu1rA== - dependencies: - "@remusao/smaz-compress" "^1.9.1" - "@remusao/smaz-decompress" "^1.9.1" - -"@remusao/trie@^1.4.1": - version "1.4.1" - resolved "https://registry.yarnpkg.com/@remusao/trie/-/trie-1.4.1.tgz#755d09f8a007476334e611f42719b2d581f00720" - integrity sha512-yvwa+aCyYI/UjeD39BnpMypG8N06l86wIDW1/PAc6ihBRnodIfZDwccxQN3n1t74wduzaz74m4ZMHZnB06567Q== - "@rushstack/eslint-patch@^1.0.8": version "1.1.0" resolved "https://registry.yarnpkg.com/@rushstack/eslint-patch/-/eslint-patch-1.1.0.tgz#7f698254aadf921e48dda8c0a6b304026b8a9323" @@ -7434,14 +7382,6 @@ resolved "https://registry.yarnpkg.com/@types/chai/-/chai-4.2.21.tgz#9f35a5643129df132cf3b5c1ec64046ea1af0650" integrity sha512-yd+9qKmJxm496BOV9CMNaey8TWsikaZOwMRwPHQIjcOJM9oV+fi9ZMNw3JsVnbEEbo2gRTDnGEBv8pjyn67hNg== -"@types/chrome@^0.0.180": - version "0.0.180" - 
resolved "https://registry.yarnpkg.com/@types/chrome/-/chrome-0.0.180.tgz#b7fb831848f6b6c49abffd85410b73037c02381e" - integrity sha512-A/CuuKAaHq2CHEpYBAtl0lp2ib7TTXK7VjJI4q+c+1U/HDvZLJ8IlsdEDzzHdvkNdh36bGONxrMnO9YZrKqbAw== - dependencies: - "@types/filesystem" "*" - "@types/har-format" "*" - "@types/color-convert@^2.0.0": version "2.0.0" resolved "https://registry.yarnpkg.com/@types/color-convert/-/color-convert-2.0.0.tgz#8f5ee6b9e863dcbee5703f5a517ffb13d3ea4e22" @@ -7581,28 +7521,11 @@ "@types/qs" "*" "@types/serve-static" "*" -"@types/filesystem@*": - version "0.0.32" - resolved "https://registry.yarnpkg.com/@types/filesystem/-/filesystem-0.0.32.tgz#307df7cc084a2293c3c1a31151b178063e0a8edf" - integrity sha512-Yuf4jR5YYMR2DVgwuCiP11s0xuVRyPKmz8vo6HBY3CGdeMj8af93CFZX+T82+VD1+UqHOxTq31lO7MI7lepBtQ== - dependencies: - "@types/filewriter" "*" - -"@types/filewriter@*": - version "0.0.29" - resolved "https://registry.yarnpkg.com/@types/filewriter/-/filewriter-0.0.29.tgz#a48795ecadf957f6c0d10e0c34af86c098fa5bee" - integrity sha512-BsPXH/irW0ht0Ji6iw/jJaK8Lj3FJemon2gvEqHKpCdDCeemHa+rI3WBGq5z7cDMZgoLjY40oninGxqk+8NzNQ== - "@types/fined@*": version "1.1.3" resolved "https://registry.yarnpkg.com/@types/fined/-/fined-1.1.3.tgz#83f03e8f0a8d3673dfcafb18fce3571f6250e1bc" integrity sha512-CWYnSRnun3CGbt6taXeVo2lCbuaj4mchVJ4UF/BdU5TSuIn3AmS13pGMwCsBUoehGbhZrBrpNJZSZI5EVilXww== -"@types/firefox-webext-browser@^94.0.0": - version "94.0.1" - resolved "https://registry.yarnpkg.com/@types/firefox-webext-browser/-/firefox-webext-browser-94.0.1.tgz#52afb975253dc0fd350d5d58c7fe9fd1a01f64a1" - integrity sha512-I6iHRQJSTZ+gYt2IxdH2RRAMvcUyK8v5Ig7fHQR0IwUNYP7hz9+cziBVIKxLCO6XI7fiyRsNOWObfl3/4Js2Lg== - "@types/glob@*": version "7.2.0" resolved "https://registry.yarnpkg.com/@types/glob/-/glob-7.2.0.tgz#bc1b5bf3aa92f25bd5dd39f35c57361bdce5b2eb" @@ -7633,11 +7556,6 @@ dependencies: graphql "*" -"@types/har-format@*": - version "1.2.8" - resolved 
"https://registry.yarnpkg.com/@types/har-format/-/har-format-1.2.8.tgz#e6908b76d4c88be3db642846bb8b455f0bfb1c4e" - integrity sha512-OP6L9VuZNdskgNN3zFQQ54ceYD8OLq5IbqO4VK91ORLfOm7WdT/CiT/pHEBSQEqCInJ2y3O6iCm/zGtPElpgJQ== - "@types/hast@^2.0.0": version "2.3.4" resolved "https://registry.yarnpkg.com/@types/hast/-/hast-2.3.4.tgz#8aa5ef92c117d20d974a82bdfb6a648b08c0bafc" @@ -8723,11 +8641,6 @@ acorn@^8.0.4, acorn@^8.2.4, acorn@^8.4.1, acorn@^8.5.0, acorn@^8.7.0: resolved "https://registry.yarnpkg.com/acorn/-/acorn-8.7.0.tgz#90951fde0f8f09df93549481e5fc141445b791cf" integrity sha512-V/LGr1APy+PXIwKebEWrkZPwoeoF+w1jiOBUmuxuiUIaOHtob8Qc9BTrYo7VuI5fR8tqsy+buA2WFooR5olqvQ== -ad-block-js@^0.0.2: - version "0.0.2" - resolved "https://registry.yarnpkg.com/ad-block-js/-/ad-block-js-0.0.2.tgz#294196a1fcc40881bd86abdbad288e20c516cd81" - integrity sha512-1n71M3WihSgFoaQ6S+LL9L3YFfCRtvNfk/yvqY8MAn66njWOJo2s6f/TYBOS2gLbyXuxEUqq9n6slUHi/xQIHg== - add-stream@^1.0.0: version "1.0.0" resolved "https://registry.yarnpkg.com/add-stream/-/add-stream-1.0.0.tgz#6a7990437ca736d5e1288db92bd3266d5f5cb2aa" @@ -10634,13 +10547,6 @@ chownr@^2.0.0: resolved "https://registry.yarnpkg.com/chownr/-/chownr-2.0.0.tgz#15bfbe53d2eab4cf70f18a8cd68ebe5b3cb1dece" integrity sha512-bIomtDF5KGpdogkLd9VspvFzk9KfpyyGlS8YFVZl7TGPBHL5snIOnxeshwVgPteQ9b4Eydl+pVbIyE1DcvCWgQ== -chrome-aws-lambda@^10.1.0: - version "10.1.0" - resolved "https://registry.yarnpkg.com/chrome-aws-lambda/-/chrome-aws-lambda-10.1.0.tgz#ac43b4cdfc1fbb2275c62effada560858099501e" - integrity sha512-NZQVf+J4kqG4sVhRm3WNmOfzY0OtTSm+S8rg77pwePa9RCYHzhnzRs8YvNI6L9tALIW6RpmefWiPURt3vURXcw== - dependencies: - lambdafs "^2.0.3" - chrome-trace-event@^1.0.2: version "1.0.3" resolved "https://registry.yarnpkg.com/chrome-trace-event/-/chrome-trace-event-1.0.3.tgz#1015eced4741e15d06664a957dbbf50d041e26ac" @@ -11769,15 +11675,6 @@ data-urls@^2.0.0: whatwg-mimetype "^2.3.0" whatwg-url "^8.0.0" -data-urls@^3.0.1: - version "3.0.1" - resolved 
"https://registry.yarnpkg.com/data-urls/-/data-urls-3.0.1.tgz#597fc2ae30f8bc4dbcf731fcd1b1954353afc6f8" - integrity sha512-Ds554NeT5Gennfoo9KN50Vh6tpgtvYEwraYjejXnyTpu1C7oXKxdFk75REooENHE8ndTVOJuv+BEs4/J/xcozw== - dependencies: - abab "^2.0.3" - whatwg-mimetype "^3.0.0" - whatwg-url "^10.0.0" - dataloader@2.0.0, dataloader@^2.0.0: version "2.0.0" resolved "https://registry.yarnpkg.com/dataloader/-/dataloader-2.0.0.tgz#41eaf123db115987e21ca93c005cd7753c55fe6f" @@ -11883,7 +11780,7 @@ decamelize@^4.0.0: resolved "https://registry.yarnpkg.com/decamelize/-/decamelize-4.0.0.tgz#aa472d7bf660eb15f3494efd531cab7f2a709837" integrity sha512-9iE1PgSik9HeIIw2JO94IidnE3eBoQrFJ3w7sFuzSX4DpmZ3v5sZpUiV5Swcf6mQEF+Y0ru8Neo+p+nyh2J+hQ== -decimal.js@^10.2.1, decimal.js@^10.3.1: +decimal.js@^10.2.1: version "10.3.1" resolved "https://registry.yarnpkg.com/decimal.js/-/decimal.js-10.3.1.tgz#d8c3a444a9c6774ba60ca6ad7261c3a94fd5e783" integrity sha512-V0pfhfr8suzyPGOx3nmq4aHqabehUZn6Ch9kyFpV79TGDTWFmHqUqXdabR7QHqxzrYolF4+tVmJhUG4OURg5dQ== @@ -12308,13 +12205,6 @@ domexception@^2.0.1: dependencies: webidl-conversions "^5.0.0" -domexception@^4.0.0: - version "4.0.0" - resolved "https://registry.yarnpkg.com/domexception/-/domexception-4.0.0.tgz#4ad1be56ccadc86fc76d033353999a8037d03673" - integrity sha512-A2is4PLG+eeSfoTMA95/s4pvAoSo2mKtiM5jlHkAVewmiO8ISFTFKZjH7UAM1Atli/OT/7JHOrJRJiMKUZKYBw== - dependencies: - webidl-conversions "^7.0.0" - domhandler@^3.0.0: version "3.3.0" resolved "https://registry.yarnpkg.com/domhandler/-/domhandler-3.3.0.tgz#6db7ea46e4617eb15cf875df68b2b8524ce0037a" @@ -15010,13 +14900,6 @@ html-encoding-sniffer@^2.0.1: dependencies: whatwg-encoding "^1.0.5" -html-encoding-sniffer@^3.0.0: - version "3.0.0" - resolved "https://registry.yarnpkg.com/html-encoding-sniffer/-/html-encoding-sniffer-3.0.0.tgz#2cb1a8cf0db52414776e5b2a7a04d5dd98158de9" - integrity sha512-oWv4T4yJ52iKrufjnyZPkrN0CH3QnrUqdB6In1g5Fe1mia8GmF36gnfNySxoZtxD5+NmYw1EElVXiBk93UeskA== - dependencies: - 
whatwg-encoding "^2.0.0" - html-entities@^2.1.0: version "2.3.3" resolved "https://registry.yarnpkg.com/html-entities/-/html-entities-2.3.3.tgz#117d7626bece327fc8baace8868fa6f5ef856e46" @@ -15298,13 +15181,6 @@ iconv-lite@0.4.24, iconv-lite@^0.4.24: dependencies: safer-buffer ">= 2.1.2 < 3" -iconv-lite@0.6.3: - version "0.6.3" - resolved "https://registry.yarnpkg.com/iconv-lite/-/iconv-lite-0.6.3.tgz#a52f80bf38da1952eb5c681790719871a1a72501" - integrity sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw== - dependencies: - safer-buffer ">= 2.1.2 < 3.0.0" - iconv-lite@^0.6.2: version "0.6.2" resolved "https://registry.yarnpkg.com/iconv-lite/-/iconv-lite-0.6.2.tgz#ce13d1875b0c3a674bd6a04b7f76b01b1b6ded01" @@ -16898,39 +16774,6 @@ jsdom@^16.6.0: ws "^7.4.6" xml-name-validator "^3.0.0" -jsdom@^19.0.0: - version "19.0.0" - resolved "https://registry.yarnpkg.com/jsdom/-/jsdom-19.0.0.tgz#93e67c149fe26816d38a849ea30ac93677e16b6a" - integrity sha512-RYAyjCbxy/vri/CfnjUWJQQtZ3LKlLnDqj+9XLNnJPgEGeirZs3hllKR20re8LUZ6o1b1X4Jat+Qd26zmP41+A== - dependencies: - abab "^2.0.5" - acorn "^8.5.0" - acorn-globals "^6.0.0" - cssom "^0.5.0" - cssstyle "^2.3.0" - data-urls "^3.0.1" - decimal.js "^10.3.1" - domexception "^4.0.0" - escodegen "^2.0.0" - form-data "^4.0.0" - html-encoding-sniffer "^3.0.0" - http-proxy-agent "^5.0.0" - https-proxy-agent "^5.0.0" - is-potential-custom-element-name "^1.0.1" - nwsapi "^2.2.0" - parse5 "6.0.1" - saxes "^5.0.1" - symbol-tree "^3.2.4" - tough-cookie "^4.0.0" - w3c-hr-time "^1.0.2" - w3c-xmlserializer "^3.0.0" - webidl-conversions "^7.0.0" - whatwg-encoding "^2.0.0" - whatwg-mimetype "^3.0.0" - whatwg-url "^10.0.0" - ws "^8.2.3" - xml-name-validator "^4.0.0" - jsesc@^2.5.1: version "2.5.2" resolved "https://registry.yarnpkg.com/jsesc/-/jsesc-2.5.2.tgz#80564d2e483dacf6e8ef209650a67df3f0c283a4" @@ -17261,13 +17104,6 @@ kuler@^2.0.0: resolved 
"https://registry.yarnpkg.com/kuler/-/kuler-2.0.0.tgz#e2c570a3800388fb44407e851531c1d670b061b3" integrity sha512-Xq9nH7KlWZmXAtodXDDRE7vs6DU1gTU8zYDHDiWLSip45Egwq3plLHzPn27NgvzL2r1LMPC1vdqh98sQxtqj4A== -lambdafs@^2.0.3: - version "2.1.1" - resolved "https://registry.yarnpkg.com/lambdafs/-/lambdafs-2.1.1.tgz#4bf8d3037b6c61bbb4a22ab05c73ee47964c25ed" - integrity sha512-x5k8JcoJWkWLvCVBzrl4pzvkEHSgSBqFjg3Dpsc4AcTMq7oUMym4cL/gRTZ6VM4mUMY+M0dIbQ+V1c1tsqqanQ== - dependencies: - tar-fs "^2.1.1" - language-subtag-registry@~0.3.2: version "0.3.21" resolved "https://registry.yarnpkg.com/language-subtag-registry/-/language-subtag-registry-0.3.21.tgz#04ac218bea46f04cb039084602c6da9e788dd45a" @@ -18861,7 +18697,7 @@ node-fetch@2.6.1: resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.1.tgz#045bd323631f76ed2e2b55573394416b639a0052" integrity sha512-V4aYg89jEoVRxRb2fJdAg8FHvI7cEyYdVAh94HH0UIK8oJxUfkjlDQN9RbMx+bEjP7+ggMiFRprSti032Oipxw== -node-fetch@2.6.7, node-fetch@^2.3.0, node-fetch@^2.6.0, node-fetch@^2.6.1, node-fetch@^2.6.7: +node-fetch@2.6.7, node-fetch@^2.3.0, node-fetch@^2.6.1, node-fetch@^2.6.7: version "2.6.7" resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.7.tgz#24de9fba827e3b4ae44dc8b20256a379160052ad" integrity sha512-ZjMPFEfVx5j+y2yF35Kzx5sF7kDzxuDj6ziH4FFbOp87zKDZNx8yExJIb05OGF4Nlt9IHFIMBkRl41VdvcNdbQ== @@ -20854,16 +20690,6 @@ puppeteer-core@^13.7.0: unbzip2-stream "1.4.3" ws "8.5.0" -puppeteer-extra-plugin-adblocker@^2.12.0: - version "2.12.0" - resolved "https://registry.yarnpkg.com/puppeteer-extra-plugin-adblocker/-/puppeteer-extra-plugin-adblocker-2.12.0.tgz#afd4e78abe37dde5158ceb190bc5e747f91420bd" - integrity sha512-x0/G6suPa28FomEuCGAKGTvqcxt2efZWXE8mjEPRASGQaMe4smHYM1xT2nYS5Z82wIOqPZQVVJ7l3dGY9E+BuQ== - dependencies: - "@cliqz/adblocker-puppeteer" "1.22.6" - debug "^4.1.1" - node-fetch "^2.6.0" - puppeteer-extra-plugin "^3.2.0" - puppeteer-extra-plugin-stealth@^2.9.0: version "2.9.0" resolved 
"https://registry.yarnpkg.com/puppeteer-extra-plugin-stealth/-/puppeteer-extra-plugin-stealth-2.9.0.tgz#aa39f4469bf47343af4517efe9f97846228b3b01" @@ -23300,7 +23126,7 @@ tar-fs@2.0.0: pump "^3.0.0" tar-stream "^2.0.0" -tar-fs@2.1.1, tar-fs@^2.1.1: +tar-fs@2.1.1: version "2.1.1" resolved "https://registry.yarnpkg.com/tar-fs/-/tar-fs-2.1.1.tgz#489a15ab85f1f0befabb370b7de4f9eb5cbe8784" integrity sha512-V0r2Y9scmbDRLCNex/+hYzvp/zyYjvFbHPNgVTKfQvVrb6guiE/fxP+XblDNR011utopbkex2nM4dHNV6GDsng== @@ -23579,18 +23405,6 @@ title-case@^3.0.3: dependencies: tslib "^2.0.3" -tldts-core@^5.7.77: - version "5.7.77" - resolved "https://registry.yarnpkg.com/tldts-core/-/tldts-core-5.7.77.tgz#ddc7fb6c9b0c03b19cf0985f4eaf6f79c43d9096" - integrity sha512-iEVvz9jdx8zxKPx4qT2bVoewFoU3fctREwYTjlXmBJJKX8JWk90W4pPKqSIFGdfvC0laH3XyZKe1sugHqUpgDQ== - -tldts-experimental@^5.6.21: - version "5.7.77" - resolved "https://registry.yarnpkg.com/tldts-experimental/-/tldts-experimental-5.7.77.tgz#2b2271c4fc3c2956af6809fe865d181c0eaacd00" - integrity sha512-hzZB5ctCHS6ZJJn0o2ip/gVUIKkeY/LI/X5O/jrMw/sp94ebvCDII2Ps6Fg3X+MPK97RhrC9Rboq1aheWf3+ww== - dependencies: - tldts-core "^5.7.77" - tmp@^0.0.33: version "0.0.33" resolved "https://registry.yarnpkg.com/tmp/-/tmp-0.0.33.tgz#6d34335889768d21b2bcda0aa277ced3b1bfadf9" @@ -23723,13 +23537,6 @@ tr46@^2.1.0: dependencies: punycode "^2.1.1" -tr46@^3.0.0: - version "3.0.0" - resolved "https://registry.yarnpkg.com/tr46/-/tr46-3.0.0.tgz#555c4e297a950617e8eeddef633c87d4d9d6cbf9" - integrity sha512-l7FvfAHlcmulp8kr+flpQZmVwtu7nfRV7NZujtN0OqES8EL4O4e0qqzL0DC5gAvx/ZC/9lk6rhcUwYvkBnBnYA== - dependencies: - punycode "^2.1.1" - tr46@~0.0.3: version "0.0.3" resolved "https://registry.yarnpkg.com/tr46/-/tr46-0.0.3.tgz#8184fd347dac9cdc185992f3a6622e14b9d9ab6a" @@ -24627,13 +24434,6 @@ w3c-xmlserializer@^2.0.0: dependencies: xml-name-validator "^3.0.0" -w3c-xmlserializer@^3.0.0: - version "3.0.0" - resolved 
"https://registry.yarnpkg.com/w3c-xmlserializer/-/w3c-xmlserializer-3.0.0.tgz#06cdc3eefb7e4d0b20a560a5a3aeb0d2d9a65923" - integrity sha512-3WFqGEgSXIyGhOmAFtlicJNMjEps8b1MG31NCA0/vOF9+nKMUW1ckhi9cnNHmf88Rzw5V+dwIwsm2C7X8k9aQg== - dependencies: - xml-name-validator "^4.0.0" - walkdir@^0.4.0: version "0.4.1" resolved "https://registry.yarnpkg.com/walkdir/-/walkdir-0.4.1.tgz#dc119f83f4421df52e3061e514228a2db20afa39" @@ -24730,11 +24530,6 @@ webidl-conversions@^6.1.0: resolved "https://registry.yarnpkg.com/webidl-conversions/-/webidl-conversions-6.1.0.tgz#9111b4d7ea80acd40f5270d666621afa78b69514" integrity sha512-qBIvFLGiBpLjfwmYAaHPXsn+ho5xZnGvyGvsarywGNc8VyQJUMHJ8OBKGGrPER0okBeMDaan4mNBlgBROxuI8w== -webidl-conversions@^7.0.0: - version "7.0.0" - resolved "https://registry.yarnpkg.com/webidl-conversions/-/webidl-conversions-7.0.0.tgz#256b4e1882be7debbf01d05f0aa2039778ea080a" - integrity sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g== - webpack-bundle-analyzer@^4.5.0: version "4.5.0" resolved "https://registry.yarnpkg.com/webpack-bundle-analyzer/-/webpack-bundle-analyzer-4.5.0.tgz#1b0eea2947e73528754a6f9af3e91b2b6e0f79d5" @@ -25004,13 +24799,6 @@ whatwg-encoding@^1.0.5: dependencies: iconv-lite "0.4.24" -whatwg-encoding@^2.0.0: - version "2.0.0" - resolved "https://registry.yarnpkg.com/whatwg-encoding/-/whatwg-encoding-2.0.0.tgz#e7635f597fd87020858626805a2729fa7698ac53" - integrity sha512-p41ogyeMUrw3jWclHWTQg1k05DSVXPLcVxRTYsXUk+ZooOCZLcoYgPZ/HL/D/N+uQPOtcp1me1WhBEaX02mhWg== - dependencies: - iconv-lite "0.6.3" - whatwg-fetch@^3.4.1: version "3.6.2" resolved "https://registry.yarnpkg.com/whatwg-fetch/-/whatwg-fetch-3.6.2.tgz#dced24f37f2624ed0281725d51d0e2e3fe677f8c" @@ -25021,19 +24809,6 @@ whatwg-mimetype@^2.3.0: resolved "https://registry.yarnpkg.com/whatwg-mimetype/-/whatwg-mimetype-2.3.0.tgz#3d4b1e0312d2079879f826aff18dbeeca5960fbf" integrity 
sha512-M4yMwr6mAnQz76TbJm914+gPpB/nCwvZbJU28cUD6dR004SAxDLOOSUaB1JDRqLtaOV/vi0IC5lEAGFgrjGv/g== -whatwg-mimetype@^3.0.0: - version "3.0.0" - resolved "https://registry.yarnpkg.com/whatwg-mimetype/-/whatwg-mimetype-3.0.0.tgz#5fa1a7623867ff1af6ca3dc72ad6b8a4208beba7" - integrity sha512-nt+N2dzIutVRxARx1nghPKGv1xHikU7HKdfafKkLNLindmPU/ch3U31NOCGGA/dmPcmb1VlofO0vnKAcsm0o/Q== - -whatwg-url@^10.0.0: - version "10.0.0" - resolved "https://registry.yarnpkg.com/whatwg-url/-/whatwg-url-10.0.0.tgz#37264f720b575b4a311bd4094ed8c760caaa05da" - integrity sha512-CLxxCmdUby142H5FZzn4D8ikO1cmypvXVQktsgosNy4a4BHrDHeciBBGZhb0bNoR5/MltoCatso+vFjjGx8t0w== - dependencies: - tr46 "^3.0.0" - webidl-conversions "^7.0.0" - whatwg-url@^5.0.0: version "5.0.0" resolved "https://registry.yarnpkg.com/whatwg-url/-/whatwg-url-5.0.0.tgz#966454e8765462e37644d3626f6742ce8b70965d" @@ -25286,11 +25061,6 @@ xml-name-validator@^3.0.0: resolved "https://registry.yarnpkg.com/xml-name-validator/-/xml-name-validator-3.0.0.tgz#6ae73e06de4d8c6e47f9fb181f78d648ad457c6a" integrity sha512-A5CUptxDsvxKJEU3yO6DuWBSJz/qizqzJKOMIfUJHETbBw/sFaDxgd6fxm1ewUaM0jZ444Fc5vC5ROYurg/4Pw== -xml-name-validator@^4.0.0: - version "4.0.0" - resolved "https://registry.yarnpkg.com/xml-name-validator/-/xml-name-validator-4.0.0.tgz#79a006e2e63149a8600f15430f0a4725d1524835" - integrity sha512-ICP2e+jsHvAj2E2lIHxa5tjXRlKDJo4IdvPvCXbXQGdzSfmSpNVyIKMvoZHjDY9DP0zV17iI85o90vRFXNccRw== - xml2js@^0.4.23: version "0.4.23" resolved "https://registry.yarnpkg.com/xml2js/-/xml2js-0.4.23.tgz#a0c69516752421eb2ac758ee4d4ccf58843eac66" From b766e171893b96975c122a5c53770ab49da11bd8 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Thu, 12 May 2022 16:48:59 +0800 Subject: [PATCH 2/7] Remove jsdom in content-fetch --- packages/content-fetch/apple-news-handler.js | 9 +++------ packages/content-fetch/bloomberg-handler.js | 7 +++---- packages/content-fetch/derstandard-handler.js | 11 +++++------ packages/content-fetch/medium-handler.js | 2 -- 
packages/content-fetch/package.json | 1 + packages/puppeteer-parse/apple-news-handler.js | 9 +++------ packages/puppeteer-parse/bloomberg-handler.js | 7 +++---- packages/puppeteer-parse/derstandard-handler.js | 15 +++++++++------ packages/puppeteer-parse/medium-handler.js | 2 -- packages/puppeteer-parse/package.json | 1 + 10 files changed, 28 insertions(+), 36 deletions(-) diff --git a/packages/content-fetch/apple-news-handler.js b/packages/content-fetch/apple-news-handler.js index ccdfd62ab..8a69e5adf 100644 --- a/packages/content-fetch/apple-news-handler.js +++ b/packages/content-fetch/apple-news-handler.js @@ -9,11 +9,8 @@ const axios = require('axios'); const { promisify } = require('util'); const { DateTime } = require('luxon'); const os = require('os'); -const jsdom = require("jsdom"); const { Cipher } = require('crypto'); -const { JSDOM } = jsdom; - - +const { parseHTML } = require('linkedom'); exports.appleNewsHandler = { @@ -30,10 +27,10 @@ exports.appleNewsHandler = { const response = await axios.get(url, { headers: { 'User-Agent': MOBILE_USER_AGENT } } ); const data = response.data; - const dom = new JSDOM(data); + const dom = parseHTML(data).document; // make sure its a valid URL by wrapping in new URL - const u = new URL(dom.window.document.querySelector('span.click-here').parentNode.href); + const u = new URL(dom.querySelector('span.click-here').parentNode.href); return { url: u.href }; } } diff --git a/packages/content-fetch/bloomberg-handler.js b/packages/content-fetch/bloomberg-handler.js index 85a467aa1..d6caef6fd 100644 --- a/packages/content-fetch/bloomberg-handler.js +++ b/packages/content-fetch/bloomberg-handler.js @@ -6,8 +6,7 @@ require('dotenv').config(); const axios = require('axios'); const os = require('os'); -const jsdom = require("jsdom"); -const { JSDOM } = jsdom; +const { parseHTML } = require('linkedom'); exports.bloombergHandler = { @@ -30,8 +29,8 @@ exports.bloombergHandler = { 'block_resources': false, } }) - const dom = new 
JSDOM(response.data); - return { title: dom.window.document.title, content: dom.window.document.querySelector('body').innerHTML, url: url } + const dom = parseHTML(response.data).document; + return { title: dom.title, content: dom.querySelector('body').innerHTML, url: url } } catch (error) { console.error('error prehandling bloomberg url', error) throw error diff --git a/packages/content-fetch/derstandard-handler.js b/packages/content-fetch/derstandard-handler.js index d9839e7c9..eddc9618f 100644 --- a/packages/content-fetch/derstandard-handler.js +++ b/packages/content-fetch/derstandard-handler.js @@ -5,8 +5,7 @@ /* eslint-disable @typescript-eslint/no-require-imports */ require('dotenv').config(); const axios = require('axios'); -const jsdom = require("jsdom"); -const { JSDOM } = jsdom; +const { parseHTML } = require('linkedom'); exports.derstandardHandler = { shouldPrehandle: (url, env) => { @@ -23,14 +22,14 @@ exports.derstandardHandler = { }); const content = response.data; - var title = undefined - const dom = new JSDOM(content) - const titleElement = dom.window.document.querySelector('.article-title') - if (!titleElement) { + var title = undefined; + const dom = parseHTML(content).document; + const titleElement = dom.querySelector('.article-title') + if (titleElement) { title = titleElement.textContent titleElement.remove() } - return { content: dom.window.document.body.outerHTML, title: title }; + return { content: dom.body.outerHTML, title: title }; } } diff --git a/packages/content-fetch/medium-handler.js b/packages/content-fetch/medium-handler.js index 704ab0f5b..d5bca7652 100644 --- a/packages/content-fetch/medium-handler.js +++ b/packages/content-fetch/medium-handler.js @@ -6,8 +6,6 @@ require('dotenv').config(); const axios = require('axios'); const os = require('os'); -const jsdom = require("jsdom"); -const { JSDOM } = jsdom; exports.mediumHandler = { diff --git a/packages/content-fetch/package.json b/packages/content-fetch/package.json index aea394edc..1651a4c5f 
100644 --- a/packages/content-fetch/package.json +++ b/packages/content-fetch/package.json @@ -8,6 +8,7 @@ "dotenv": "^8.2.0", "express": "^4.17.1", "jsonwebtoken": "^8.5.1", + "linkedom": "^0.14.9", "luxon": "^2.3.1", "puppeteer-core": "^13.7.0", "puppeteer-extra": "^3.2.3", diff --git a/packages/puppeteer-parse/apple-news-handler.js b/packages/puppeteer-parse/apple-news-handler.js index ccdfd62ab..8a69e5adf 100644 --- a/packages/puppeteer-parse/apple-news-handler.js +++ b/packages/puppeteer-parse/apple-news-handler.js @@ -9,11 +9,8 @@ const axios = require('axios'); const { promisify } = require('util'); const { DateTime } = require('luxon'); const os = require('os'); -const jsdom = require("jsdom"); const { Cipher } = require('crypto'); -const { JSDOM } = jsdom; - - +const { parseHTML } = require('linkedom'); exports.appleNewsHandler = { @@ -30,10 +27,10 @@ exports.appleNewsHandler = { const response = await axios.get(url, { headers: { 'User-Agent': MOBILE_USER_AGENT } } ); const data = response.data; - const dom = new JSDOM(data); + const dom = parseHTML(data).document; // make sure its a valid URL by wrapping in new URL - const u = new URL(dom.window.document.querySelector('span.click-here').parentNode.href); + const u = new URL(dom.querySelector('span.click-here').parentNode.href); return { url: u.href }; } } diff --git a/packages/puppeteer-parse/bloomberg-handler.js b/packages/puppeteer-parse/bloomberg-handler.js index 85a467aa1..d6caef6fd 100644 --- a/packages/puppeteer-parse/bloomberg-handler.js +++ b/packages/puppeteer-parse/bloomberg-handler.js @@ -6,8 +6,7 @@ require('dotenv').config(); const axios = require('axios'); const os = require('os'); -const jsdom = require("jsdom"); -const { JSDOM } = jsdom; +const { parseHTML } = require('linkedom'); exports.bloombergHandler = { @@ -30,8 +29,8 @@ exports.bloombergHandler = { 'block_resources': false, } }) - const dom = new JSDOM(response.data); - return { title: dom.window.document.title, content: 
dom.window.document.querySelector('body').innerHTML, url: url } + const dom = new parseHTML(response.data).document; + return { title: dom.title, content: dom.querySelector('body').innerHTML, url: url } } catch (error) { console.error('error prehandling bloomberg url', error) throw error diff --git a/packages/puppeteer-parse/derstandard-handler.js b/packages/puppeteer-parse/derstandard-handler.js index 8a732c05c..eddc9618f 100644 --- a/packages/puppeteer-parse/derstandard-handler.js +++ b/packages/puppeteer-parse/derstandard-handler.js @@ -5,8 +5,7 @@ /* eslint-disable @typescript-eslint/no-require-imports */ require('dotenv').config(); const axios = require('axios'); -const jsdom = require("jsdom"); -const { JSDOM } = jsdom; +const { parseHTML } = require('linkedom'); exports.derstandardHandler = { shouldPrehandle: (url, env) => { @@ -23,10 +22,14 @@ exports.derstandardHandler = { }); const content = response.data; - const dom = new JSDOM(content) - const titleElement = dom.window.document.querySelector('.article-title') - titleElement?.remove() + var title = undefined; + const dom = new parseHTML(content).document; + const titleElement = dom.querySelector('.article-title') + if (!titleElement) { + title = titleElement.textContent + titleElement.remove() + } - return { content: dom.window.document.body.outerHTML, title: titleElement?.textContent }; + return { content: dom.body.outerHTML, title: title }; } } diff --git a/packages/puppeteer-parse/medium-handler.js b/packages/puppeteer-parse/medium-handler.js index 704ab0f5b..d5bca7652 100644 --- a/packages/puppeteer-parse/medium-handler.js +++ b/packages/puppeteer-parse/medium-handler.js @@ -6,8 +6,6 @@ require('dotenv').config(); const axios = require('axios'); const os = require('os'); -const jsdom = require("jsdom"); -const { JSDOM } = jsdom; exports.mediumHandler = { diff --git a/packages/puppeteer-parse/package.json b/packages/puppeteer-parse/package.json index 3c778b9a6..f248414c1 100644 --- 
a/packages/puppeteer-parse/package.json +++ b/packages/puppeteer-parse/package.json @@ -10,6 +10,7 @@ "axios": "^0.26.0", "dotenv": "^8.2.0", "jsonwebtoken": "^8.5.1", + "linkedom": "^0.14.9", "luxon": "^2.3.1", "puppeteer-core": "^13.7.0", "puppeteer-extra": "^3.2.3", From 60bbbb6cf3e5993bed3e1e6bc956e80fc9ed6a32 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Thu, 12 May 2022 17:10:38 +0800 Subject: [PATCH 3/7] Block requests to 'font', 'image', 'stylesheet', 'script', 'media' in puppeteer --- packages/content-fetch/fetch-content.js | 11 ++--------- packages/puppeteer-parse/index.js | 11 ++--------- 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/packages/content-fetch/fetch-content.js b/packages/content-fetch/fetch-content.js index fea99d59b..76204fb98 100644 --- a/packages/content-fetch/fetch-content.js +++ b/packages/content-fetch/fetch-content.js @@ -463,7 +463,7 @@ async function retrievePage(url) { await page.setRequestInterception(true); let requestCount = 0; page.on('request', request => { - if (request.resourceType() === 'font' || request.resourceType() === 'image') { + if (['font', 'image', 'stylesheet', 'script', 'media'].includes(request.resourceType())) { request.abort(); return; } @@ -471,14 +471,7 @@ async function retrievePage(url) { request.abort(); return; } - if ( - request.resourceType() === 'script' && - request.url().toLowerCase().indexOf('mathjax') > -1 - ) { - request.abort(); - } else { - request.continue(); - } + request.continue(); }); // Puppeteer fails during download of PDf files, diff --git a/packages/puppeteer-parse/index.js b/packages/puppeteer-parse/index.js index 692d0af53..2ac49838b 100644 --- a/packages/puppeteer-parse/index.js +++ b/packages/puppeteer-parse/index.js @@ -665,7 +665,7 @@ async function retrievePage(url) { await page.setRequestInterception(true); let requestCount = 0; page.on('request', request => { - if (request.resourceType() === 'font' || request.resourceType() === 'image') { + if (['font', 
'image', 'stylesheet', 'script', 'media'].includes(request.resourceType())) { request.abort(); return; } @@ -673,14 +673,7 @@ async function retrievePage(url) { request.abort(); return; } - if ( - request.resourceType() === 'script' && - request.url().toLowerCase().indexOf('mathjax') > -1 - ) { - request.abort(); - } else { - request.continue(); - } + request.continue(); }); From 1b8850ed33e4bea93dec473c823e26ba6cd03555 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Thu, 12 May 2022 17:41:11 +0800 Subject: [PATCH 4/7] Fix tests --- packages/api/src/utils/parser.ts | 4 +- packages/api/test/services/save_email.test.ts | 8 ++-- .../services/save_newsletter_email.test.ts | 37 +++++++++++-------- .../data/substack-forwarded-newsletter.html | 2 +- .../substack-forwarded-welcome-email.html | 2 +- ...substack-private-forwarded-newsletter.html | 4 +- 6 files changed, 33 insertions(+), 24 deletions(-) diff --git a/packages/api/src/utils/parser.ts b/packages/api/src/utils/parser.ts index 06fe8a433..eeba50312 100644 --- a/packages/api/src/utils/parser.ts +++ b/packages/api/src/utils/parser.ts @@ -409,8 +409,8 @@ export const parseUrlMetadata = async ( // as a utility method on each one. 
export const isProbablyNewsletter = (html: string): boolean => { const dom = parseHTML(html).document - const domCopy = parseHTML(dom.documentElement.outerHTML) - const article = new Readability(domCopy.document, { + const domCopy = parseHTML(dom.documentElement.outerHTML).document + const article = new Readability(domCopy, { debug: false, keepTables: true, }).parse() diff --git a/packages/api/test/services/save_email.test.ts b/packages/api/test/services/save_email.test.ts index fb06c4f95..9e7c49f3e 100644 --- a/packages/api/test/services/save_email.test.ts +++ b/packages/api/test/services/save_email.test.ts @@ -8,6 +8,8 @@ import { getPageByParam } from '../../src/elastic/pages' describe('saveEmail', () => { const username = 'fakeUser' + const fakeContent = 'fake content' + after(async () => { await deleteTestUser(username) }) @@ -21,7 +23,7 @@ describe('saveEmail', () => { } await saveEmail(ctx, { - originalContent: 'fake content', + originalContent: `${fakeContent}`, url: 'https://example.com', title: 'fake title', author: 'fake author', @@ -30,7 +32,7 @@ describe('saveEmail', () => { // This ensures row level security doesnt prevent // resaving the same URL const secondResult = await saveEmail(ctx, { - originalContent: 'fake content', + originalContent: `${fakeContent}`, url: 'https://example.com', title: 'fake title', author: 'fake author', @@ -42,6 +44,6 @@ describe('saveEmail', () => { expect(page?.url).to.equal('https://example.com') expect(page?.title).to.equal('fake title') expect(page?.author).to.equal('fake author') - expect(page?.content).to.contain('fake content') + expect(page?.content).to.contain(fakeContent) }) }) diff --git a/packages/api/test/services/save_newsletter_email.test.ts b/packages/api/test/services/save_newsletter_email.test.ts index 46d09daa6..b655d196e 100644 --- a/packages/api/test/services/save_newsletter_email.test.ts +++ b/packages/api/test/services/save_newsletter_email.test.ts @@ -12,6 +12,7 @@ import { getPageByParam } from 
'../../src/elastic/pages' describe('saveNewsletterEmail', () => { const username = 'fakeUser' + const fakeContent = 'fake content' let user: User let email: NewsletterEmail @@ -32,13 +33,16 @@ describe('saveNewsletterEmail', () => { }) it('adds the newsletter to the library', async () => { - await saveNewsletterEmail({ - email: email.address, - content: 'fake content', - url: 'https://example.com', - title: 'fake title', - author: 'fake author', - }, ctx) + await saveNewsletterEmail( + { + email: email.address, + content: `${fakeContent}`, + url: 'https://example.com', + title: 'fake title', + author: 'fake author', + }, + ctx + ) setTimeout(async () => { const page = await getPageByParam({ userId: user.id }) @@ -48,7 +52,7 @@ describe('saveNewsletterEmail', () => { expect(page.url).to.equal('https://example.com') expect(page.title).to.equal('fake title') expect(page.author).to.equal('fake author') - expect(page.content).to.contain('fake content') + expect(page.content).to.contain(fakeContent) }) }) @@ -58,13 +62,16 @@ describe('saveNewsletterEmail', () => { color: '#07D2D1', } - await saveNewsletterEmail({ - email: email.address, - content: 'fake content 2', - url: 'https://example.com/2', - title: 'fake title', - author: 'fake author', - }, ctx) + await saveNewsletterEmail( + { + email: email.address, + content: `fake content 2`, + url: 'https://example.com/2', + title: 'fake title', + author: 'fake author', + }, + ctx + ) setTimeout(async () => { const page = await getPageByParam({ userId: user.id }) diff --git a/packages/api/test/utils/data/substack-forwarded-newsletter.html b/packages/api/test/utils/data/substack-forwarded-newsletter.html index ebfa93cee..0f67fd04d 100644 --- a/packages/api/test/utils/data/substack-forwarded-newsletter.html +++ b/packages/api/test/utils/data/substack-forwarded-newsletter.html @@ -1 +1 @@ -


---------- Forwarded message ---------
From: Andrew Methven <slowchinese@substack.com>
Date: Fri, Feb 18, 2022 at 11:57 PM
Subject: Companies that eat people
To: <XXXXXXXXXX@gmail.com>


Slow Chinese 每周漫闻 ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌

Companies that eat people

Slow Chinese 每周漫闻

The phrase, ‘eating people’ (吃人 chī rén), is used to criticise companies in China that exploit their employees.

It’s originally from Lǔ Xùn’s (鲁迅), A Madman's Diary (狂人日记 kuángrén rìjì), published in 1918:

我翻开历史一查,这历史没有年代。歪歪斜斜的每页上都写着“仁义道德”几个字,我横竖睡不着,仔细看了半夜,才从字缝里看出来,满本上都写着两个字“吃人"!

As I look through the pages of history, I see there are no dates. On each page, written messily, are the characters, ‘benevolence and morality’. I can’t sleep. I read into the night. Finally, I find hidden between the characters across the page, the words, ‘eating people’.

The times have changed since Lu Xun made that observation more than 100 years ago, but the culture of ‘eating people’ has not, according to social media comments this week, such as this one:

吃人的事实,从来没有变过,历朝历代都是如此 - The reality of [companies] exploiting their employees is nothing new. It’s been the same throughout history.

Two of China’s biggest tech companies, Tencent and Bilibili, have recently been accused of ‘eating people’, abusing and exploiting their staff.

So that’s what we discuss this week.

  • Conversations worth consuming: interview with Zhāng Yìfēi 张义飞 a former employee of Tencent

  • Words of the week: coverage and social media commentary of a recent death of a Bilibili employee allegedly due to overwork.

The audio version of this newsletter is already live - become a member to access it in your podcast app!

Use this link to claim a one-month free trial of the membership to give the full experience a go:

One-month free trial


1. CONVERSATIONS WORTH CONSUMING

腾讯带头“反内卷”:光子工作室拒绝996,保障双休_游戏

Interview with Zhang Yifei

Two weeks ago a 25-year-old programmer at Tencent, Zhāng Yìfēi 张义飞, became an Internet sensation after standing up to his bosses at the company. He announced in an internal group chat that he was quitting his job, which then went viral on social media.

If 20-hour days is what the company wants, he wrote, ‘I’ll resign tomorrow’

36Kr interviewed Zhang this week (in Chinese). He talks more about the overtime culture at Tencent, and why he dared to take on his company in such a public way - he already had another job lined up.

There are some excellent words in his description of life as a working person at Tencent.

Useful words

  • 卡 kǎ - stop, block

    什么时候离职的?有人卡你吗 - When did you leave your job? Did they try to stop you?

  • 剥削 bō xuē - exploit

    加班严重、996工作制、互联网巨头压榨剥削员工等话题再次被拿来讨论

    - Topics such as serious overtime, the 996 work system, and the exploitation of employees by Internet giants are being discussed again.

  • 底气 dǐ qì - confidence, back up

    自己已经提前拿到其他公司的offer,比较有底气 - I already had an offer from another company, so I was relatively confident about doing it.

  • 忌惮 jì dàn - fear, be afraid of

    如果一些互联网大厂因此忌惮、不录用我,我正好也不想去这种加班严重的地方 - If some big Internet companies are afraid to hire me, that’s fine by me. I also don't want to work in a company with such heavy overtime.

  • 手软 shǒu ruǎn - ‘soft hand’, forgiving

    不要特立独行,搞小团体,否则他不会手软 - Do not march to a different beat or form small cliques. He will come down hard on this kind of behaviour.

  • 打硬仗 dǎ yìng zhàng - fight a hard war

    张小龙管理下的企业微信,经常会强调用小而精的团队打硬仗 - The company Wechat, under Zhang Xiaolong’s management, would often emphasise using a small and efficient team to work on tight deadlines.

    • Note: a common phrase used in Chinese companies when a team is working intensely on a project or against a ridiculous deadline.

    • Related: 打胜仗 dǎ shèngzhàng - win a war

  • 喊口号 hǎn kǒuhào - shouting slogans

    但大家普遍的看法是,不想看到空洞地喊口号,只想看到具体行为 - The general view is they don’t want to see people shouting empty slogans. They want to see action.

Idioms

  • 初出茅庐 chūchū máolú - ‘just come out of the thatched cottage’; inexperienced, wet behind the ears

    但对于大众而言,互联网巨头和初出茅庐的应届生,相比较下毕竟力量悬殊 - There’s no comparison between the power of the big internet companies and graduate employees with no experience.

  • 昏昏沉沉 hūnhūn chénchén - feeling sleepy

    来这里入职两个月,感到昏昏沉沉,记忆力下降很多

    - I’ve been here for two months. I feel tired and my memory has declined a lot.

  • 热火朝天 rèhuǒ cháotiān - ‘hot fire face sky’; vigorously, with energy

    到点的时候,差不多一半人还没走,都在热火朝天地讨论工作

    - When it was time to finish at the end of the day, around half of the team stayed behind to talk energetically about their work.


2. WORDS OF THE WEEK

上海之旅-前往Bilibili总部! - 哔哩哔哩

Bilibili eats people

A man who headed a content moderation department at Chinese video-streaming site Bilibili died last week after suffering a cerebral hemorrhage while working a Chinese New Year holiday shift.

The company was heavily criticised (Sohu - in Chinese) for having a toxic work culture.

One of the top comments on social media adapted the line from Lu Xun’s A Madman’s Diary. But instead of looking through the pages of history, overworked netizens find the same message hidden in their payslips:

我翻开工资单一查,这工资单没有工资,歪歪斜斜的每条都写着“迟到扣款”四个字。我横竖睡不着,仔细看了半夜,才从字缝里看出来,满本上都写着两个字“吃人"!

I glance at my payslip. I don’t see any pay on it. All I see are the four characters scrawled across the page: ‘fined for being late’. I can’t sleep. I look at it deep into the night. Finally, hidden between the characters across the page, I find the words: ‘eating people’.

The words shared below are from the Sohu article and also from social media comments.

Useful words

  • 猝死 cù sǐ - sudden death, die suddenly

    他2月5日凌晨脑出血猝死 - He died suddenly in the early hours of the morning on 5 February.

  • 嗝屁 gé pì - hiccup, to die

    对大企业嗝屁几个不算啥,对于各个家庭你就是唯一呀 - A few people dying means nothing to a big company, but for a family it’s their only child!

    • Related: 翘辫子 qiào biàn zi - make braids - kick the bucket (Qing Dynasty reference relating to when men had to remove their braids).

      他因为加班严重而翘辫子了 - He died because of too much overtime.

  • 企图 qǐ tú - try to, seek to do something (negative)

    通过各种企图将这件事压下来,我决定发声 - Bilibili attempted to suppress the situation through different means, so I decided to speak up.

    • Note: similar to 试图 shì tú, but with more negative connotations

  • 腐朽 fǔ xiǔ - degenerate, rotten

    道出了资本家的腐朽和恶臭 - Reeks of the stench and rot of capitalism.

  • 压垮 yā kuǎ - crush

    就是无情的压榨现有劳动力,能压垮一个是一个,多招一个人算我输 - It’s the callous exploitation of the current employees. The company tries to squeeze as much as possible from each and every one of them rather than hiring one more employee.

Idioms

  • 血汗工厂 xuèhàn gōngchǎng - ‘blood sweat factory’ - sweat shop

    B站因员工猝死一事,被推进了“血汗工厂”的舆论漩涡 - Bilibili has been dragged into a public debate about the company being a ‘sweatshop’ due to the sudden death of an employee.

    • Note: The pronunciation of 血 is normally xiě in colloquial phrases, and xuè in technical terms. But the rule is vague and not very helpful. In this phrase it's always xuè. But in 血汗钱 xiěhàn qián, ‘hard earned money’, xiě is more common. So confusing!

  • 混淆视听 hùnxiáo shìtīng - to muddle or confuse an issue

    晚9到早9确实不属于加班,因为是大夜班的正常时间,大厂就这样混淆视听? - 9pm to 9am does not count as overtime. But that’s because the night shift is a normal working shift for these big tech companies. They are muddling up the matter.

  • 枯燥乏味 kūzào fáwèi - boring

    做审核的确工作强度很大,而且枯燥乏味 - Being a content moderator is a very intense job. It’s also extremely boring.

    • More: 枯燥无味 kūzào wúwèi - boring (same meaning)

  • 恬不知耻 tián bù zhī chǐ - shameless

    觉得正常吗?居然还能如此恬不知耻的说“没有让他加班” - Is this normal? How can they be so shameless in saying the company did not ‘ask him to work overtime’?

  • 难上加难 nán shàng jiā nán - very difficult

    只要企业做大了,普通职工想维权难上加难 - When the company gets big it’s almost impossible for employees to protect their rights.

    • Related: 雪上加霜 xuěshàng jiāshuāng - make matters worse

Colloquial phrases

  • 万变不离其宗 wàn biàn bùlí qízōng - make ten thousand changes but remain the same in essence

    好像这些大公司公关都是万变不离其宗,核心就是推卸责任!- It seems the PR of these big companies tells a nice story, but in essence they don’t change. They are merely avoiding their responsibility.

  • 不见棺材不落泪 bùjiàn guāncai bù luò lèi - won’t cry until they see the coffin

    这也说得出口啊!真是不见血不掉泪啊 - They actually say this? Do they really have to let somebody die before they accept they are in the wrong?

    • More: I wrote more about this colloquialism in SupChina’s phrase of the week.

    • Related: 不到黄河心不死 bù dào huánghé xīn bù sǐ - not to stop until one reaches the Yellow River; refuse to give up until all hope is gone


3. RECOMMENDATIONS

Become a member of the community

As a member of the community you get access to unique resources to help you master modern Mandarin, learn, use, and understand Chinese language the way people speak it today.

  • 📚 Resources: Pleco downloads, word lists, and example sentences print-outs and audio download for each issue.

  • 🔉 Audio: audio version of the newsletter delivered as a member-only podcast every Saturday morning (before the free newsletter is published)

  • 🤓 Archive: full database of all words and phrases in the archive (nearly 1,300!) searchable according to word-type, sector and topic with audio and example sentences for each entry, updated weekly.

Use this link to claim a one-month free trial of the membership to give the full experience a go.

One-month free trial

That’s it for this week.

I look forward to seeing you in your inbox same time next weekend.

Andrew

+++

ps - please do share this newsletter on your social channels and with your networks

Share Slow Chinese 每周漫闻

Like

© 2022 Andrew Methven Unsubscribe
548 Market Street PMB 72296, San Francisco, CA 94104

Publish on Substack

\ No newline at end of file +


---------- Forwarded message ---------
From: Andrew Methven <slowchinese@substack.com>
Date: Fri, Feb 18, 2022 at 11:57 PM
Subject: Companies that eat people
To: <XXXXXXXXXX@gmail.com>


Slow Chinese 每周漫闻 ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌

Companies that eat people

Slow Chinese 每周漫闻

The phrase, ‘eating people’ (吃人 chī rén), is used to criticise companies in China that exploit their employees.

It’s originally from Lǔ Xùn’s (鲁迅), A Madman's Diary (狂人日记 kuángrén rìjì), published in 1918:

我翻开历史一查,这历史没有年代。歪歪斜斜的每页上都写着“仁义道德”几个字,我横竖睡不着,仔细看了半夜,才从字缝里看出来,满本上都写着两个字“吃人"!

As I look through the pages of history, I see there are no dates. On each page, written messily, are the characters, ‘benevolence and morality’. I can’t sleep. I read into the night. Finally, I find hidden between the characters across the page, the words, ‘eating people’.

The times have changed since Lu Xun made that observation more than 100 years ago, but the culture of ‘eating people’ has not, according to social media comments this week, such as this one:

吃人的事实,从来没有变过,历朝历代都是如此 - The reality of [companies] exploiting their employees is nothing new. It’s been the same throughout history.

Two of China’s biggest tech companies, Tencent and Bilibili, have recently been accused of ‘eating people’, abusing and exploiting their staff.

So that’s what we discuss this week.

  • Conversations worth consuming: interview with Zhāng Yìfēi 张义飞 a former employee of Tencent

  • Words of the week: coverage and social media commentary of a recent death of a Bilibili employee allegedly due to overwork.

The audio version of this newsletter is already live - become a member to access it in your podcast app!

Use this link to claim a one-month free trial of the membership to give the full experience a go:

One-month free trial


1. CONVERSATIONS WORTH CONSUMING

腾讯带头“反内卷”:光子工作室拒绝996,保障双休_游戏

Interview with Zhang Yifei

Two weeks ago a 25-year-old programmer at Tencent, Zhāng Yìfēi 张义飞, became an Internet sensation after standing up to his bosses at the company. He announced in an internal group chat that he was quitting his job, which then went viral on social media.

If 20-hour days is what the company wants, he wrote, ‘I’ll resign tomorrow’

36Kr interviewed Zhang this week (in Chinese). He talks more about the overtime culture at Tencent, and why he dared to take on his company in such a public way - he already had another job lined up.

There are some excellent words in his description of life as a working person at Tencent.

Useful words

  • 卡 kǎ - stop, block

    什么时候离职的?有人卡你吗 - When did you leave your job? Did they try to stop you?

  • 剥削 bō xuē - exploit

    加班严重、996工作制、互联网巨头压榨剥削员工等话题再次被拿来讨论

    - Topics such as serious overtime, the 996 work system, and the exploitation of employees by Internet giants are being discussed again.

  • 底气 dǐ qì - confidence, back up

    自己已经提前拿到其他公司的offer,比较有底气 - I already had an offer from another company, so I was relatively confident about doing it.

  • 忌惮 jì dàn - fear, be afraid of

    如果一些互联网大厂因此忌惮、不录用我,我正好也不想去这种加班严重的地方 - If some big Internet companies are afraid to hire me, that’s fine by me. I also don't want to work in a company with such heavy overtime.

  • 手软 shǒu ruǎn - ‘soft hand’, forgiving

    不要特立独行,搞小团体,否则他不会手软 - Do not march to a different beat or form small cliques. He will come down hard on this kind of behaviour.

  • 打硬仗 dǎ yìng zhàng - fight a hard war

    张小龙管理下的企业微信,经常会强调用小而精的团队打硬仗 - The company Wechat, under Zhang Xiaolong’s management, would often emphasise using a small and efficient team to work on tight deadlines.

    • Note: a common phrase used in Chinese companies when a team is working intensely on a project or against a ridiculous deadline.

    • Related: 打胜仗 dǎ shèngzhàng - win a war

  • 喊口号 hǎn kǒuhào - shouting slogans

    但大家普遍的看法是,不想看到空洞地喊口号,只想看到具体行为 - The general view is they don’t want to see people shouting empty slogans. They want to see action.

Idioms

  • 初出茅庐 chūchū máolú - ‘just come out of the thatched cottage’; inexperienced, wet behind the ears

    但对于大众而言,互联网巨头和初出茅庐的应届生,相比较下毕竟力量悬殊 - There’s no comparison between the power of the big internet companies and graduate employees with no experience.

  • 昏昏沉沉 hūnhūn chénchén - feeling sleepy

    来这里入职两个月,感到昏昏沉沉,记忆力下降很多

    - I’ve been here for two months. I feel tired and my memory has declined a lot.

  • 热火朝天 rèhuǒ cháotiān - ‘hot fire face sky’; vigorously, with energy

    到点的时候,差不多一半人还没走,都在热火朝天地讨论工作

    - When it was time to finish at the end of the day, around half of the team stayed behind to talk energetically about their work.


2. WORDS OF THE WEEK

上海之旅-前往Bilibili总部! - 哔哩哔哩

Bilibili eats people

A man who headed a content moderation department at Chinese video-streaming site Bilibili died last week after suffering a cerebral hemorrhage while working a Chinese New Year holiday shift.

The company was heavily criticised (Sohu - in Chinese) for having a toxic work culture.

One of the top comments on social media adapted the line from Lu Xun’s A Madman’s Diary. But instead of looking through the pages of history, overworked netizens find the same message hidden in their payslips:

我翻开工资单一查,这工资单没有工资,歪歪斜斜的每条都写着“迟到扣款”四个字。我横竖睡不着,仔细看了半夜,才从字缝里看出来,满本上都写着两个字“吃人"!

I glance at my payslip. I don’t see any pay on it. All I see are the four characters scrawled across the page: ‘fined for being late’. I can’t sleep. I look at it deep into the night. Finally, hidden between the characters across the page, I find the words: ‘eating people’.

The words shared below are from the Sohu article and also from social media comments.

Useful words

  • 猝死 cù sǐ - sudden death, die suddenly

    他2月5日凌晨脑出血猝死 - He died suddenly in the early hours of the morning on 5 February.

  • 嗝屁 gé pì - hiccup, to die

    对大企业嗝屁几个不算啥,对于各个家庭你就是唯一呀 - A few people dying means nothing to a big company, but for a family it’s their only child!

    • Related: 翘辫子 qiào biàn zi - make braids - kick the bucket (Qing Dynasty reference relating to when men had to remove their braids).

      他因为加班严重而翘辫子了 - He died because of too much overtime.

  • 企图 qǐ tú - try to, seek to do something (negative)

    通过各种企图将这件事压下来,我决定发声 - Bilibili attempted to suppress the situation through different means, so I decided to speak up.

    • Note: similar to 试图 shì tú, but with more negative connotations

  • 腐朽 fǔ xiǔ - degenerate, rotten

    道出了资本家的腐朽和恶臭 - Reeks of the stench and rot of capitalism.

  • 压垮 yā kuǎ - crush

    就是无情的压榨现有劳动力,能压垮一个是一个,多招一个人算我输 - It’s the callous exploitation of the current employees. The company tries to squeeze as much as possible from each and every one of them rather than hiring one more employee.

Idioms

  • 血汗工厂 xuèhàn gōngchǎng - ‘blood sweat factory’ - sweat shop

    B站因员工猝死一事,被推进了“血汗工厂”的舆论漩涡 - Bilibili has been dragged into a public debate about the company being a ‘sweatshop’ due to the sudden death of an employee.

    • Note: The pronunciation of 血 is normally xiě in colloquial phrases, and xuè in technical terms. But the rule is vague and not very helpful. In this phrase it's always xuè. But in 血汗钱 xiěhàn qián, ‘hard earned money’, xiě is more common. So confusing!

  • 混淆视听 hùnxiáo shìtīng - to muddle or confuse an issue

    晚9到早9确实不属于加班,因为是大夜班的正常时间,大厂就这样混淆视听? - 9pm to 9am does not count as overtime. But that’s because the night shift is a normal working shift for these big tech companies. They are muddling up the matter.

  • 枯燥乏味 kūzào fáwèi - boring

    做审核的确工作强度很大,而且枯燥乏味 - Being a content moderator is a very intense job. It’s also extremely boring.

    • More: 枯燥无味 kūzào wúwèi - boring (same meaning)

  • 恬不知耻 tián bù zhī chǐ - shameless

    觉得正常吗?居然还能如此恬不知耻的说“没有让他加班” - Is this normal? How can they be so shameless in saying the company did not ‘ask him to work overtime’?

  • 难上加难 nán shàng jiā nán - very difficult

    只要企业做大了,普通职工想维权难上加难 - When the company gets big it’s almost impossible for employees to protect their rights.

    • Related: 雪上加霜 xuěshàng jiāshuāng - make matters worse

Colloquial phrases

  • 万变不离其宗 wàn biàn bùlí qízōng - make ten thousand changes but remain the same in essence

    好像这些大公司公关都是万变不离其宗,核心就是推卸责任!- It seems the PR of these big companies tells a nice story, but in essence they don’t change. They are merely avoiding their responsibility.

  • 不见棺材不落泪 bùjiàn guāncai bù luò lèi - won’t cry until they see the coffin

    这也说得出口啊!真是不见血不掉泪啊 - They actually say this? Do they really have to let somebody die before they accept they are in the wrong?

    • More: I wrote more about this colloquialism in SupChina’s phrase of the week.

    • Related: 不到黄河心不死 bù dào huánghé xīn bù sǐ - not to stop until one reaches the Yellow River; refuse to give up until all hope is gone


3. RECOMMENDATIONS

Become a member of the community

As a member of the community you get access to unique resources to help you master modern Mandarin, learn, use, and understand Chinese language the way people speak it today.

  • 📚 Resources: Pleco downloads, word lists, and example sentences print-outs and audio download for each issue.

  • 🔉 Audio: audio version of the newsletter delivered as a member-only podcast every Saturday morning (before the free newsletter is published)

  • 🤓 Archive: full database of all words and phrases in the archive (nearly 1,300!) searchable according to word-type, sector and topic with audio and example sentences for each entry, updated weekly.

Use this link to claim a one-month free trial of the membership to give the full experience a go.

One-month free trial

That’s it for this week.

I look forward to seeing you in your inbox same time next weekend.

Andrew

+++

ps - please do share this newsletter on your social channels and with your networks

Share Slow Chinese 每周漫闻

Like

© 2022 Andrew Methven Unsubscribe
548 Market Street PMB 72296, San Francisco, CA 94104

Publish on Substack

diff --git a/packages/api/test/utils/data/substack-forwarded-welcome-email.html b/packages/api/test/utils/data/substack-forwarded-welcome-email.html index f975441aa..feb14eeb3 100644 --- a/packages/api/test/utils/data/substack-forwarded-welcome-email.html +++ b/packages/api/test/utils/data/substack-forwarded-welcome-email.html @@ -1 +1 @@ -


---------- Forwarded message ---------
From: Andrew Methven <slowchinese@substack.com>
Date: Thu, Dec 9, 2021 at 11:27 PM
Subject: How can Slow Chinese 每周漫闻 help you?
To: <XXXXXXXXXX@gmail.com>


Thank you for subscribing to for Slow Chinese 每周漫闻 ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌

Thanks so much for subscribing to Slow Chinese 每周漫闻 and welcome aboard!

I’m excited to help you improve and practice your Chinese language skills.

Here’s a quick way I can help:

Reply to this email and tell me about your story of learning Chinese and what challenges you currently have with the language.

I’ll reply with a specific suggestion to help you.

Also, to make sure the next issue of the newsletter doesn’t land in your spam folder, add my email address to your contacts.

Thanks!

Andrew

© 2021 Andrew Methven Unsubscribe
548 Market Street PMB 72296, San Francisco, CA 94104

Publish on Substack

\ No newline at end of file +


---------- Forwarded message ---------
From: Andrew Methven <slowchinese@substack.com>
Date: Thu, Dec 9, 2021 at 11:27 PM
Subject: How can Slow Chinese 每周漫闻 help you?
To: <XXXXXXXXXX@gmail.com>


Thank you for subscribing to for Slow Chinese 每周漫闻 ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌

Thanks so much for subscribing to Slow Chinese 每周漫闻 and welcome aboard!

I’m excited to help you improve and practice your Chinese language skills.

Here’s a quick way I can help:

Reply to this email and tell me about your story of learning Chinese and what challenges you currently have with the language.

I’ll reply with a specific suggestion to help you.

Also, to make sure the next issue of the newsletter doesn’t land in your spam folder, add my email address to your contacts.

Thanks!

Andrew

© 2021 Andrew Methven Unsubscribe
548 Market Street PMB 72296, San Francisco, CA 94104

Publish on Substack

diff --git a/packages/api/test/utils/data/substack-private-forwarded-newsletter.html b/packages/api/test/utils/data/substack-private-forwarded-newsletter.html index 071bfeb77..bb240dbe8 100644 --- a/packages/api/test/utils/data/substack-private-forwarded-newsletter.html +++ b/packages/api/test/utils/data/substack-private-forwarded-newsletter.html @@ -1,2 +1,2 @@ -


---------- Forwarded message ---------
From: giggs <darkgiggsxx@gmail.com>
Date: Wed, Mar 2, 2022 at 5:29 PM
Subject: Fwd: The German Retreat From Nuclear Power
To: Radek <radoslaw.jurga@gmail.com>



---------- Forwarded message ---------
De : Bismarck Analysis <bismarck@substack.com>
Date: mer. 2 mars 2022 à 15:02
Subject: The German Retreat From Nuclear Power
To: <darkgiggsxx@gmail.com>


Germany's economic priorities are torn between industrial growth and energy degrowth. As its last nuclear plants are shut down, the choice is between relying on fossil fuels or closing factories. ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌

The German Retreat From Nuclear Power

Germany's economic priorities are torn between industrial growth and energy degrowth. As its last nuclear plants are shut down, the choice is between relying on fossil fuels or closing factories.

Isar Nuclear Power Plant near Landshut, Germany in 2016. The Isar station is scheduled to be shut down by the end of 2022. Photo by Dennis Hansch. Source.
-
\ No newline at end of file +


---------- Forwarded message ---------
From: giggs <darkgiggsxx@gmail.com>
Date: Wed, Mar 2, 2022 at 5:29 PM
Subject: Fwd: The German Retreat From Nuclear Power
To: Radek <radoslaw.jurga@gmail.com>



---------- Forwarded message ---------
De : Bismarck Analysis <bismarck@substack.com>
Date: mer. 2 mars 2022 à 15:02
Subject: The German Retreat From Nuclear Power
To: <darkgiggsxx@gmail.com>


Germany's economic priorities are torn between industrial growth and energy degrowth. As its last nuclear plants are shut down, the choice is between relying on fossil fuels or closing factories. ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌

The German Retreat From Nuclear Power

Germany's economic priorities are torn between industrial growth and energy degrowth. As its last nuclear plants are shut down, the choice is between relying on fossil fuels or closing factories.

Isar Nuclear Power Plant near Landshut, Germany in 2016. The Isar station is scheduled to be shut down by the end of 2022. Photo by Dennis Hansch. Source.
+
From ad99f933e554c7551cd05cd23b23c164c564cdd7 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Thu, 12 May 2022 17:53:28 +0800 Subject: [PATCH 5/7] Fix tests cont --- packages/content-fetch/apple-news-handler.js | 2 +- packages/content-fetch/bloomberg-handler.js | 2 +- packages/content-fetch/derstandard-handler.js | 2 +- packages/puppeteer-parse/apple-news-handler.js | 2 +- packages/puppeteer-parse/bloomberg-handler.js | 2 +- packages/puppeteer-parse/derstandard-handler.js | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/packages/content-fetch/apple-news-handler.js b/packages/content-fetch/apple-news-handler.js index 8a69e5adf..0759dec23 100644 --- a/packages/content-fetch/apple-news-handler.js +++ b/packages/content-fetch/apple-news-handler.js @@ -27,7 +27,7 @@ exports.appleNewsHandler = { const response = await axios.get(url, { headers: { 'User-Agent': MOBILE_USER_AGENT } } ); const data = response.data; - const dom = new parseHTML(data).document; + const dom = parseHTML(data).document; // make sure its a valid URL by wrapping in new URL const u = new URL(dom.querySelector('span.click-here').parentNode.href); diff --git a/packages/content-fetch/bloomberg-handler.js b/packages/content-fetch/bloomberg-handler.js index d6caef6fd..d79a568bb 100644 --- a/packages/content-fetch/bloomberg-handler.js +++ b/packages/content-fetch/bloomberg-handler.js @@ -29,7 +29,7 @@ exports.bloombergHandler = { 'block_resources': false, } }) - const dom = new parseHTML(response.data).document; + const dom = parseHTML(response.data).document; return { title: dom.title, content: dom.querySelector('body').innerHTML, url: url } } catch (error) { console.error('error prehandling bloomberg url', error) diff --git a/packages/content-fetch/derstandard-handler.js b/packages/content-fetch/derstandard-handler.js index eddc9618f..a44db6f2a 100644 --- a/packages/content-fetch/derstandard-handler.js +++ b/packages/content-fetch/derstandard-handler.js @@ -23,7 +23,7 @@ 
exports.derstandardHandler = { const content = response.data; var title = undefined; - const dom = new parseHTML(content).document; + const dom = parseHTML(content).document; const titleElement = dom.querySelector('.article-title') if (!titleElement) { title = titleElement.textContent diff --git a/packages/puppeteer-parse/apple-news-handler.js b/packages/puppeteer-parse/apple-news-handler.js index 8a69e5adf..0759dec23 100644 --- a/packages/puppeteer-parse/apple-news-handler.js +++ b/packages/puppeteer-parse/apple-news-handler.js @@ -27,7 +27,7 @@ exports.appleNewsHandler = { const response = await axios.get(url, { headers: { 'User-Agent': MOBILE_USER_AGENT } } ); const data = response.data; - const dom = new parseHTML(data).document; + const dom = parseHTML(data).document; // make sure its a valid URL by wrapping in new URL const u = new URL(dom.querySelector('span.click-here').parentNode.href); diff --git a/packages/puppeteer-parse/bloomberg-handler.js b/packages/puppeteer-parse/bloomberg-handler.js index d6caef6fd..d79a568bb 100644 --- a/packages/puppeteer-parse/bloomberg-handler.js +++ b/packages/puppeteer-parse/bloomberg-handler.js @@ -29,7 +29,7 @@ exports.bloombergHandler = { 'block_resources': false, } }) - const dom = new parseHTML(response.data).document; + const dom = parseHTML(response.data).document; return { title: dom.title, content: dom.querySelector('body').innerHTML, url: url } } catch (error) { console.error('error prehandling bloomberg url', error) diff --git a/packages/puppeteer-parse/derstandard-handler.js b/packages/puppeteer-parse/derstandard-handler.js index eddc9618f..a44db6f2a 100644 --- a/packages/puppeteer-parse/derstandard-handler.js +++ b/packages/puppeteer-parse/derstandard-handler.js @@ -23,7 +23,7 @@ exports.derstandardHandler = { const content = response.data; var title = undefined; - const dom = new parseHTML(content).document; + const dom = parseHTML(content).document; const titleElement = dom.querySelector('.article-title') if 
(!titleElement) { title = titleElement.textContent From 37e55add98720da1be4ddcfc2b9380068c3e26cb Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Fri, 13 May 2022 12:09:05 +0800 Subject: [PATCH 6/7] Stop blocking stylesheet and media --- packages/content-fetch/fetch-content.js | 2 +- packages/puppeteer-parse/index.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/content-fetch/fetch-content.js b/packages/content-fetch/fetch-content.js index 76204fb98..ad70fec24 100644 --- a/packages/content-fetch/fetch-content.js +++ b/packages/content-fetch/fetch-content.js @@ -463,7 +463,7 @@ async function retrievePage(url) { await page.setRequestInterception(true); let requestCount = 0; page.on('request', request => { - if (['font', 'image', 'stylesheet', 'script', 'media'].includes(request.resourceType())) { + if (['font', 'image', 'script'].includes(request.resourceType())) { request.abort(); return; } diff --git a/packages/puppeteer-parse/index.js b/packages/puppeteer-parse/index.js index 2ac49838b..aa854f597 100644 --- a/packages/puppeteer-parse/index.js +++ b/packages/puppeteer-parse/index.js @@ -665,7 +665,7 @@ async function retrievePage(url) { await page.setRequestInterception(true); let requestCount = 0; page.on('request', request => { - if (['font', 'image', 'stylesheet', 'script', 'media'].includes(request.resourceType())) { + if (['font', 'image', 'script'].includes(request.resourceType())) { request.abort(); return; } From f5003c1370c36294f8f96ff83476138304683df4 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Fri, 13 May 2022 12:17:19 +0800 Subject: [PATCH 7/7] Stop blocking script --- packages/content-fetch/fetch-content.js | 9 ++++++++- packages/puppeteer-parse/index.js | 9 ++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/packages/content-fetch/fetch-content.js b/packages/content-fetch/fetch-content.js index ad70fec24..606980b29 100644 --- a/packages/content-fetch/fetch-content.js +++ 
b/packages/content-fetch/fetch-content.js @@ -463,7 +463,7 @@ async function retrievePage(url) { await page.setRequestInterception(true); let requestCount = 0; page.on('request', request => { - if (['font', 'image', 'script'].includes(request.resourceType())) { + if (['font', 'image', 'media'].includes(request.resourceType())) { request.abort(); return; } @@ -471,6 +471,13 @@ async function retrievePage(url) { request.abort(); return; } + if ( + request.resourceType() === 'script' && + request.url().toLowerCase().indexOf('mathjax') > -1 + ) { + request.abort(); + return + } request.continue(); }); diff --git a/packages/puppeteer-parse/index.js b/packages/puppeteer-parse/index.js index aa854f597..9700c550a 100644 --- a/packages/puppeteer-parse/index.js +++ b/packages/puppeteer-parse/index.js @@ -665,7 +665,7 @@ async function retrievePage(url) { await page.setRequestInterception(true); let requestCount = 0; page.on('request', request => { - if (['font', 'image', 'script'].includes(request.resourceType())) { + if (['font', 'image', 'media'].includes(request.resourceType())) { request.abort(); return; } @@ -673,6 +673,13 @@ async function retrievePage(url) { request.abort(); return; } + if ( + request.resourceType() === 'script' && + request.url().toLowerCase().indexOf('mathjax') > -1 + ) { + request.abort(); + return + } request.continue(); });