From 5e239d2568f82d644cce301cc43c27dd40de61e4 Mon Sep 17 00:00:00 2001
From: Hongbo Wu <hongbo@omnivore.app>
Date: Wed, 24 Jan 2024 18:57:25 +0800
Subject: [PATCH] run readability in save-page instead of puppeteer

---
 packages/api/src/jobs/save_page.ts            |  15 +-
 packages/api/src/services/save_email.ts       |   1 -
 packages/api/src/services/save_page.ts        |  16 +-
 packages/api/src/utils/parser.ts              | 203 +++++++++---------
 packages/content-fetch/src/request_handler.ts |   1 -
 packages/puppeteer-parse/src/index.ts         | 142 ++----------
 6 files changed, 126 insertions(+), 252 deletions(-)
diff --git a/packages/api/src/jobs/save_page.ts b/packages/api/src/jobs/save_page.ts
index 08f650f38..3d143caf8 100644
--- a/packages/api/src/jobs/save_page.ts
+++ b/packages/api/src/jobs/save_page.ts
@@ -1,4 +1,3 @@
-import { Readability } from '@omnivore/readability'
 import axios from 'axios'
 import jwt from 'jsonwebtoken'
 import { promisify } from 'util'
@@ -63,7 +62,6 @@ interface FetchResult {
   title?: string
   content?: string
   contentType?: string
-  readabilityResult?: Readability.ParseResult
 }
 
 const isFetchResult = (obj: unknown): obj is FetchResult => {
@@ -297,7 +295,7 @@ export const savePageJob = async (data: Data, attemptsMade: number) => {
 
     // get the fetch result from cache
     const fetchedResult = await getCachedFetchResult(url)
-    const { title, contentType, readabilityResult } = fetchedResult
+    const { title, contentType } = fetchedResult
     let content = fetchedResult.content
 
     // for pdf content, we need to upload the pdf
@@ -350,7 +348,6 @@ export const savePageJob = async (data: Data, attemptsMade: number) => {
         clientRequestId: articleSavingRequestId,
         title,
         originalContent: content,
-        parseResult: readabilityResult,
         state: state ? (state as ArticleSavingRequestStatus) : undefined,
         labels: labels,
         rssFeedUrl,
@@ -362,13 +359,11 @@ export const savePageJob = async (data: Data, attemptsMade: number) => {
       user
     )
 
-    // if (result.__typename == 'SaveError') {
-    //   logger.error('Error saving page', { userId, url, result })
-    //   throw new Error('Error saving page')
-    // }
+    if (result.__typename == 'SaveError') {
+      throw new Error(result.message || 'Failed to save page')
+    }
 
-    // if the readability result is not parsed, the import is failed
-    isImported = !!readabilityResult
+    isImported = true
     isSaved = true
   } catch (e) {
     if (e instanceof Error) {
diff --git a/packages/api/src/services/save_email.ts b/packages/api/src/services/save_email.ts
index 092c2b98f..0c0f4f456 100644
--- a/packages/api/src/services/save_email.ts
+++ b/packages/api/src/services/save_email.ts
@@ -53,7 +53,6 @@ export const saveEmail = async (
         // can leave this empty for now
       },
     },
-    null,
     true
   )
 
diff --git a/packages/api/src/services/save_page.ts b/packages/api/src/services/save_page.ts
index a91f90549..9d55961e7 100644
--- a/packages/api/src/services/save_page.ts
+++ b/packages/api/src/services/save_page.ts
@@ -68,17 +68,13 @@ export const savePage = async (
   input: SavePageInput,
   user: User
 ): Promise<SaveResult> => {
-  const parseResult = await parsePreparedContent(
-    input.url,
-    {
-      document: input.originalContent,
-      pageInfo: {
-        title: input.title,
-        canonicalUrl: input.url,
-      },
+  const parseResult = await parsePreparedContent(input.url, {
+    document: input.originalContent,
+    pageInfo: {
+      title: input.title,
+      canonicalUrl: input.url,
     },
-    input.parseResult
-  )
+  })
   const [newSlug, croppedPathname] = createSlug(input.url, input.title)
   let slug = newSlug
   let clientRequestId = input.clientRequestId
diff --git a/packages/api/src/utils/parser.ts b/packages/api/src/utils/parser.ts
index 54373baea..ae57ee6d0 100644
--- a/packages/api/src/utils/parser.ts
+++ b/packages/api/src/utils/parser.ts
@@ -223,7 +223,6 @@ const getReadabilityResult = async (
 export const parsePreparedContent = async (
   url: string,
   preparedDocument: PreparedDocumentInput,
-  parseResult?: Readability.ParseResult | null,
   isNewsletter?: boolean,
   allowRetry = true
 ): Promise<ParsedContentPuppeteer> => {
@@ -232,9 +231,6 @@ export const parsePreparedContent = async (
     labels: { source: 'parsePreparedContent' },
   }
 
-  // If we have a parse result, use it
-  let article = parseResult || null
-  let highlightData = undefined
   const { document, pageInfo } = preparedDocument
 
   if (!document) {
@@ -262,137 +258,134 @@ export const parsePreparedContent = async (
     }
   }
 
-  let dom: Document | null = null
+  const { title: pageInfoTitle, canonicalUrl } = pageInfo
+
+  let parsedContent = null
+  let pageType = PageType.Unknown
+  let highlightData = undefined
 
   try {
-    dom = parseHTML(document).document
+    const dom = parseHTML(document).document
+    pageType = parseOriginalContent(dom)
 
-    if (!article) {
-      // Attempt to parse the article
-      // preParse content
-      dom = (await preParseContent(url, dom)) || dom
+    // Run readability
+    await preParseContent(url, dom)
 
-      article = await getReadabilityResult(url, document, dom, isNewsletter)
+    parsedContent = await getReadabilityResult(url, document, dom, isNewsletter)
+
+    if (!parsedContent || !parsedContent.content) {
+      logger.info('No parsed content')
+
+      if (allowRetry) {
+        logger.info('Retrying with content wrapped in html body')
+
+        const newDocument = {
+          ...preparedDocument,
+          document: '<html><body>' + document + '</body></html>', // wrap in body
+        }
+        return parsePreparedContent(url, newDocument, isNewsletter, false)
+      }
+
+      return {
+        canonicalUrl,
+        parsedContent,
+        domContent: document,
+        pageType,
+      }
     }
 
-    if (!article?.textContent && allowRetry) {
-      const newDocument = {
-        ...preparedDocument,
-        document: '<html><body>' + document + '</body></html>', // wrap in body
-      }
-      return parsePreparedContent(
-        url,
-        newDocument,
-        parseResult,
-        isNewsletter,
-        false
-      )
+    // use title if not found after running readability
+    if (!parsedContent.title && pageInfoTitle) {
+      parsedContent.title = pageInfoTitle
     }
 
     // Format code blocks
     // TODO: we probably want to move this type of thing
     // to the handlers, and have some concept of postHandle
-    if (article?.content) {
-      const articleDom = parseHTML(article.content).document
-      const codeBlocks = articleDom.querySelectorAll(
-        'code, pre[class^="prism-"], pre[class^="language-"]'
-      )
-      if (codeBlocks.length > 0) {
-        codeBlocks.forEach((e) => {
-          if (e.textContent) {
-            const att = hljs.highlightAuto(e.textContent)
-            const code = articleDom.createElement('code')
-            const langClass =
-              `hljs language-${att.language}` +
-              (att.second_best?.language
-                ? ` language-${att.second_best?.language}`
-                : '')
-            code.setAttribute('class', langClass)
-            code.innerHTML = att.value
-            e.replaceWith(code)
-          }
-        })
-        article.content = articleDom.documentElement.outerHTML
-      }
-
-      highlightData = findEmbeddedHighlight(articleDom.documentElement)
-
-      const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [
-        'omnivore-highlight-id',
-        'data-twitter-tweet-id',
-        'data-instagram-id',
-      ]
-
-      // Get the top level element?
-      const pageNode = articleDom.firstElementChild as HTMLElement
-      const nodesToVisitStack: [HTMLElement] = [pageNode]
-      const visitedNodeList = []
-
-      while (nodesToVisitStack.length > 0) {
-        const currentNode = nodesToVisitStack.pop()
-        if (
-          currentNode?.nodeType !== 1 ||
-          // Avoiding dynamic elements from being counted as anchor-allowed elements
-          ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES.some((attrib) =>
-            currentNode.hasAttribute(attrib)
-          )
-        ) {
-          continue
+    const newDom = parseHTML(parsedContent.content).document
+    const codeBlocks = newDom.querySelectorAll(
+      'code, pre[class^="prism-"], pre[class^="language-"]'
+    )
+    if (codeBlocks.length > 0) {
+      codeBlocks.forEach((e) => {
+        if (e.textContent) {
+          const att = hljs.highlightAuto(e.textContent)
+          const code = newDom.createElement('code')
+          const langClass =
+            `hljs language-${att.language}` +
+            (att.second_best?.language
+              ? ` language-${att.second_best?.language}`
+              : '')
+          code.setAttribute('class', langClass)
+          code.innerHTML = att.value
+          e.replaceWith(code)
         }
-        visitedNodeList.push(currentNode)
-        ;[].slice
-          .call(currentNode.childNodes)
-          .reverse()
-          .forEach(function (node) {
-            nodesToVisitStack.push(node)
-          })
-      }
-
-      visitedNodeList.shift()
-      visitedNodeList.forEach((node, index) => {
-        // start from index 1, index 0 reserved for anchor unknown.
-        node.setAttribute('data-omnivore-anchor-idx', (index + 1).toString())
       })
-
-      article.content = articleDom.documentElement.outerHTML
     }
 
+    highlightData = findEmbeddedHighlight(newDom.documentElement)
+
+    const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [
+      'omnivore-highlight-id',
+      'data-twitter-tweet-id',
+      'data-instagram-id',
+    ]
+
+    // Get the top level element?
+    const pageNode = newDom.firstElementChild as HTMLElement
+    const nodesToVisitStack: [HTMLElement] = [pageNode]
+    const visitedNodeList = []
+
+    while (nodesToVisitStack.length > 0) {
+      const currentNode = nodesToVisitStack.pop()
+      if (
+        currentNode?.nodeType !== 1 ||
+        // Avoiding dynamic elements from being counted as anchor-allowed elements
+        ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES.some((attrib) =>
+          currentNode.hasAttribute(attrib)
+        )
+      ) {
+        continue
+      }
+      visitedNodeList.push(currentNode)
+      ;[].slice
+        .call(currentNode.childNodes)
+        .reverse()
+        .forEach(function (node) {
+          nodesToVisitStack.push(node)
+        })
+    }
+
+    visitedNodeList.shift()
+    visitedNodeList.forEach((node, index) => {
+      // start from index 1, index 0 reserved for anchor unknown.
+      node.setAttribute('data-omnivore-anchor-idx', (index + 1).toString())
+    })
+
+    const newHtml = newDom.documentElement.outerHTML
     const newWindow = parseHTML('')
     const DOMPurify = createDOMPurify(newWindow)
     DOMPurify.addHook('uponSanitizeElement', domPurifySanitizeHook)
-    const clean = DOMPurify.sanitize(article?.content || '', DOM_PURIFY_CONFIG)
+    const cleanHtml = DOMPurify.sanitize(newHtml, DOM_PURIFY_CONFIG)
+    parsedContent.content = cleanHtml
 
-    Object.assign(article || {}, {
-      content: clean,
-      title: article?.title,
-      previewImage: article?.previewImage,
-      siteName: article?.siteName,
-      siteIcon: article?.siteIcon,
-      byline: article?.byline,
-      language: article?.language,
-    })
     logRecord.parseSuccess = true
   } catch (error) {
-    logger.info('Error parsing content', error)
+    logger.error('Error parsing content', error)
+
     Object.assign(logRecord, {
       parseSuccess: false,
       parseError: error,
     })
   }
 
-  const { title, canonicalUrl } = pageInfo
-
-  Object.assign(article || {}, {
-    title: article?.title || title,
-  })
-
-  logger.info('parse-article completed')
+  logger.info('parse-article completed', logRecord)
 
   return {
-    domContent: document,
-    parsedContent: article,
     canonicalUrl,
-    pageType: dom ? parseOriginalContent(dom) : PageType.Unknown,
+    parsedContent,
+    domContent: document,
+    pageType,
     highlightData,
   }
 }
diff --git a/packages/content-fetch/src/request_handler.ts b/packages/content-fetch/src/request_handler.ts
index 0106d8058..56356ec1e 100644
--- a/packages/content-fetch/src/request_handler.ts
+++ b/packages/content-fetch/src/request_handler.ts
@@ -50,7 +50,6 @@ interface FetchResult {
   title?: string
   content?: string
   contentType?: string
-  readabilityResult?: unknown
 }
 
 export const cacheFetchResult = async (fetchResult: FetchResult) => {
diff --git a/packages/puppeteer-parse/src/index.ts b/packages/puppeteer-parse/src/index.ts
index 2629689c2..a02a3595b 100644
--- a/packages/puppeteer-parse/src/index.ts
+++ b/packages/puppeteer-parse/src/index.ts
@@ -1,10 +1,7 @@
 /* eslint-disable @typescript-eslint/no-unsafe-member-access */
 /* eslint-disable @typescript-eslint/no-unsafe-assignment */
-import { preHandleContent, preParseContent } from '@omnivore/content-handler'
-import { Readability } from '@omnivore/readability'
+import { preHandleContent } from '@omnivore/content-handler'
 import axios from 'axios'
-import crypto from 'crypto'
-import createDOMPurify, { SanitizeElementHookEvent } from 'dompurify'
 // const { Storage } = require('@google-cloud/storage');
 import { parseHTML } from 'linkedom'
 import path from 'path'
@@ -13,7 +10,6 @@ import puppeteer from 'puppeteer-extra'
 import AdblockerPlugin from 'puppeteer-extra-plugin-adblocker'
 import StealthPlugin from 'puppeteer-extra-plugin-stealth'
 import Url from 'url'
-import { encode } from 'urlsafe-base64'
 
 // Add stealth plugin to hide puppeteer usage
 puppeteer.use(StealthPlugin())
@@ -28,12 +24,12 @@ puppeteer.use(AdblockerPlugin({ blockTrackers: true }))
 
 // const filePath = `${os.tmpdir()}/previewImage.png`
 
-const MOBILE_USER_AGENT =
-  'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
+// const MOBILE_USER_AGENT =
+//   'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
 const DESKTOP_USER_AGENT =
   'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
-const BOT_DESKTOP_USER_AGENT =
-  'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
+// const BOT_DESKTOP_USER_AGENT =
+//   'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
 const NON_BOT_DESKTOP_USER_AGENT =
   'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
 const NON_BOT_HOSTS = ['bloomberg.com', 'forbes.com']
@@ -156,8 +152,8 @@ export const fetchContent = async (
     page: Page | undefined,
     title: string | undefined,
     content: string | undefined,
-    contentType: string | undefined,
-    readabilityResult: Readability.ParseResult | null | undefined
+    contentType: string | undefined
+
   try {
     url = getUrl(url)
     if (!url) {
@@ -230,34 +226,26 @@ export const fetchContent = async (
       console.info('fallback to scrapingbee', url)
 
       const sbResult = await fetchContentWithScrapingBee(url)
-      content = sbResult.domContent
-      title = sbResult.title
-    } else {
-      throw e
+
+      return {
+        finalUrl: url,
+        title: sbResult.title,
+        content: sbResult.domContent,
+        contentType,
+      }
     }
+
+    throw e
   } finally {
     // close browser context if it was opened
     if (context) {
       await context.close()
     }
-    // save non pdf content
-    if (url && contentType !== 'application/pdf') {
-      // parse content if it is not empty
-      if (content) {
-        let document = parseHTML(content).document
-        // preParse content
-        const preParsedDom = await preParseContent(url, document)
-        if (preParsedDom) {
-          document = preParsedDom
-        }
-        readabilityResult = await getReadabilityResult(url, document)
-      }
-    }
 
     console.info(`content-fetch result`, logRecord)
   }
 
-  return { finalUrl: url, title, content, readabilityResult, contentType }
+  return { finalUrl: url, title, content, contentType }
 }
 
 function validateUrlString(url: string) {
@@ -741,99 +729,3 @@ async function retrieveHtml(page: Page, logRecord: Record<string, any>) {
 //   console.info(`preview-image`, logRecord);
 //   return res.redirect(`${process.env.PREVIEW_IMAGE_CDN_ORIGIN}/${destination}`);
 // }
-
-const DOM_PURIFY_CONFIG = {
-  ADD_TAGS: ['iframe'],
-  ADD_ATTR: ['allow', 'allowfullscreen', 'frameborder', 'scrolling'],
-  FORBID_ATTR: [
-    'data-ml-dynamic',
-    'data-ml-dynamic-type',
-    'data-orig-url',
-    'data-ml-id',
-    'data-ml',
-    'data-xid',
-    'data-feature',
-  ],
-}
-
-function domPurifySanitizeHook(node: Element, data: SanitizeElementHookEvent) {
-  if (data.tagName === 'iframe') {
-    const urlRegex = /^(https?:)?\/\/www\.youtube(-nocookie)?\.com\/embed\//i
-    const src = node.getAttribute('src') || ''
-    const dataSrc = node.getAttribute('data-src') || ''
-
-    if (src && urlRegex.test(src)) {
-      return
-    }
-
-    if (dataSrc && urlRegex.test(dataSrc)) {
-      node.setAttribute('src', dataSrc)
-      return
-    }
-
-    node.parentNode?.removeChild(node)
-  }
-}
-
-function getPurifiedContent(html: Document) {
-  const newWindow = parseHTML('')
-  const DOMPurify = createDOMPurify(newWindow)
-  DOMPurify.addHook('uponSanitizeElement', domPurifySanitizeHook)
-  const clean = DOMPurify.sanitize(html, DOM_PURIFY_CONFIG)
-  return parseHTML(clean).document
-}
-
-function signImageProxyUrl(url: string) {
-  return encode(
-    crypto
-      .createHmac('sha256', process.env.IMAGE_PROXY_SECRET || '')
-      .update(url)
-      .digest()
-  )
-}
-
-function createImageProxyUrl(url: string, width = 0, height = 0) {
-  if (!process.env.IMAGE_PROXY_URL || !process.env.IMAGE_PROXY_SECRET) {
-    return url
-  }
-
-  const urlWithOptions = `${url}#${width}x${height}`
-  const signature = signImageProxyUrl(urlWithOptions)
-
-  return `${process.env.IMAGE_PROXY_URL}/${width}x${height},s${signature}/${url}`
-}
-
-async function getReadabilityResult(url: string, document: Document) {
-  // First attempt to read the article as is.
-  // if that fails attempt to purify then read
-  const sources = [
-    () => {
-      return document
-    },
-    () => {
-      return getPurifiedContent(document)
-    },
-  ]
-
-  for (const source of sources) {
-    const document = source()
-    if (!document) {
-      continue
-    }
-
-    try {
-      const article = await new Readability(document, {
-        createImageProxyUrl,
-        url,
-      }).parse()
-
-      if (article) {
-        return article
-      }
-    } catch (error) {
-      console.log('parsing error for url', url, error)
-    }
-  }
-
-  return null
-}