From 7c6b810522991ea470283063134c09f322328e8b Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Fri, 6 May 2022 10:53:36 +0800 Subject: [PATCH 1/3] Remove redundant JSDOM --- packages/api/src/resolvers/article/index.ts | 4 +--- packages/api/src/services/save_email.ts | 9 ++------- packages/api/src/services/save_page.ts | 6 ++---- packages/api/src/utils/parser.ts | 8 +++++--- 4 files changed, 10 insertions(+), 17 deletions(-) diff --git a/packages/api/src/resolvers/article/index.ts b/packages/api/src/resolvers/article/index.ts index b7865a5c6..1eee8e0ae 100644 --- a/packages/api/src/resolvers/article/index.ts +++ b/packages/api/src/resolvers/article/index.ts @@ -54,7 +54,6 @@ import { } from '../../utils/helpers' import { ParsedContentPuppeteer, - parseOriginalContent, parsePreparedContent, } from '../../utils/parser' import { isSiteBlockedForParse } from '../../utils/blocked' @@ -236,8 +235,7 @@ export const createArticleResolver = authorized< parsedContent = parseResults.parsedContent canonicalUrl = parseResults.canonicalUrl domContent = parseResults.domContent - - pageType = parseOriginalContent(url, domContent) + pageType = parseResults.pageType } else if (!preparedDocument?.document) { // We have a URL but no document, so we try to send this to puppeteer // and return a dummy response. diff --git a/packages/api/src/services/save_email.ts b/packages/api/src/services/save_email.ts index e62175c47..95ee5c576 100644 --- a/packages/api/src/services/save_email.ts +++ b/packages/api/src/services/save_email.ts @@ -1,9 +1,5 @@ import { generateSlug, stringToHash, validatedDate } from '../utils/helpers' -import { - parseOriginalContent, - parsePreparedContent, - parseUrlMetadata, -} from '../utils/parser' +import { parsePreparedContent, parseUrlMetadata } from '../utils/parser' import normalizeUrl from 'normalize-url' import { PubsubClient } from '../datalayer/pubsub' import { ArticleSavingRequestStatus, Page } from '../elastic/types' @@ -44,7 +40,6 @@ export const saveEmail = async ( const content = parseResult.parsedContent?.content || input.originalContent const slug = generateSlug(title) - const pageType = parseOriginalContent(url, input.originalContent) const metadata = await parseUrlMetadata(url) const articleToSave: Page = { @@ -60,7 +55,7 @@ export const saveEmail = async ( stripHash: true, stripWWW: false, }), - pageType: pageType, + pageType: parseResult.pageType, hash: stringToHash(content), image: metadata?.previewImage || parseResult.parsedContent?.previewImage, publishedAt: validatedDate(parseResult.parsedContent?.publishedDate), diff --git a/packages/api/src/services/save_page.ts b/packages/api/src/services/save_page.ts index e6fd79110..f56f95884 100644 --- a/packages/api/src/services/save_page.ts +++ b/packages/api/src/services/save_page.ts @@ -3,7 +3,7 @@ import { homePageURL } from '../env' import { Maybe, SavePageInput, SaveResult } from '../generated/graphql' import { DataModels } from '../resolvers/types' import { generateSlug, stringToHash, validatedDate } from '../utils/helpers' -import { parseOriginalContent, parsePreparedContent } from '../utils/parser' +import { parsePreparedContent } from '../utils/parser' import normalizeUrl from 'normalize-url' import { createPageSaveRequest } from './create_page_save_request' @@ -72,8 +72,6 @@ export const savePage = async ( }, }) - const pageType = parseOriginalContent(input.url, input.originalContent) - const articleToSave: Page = { id: input.clientRequestId, slug, @@ -87,7 +85,7 @@ export const savePage = async ( stripHash: true, stripWWW: false, }), - pageType: pageType, + pageType: parseResult.pageType, hash: stringToHash(parseResult.parsedContent?.content || input.url), image: parseResult.parsedContent?.previewImage, publishedAt: validatedDate(parseResult.parsedContent?.publishedDate), diff --git a/packages/api/src/utils/parser.ts b/packages/api/src/utils/parser.ts index 3e7de931f..94e8502cb 100644 --- a/packages/api/src/utils/parser.ts +++ b/packages/api/src/utils/parser.ts @@ -80,6 +80,7 @@ export type ParsedContentPuppeteer = { domContent: string parsedContent: Readability.ParseResult | null canonicalUrl?: string | null + pageType: PageType } /* eslint-disable @typescript-eslint/no-explicit-any */ @@ -101,9 +102,8 @@ type ArticleParseLogRecord = LogRecord & { const DEBUG_MODE = process.env.DEBUG === 'true' || false -export const parseOriginalContent = (url: string, html: string): PageType => { +const parseOriginalContent = (window: DOMWindow): PageType => { try { - const { window } = new JSDOM(html, { url }) const e = window.document.querySelector("head meta[property='og:type']") const content = e?.getAttribute('content') if (!content) { @@ -121,7 +121,7 @@ export const parseOriginalContent = (url: string, html: string): PageType => { return PageType.Website } } catch (error) { - logger.error('Error extracting og:type from content for url', url, error) + logger.error('Error extracting og:type from content', error) } return PageType.Unknown @@ -232,6 +232,7 @@ export const parsePreparedContent = async ( canonicalUrl: url, parsedContent: null, domContent: preparedDocument.document, + pageType: PageType.Unknown, } } @@ -310,6 +311,7 @@ export const parsePreparedContent = async ( domContent: preparedDocument.document, parsedContent: article, canonicalUrl, + pageType: parseOriginalContent(window), } } From 5f5076e86410c853e35036fe828fbad0ab21a91a Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Fri, 6 May 2022 12:20:54 +0800 Subject: [PATCH 2/3] Highlight code element without reinitialize jsdom --- packages/api/src/readability.d.ts | 3 +++ packages/api/src/resolvers/article/index.ts | 2 +- packages/api/src/utils/parser.ts | 8 ++++---- packages/readabilityjs/Readability.js | 1 + 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/packages/api/src/readability.d.ts b/packages/api/src/readability.d.ts index b18e6d4ba..d6a0128ee 100644 --- a/packages/api/src/readability.d.ts +++ b/packages/api/src/readability.d.ts @@ -5,6 +5,8 @@ // TypeScript Version: 2.2 declare module '@omnivore/readability' { + import { DOMWindow } from 'jsdom' + /** * A standalone version of the readability library used for Firefox Reader View. * @@ -163,6 +165,7 @@ declare module '@omnivore/readability' { previewImage?: string /** Article published date */ publishedDate?: Date + window?: DOMWindow } } diff --git a/packages/api/src/resolvers/article/index.ts b/packages/api/src/resolvers/article/index.ts index 1eee8e0ae..7494c32ab 100644 --- a/packages/api/src/resolvers/article/index.ts +++ b/packages/api/src/resolvers/article/index.ts @@ -229,7 +229,7 @@ export const createArticleResolver = authorized< const parseResults = await traceAs>( { spanName: 'article.parse' }, async (): Promise => { - return await parsePreparedContent(url, preparedDocument) + return parsePreparedContent(url, preparedDocument) } ) parsedContent = parseResults.parsedContent diff --git a/packages/api/src/utils/parser.ts b/packages/api/src/utils/parser.ts index 94e8502cb..6680e2dd8 100644 --- a/packages/api/src/utils/parser.ts +++ b/packages/api/src/utils/parser.ts @@ -254,9 +254,9 @@ export const parsePreparedContent = async ( // Format code blocks // TODO: we probably want to move this type of thing // to the handlers, and have some concept of postHandle - if (article?.content) { - const cWindow = new JSDOM(article?.content).window - cWindow.document.querySelectorAll('code').forEach((e) => { + if (article?.window) { + // const cWindow = new JSDOM(article?.content).window + article.window.document.querySelectorAll('code').forEach((e) => { console.log(e.textContent) if (e.textContent) { const att = hljs.highlightAuto(e.textContent) @@ -271,7 +271,7 @@ export const parsePreparedContent = async ( e.replaceWith(code) } }) - article.content = cWindow.document.body.outerHTML + article.content = article.window.document.body.outerHTML } const newWindow = new JSDOM('').window diff --git a/packages/readabilityjs/Readability.js b/packages/readabilityjs/Readability.js index bdaa3f029..999f85cec 100644 --- a/packages/readabilityjs/Readability.js +++ b/packages/readabilityjs/Readability.js @@ -2899,6 +2899,7 @@ Readability.prototype = { siteIcon: metadata.siteIcon, previewImage: metadata.previewImage, publishedDate: metadata.publishedDate || publishedAt || this._articlePublishedDate, + window: articleContent, }; } }; From eaad96acddb903cebc6b09b24368dfeb3b01d6a8 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Fri, 6 May 2022 12:29:08 +0800 Subject: [PATCH 3/3] Return parsed dom back to backend --- packages/api/src/readability.d.ts | 4 +--- packages/api/src/utils/parser.ts | 7 +++---- packages/readabilityjs/Readability.js | 2 +- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/packages/api/src/readability.d.ts b/packages/api/src/readability.d.ts index d6a0128ee..54f8846ec 100644 --- a/packages/api/src/readability.d.ts +++ b/packages/api/src/readability.d.ts @@ -5,8 +5,6 @@ // TypeScript Version: 2.2 declare module '@omnivore/readability' { - import { DOMWindow } from 'jsdom' - /** * A standalone version of the readability library used for Firefox Reader View. * @@ -165,7 +163,7 @@ declare module '@omnivore/readability' { previewImage?: string /** Article published date */ publishedDate?: Date - window?: DOMWindow + dom?: Element } } diff --git a/packages/api/src/utils/parser.ts b/packages/api/src/utils/parser.ts index 6680e2dd8..2472988b2 100644 --- a/packages/api/src/utils/parser.ts +++ b/packages/api/src/utils/parser.ts @@ -254,9 +254,8 @@ export const parsePreparedContent = async ( // Format code blocks // TODO: we probably want to move this type of thing // to the handlers, and have some concept of postHandle - if (article?.window) { - // const cWindow = new JSDOM(article?.content).window - article.window.document.querySelectorAll('code').forEach((e) => { + if (article?.dom) { + article.dom.querySelectorAll('code').forEach((e) => { console.log(e.textContent) if (e.textContent) { const att = hljs.highlightAuto(e.textContent) @@ -271,7 +270,7 @@ export const parsePreparedContent = async ( e.replaceWith(code) } }) - article.content = article.window.document.body.outerHTML + article.content = article.dom.outerHTML } const newWindow = new JSDOM('').window diff --git a/packages/readabilityjs/Readability.js b/packages/readabilityjs/Readability.js index 999f85cec..d1bce8bdb 100644 --- a/packages/readabilityjs/Readability.js +++ b/packages/readabilityjs/Readability.js @@ -2899,7 +2899,7 @@ Readability.prototype = { siteIcon: metadata.siteIcon, previewImage: metadata.previewImage, publishedDate: metadata.publishedDate || publishedAt || this._articlePublishedDate, - window: articleContent, + dom: articleContent, }; } };