diff --git a/packages/api/src/readability.d.ts b/packages/api/src/readability.d.ts index b18e6d4ba..54f8846ec 100644 --- a/packages/api/src/readability.d.ts +++ b/packages/api/src/readability.d.ts @@ -163,6 +163,7 @@ declare module '@omnivore/readability' { previewImage?: string /** Article published date */ publishedDate?: Date + dom?: Element } } diff --git a/packages/api/src/resolvers/article/index.ts b/packages/api/src/resolvers/article/index.ts index b7865a5c6..7494c32ab 100644 --- a/packages/api/src/resolvers/article/index.ts +++ b/packages/api/src/resolvers/article/index.ts @@ -54,7 +54,6 @@ import { } from '../../utils/helpers' import { ParsedContentPuppeteer, - parseOriginalContent, parsePreparedContent, } from '../../utils/parser' import { isSiteBlockedForParse } from '../../utils/blocked' @@ -230,14 +229,13 @@ export const createArticleResolver = authorized< const parseResults = await traceAs>( { spanName: 'article.parse' }, async (): Promise => { - return await parsePreparedContent(url, preparedDocument) + return parsePreparedContent(url, preparedDocument) } ) parsedContent = parseResults.parsedContent canonicalUrl = parseResults.canonicalUrl domContent = parseResults.domContent - - pageType = parseOriginalContent(url, domContent) + pageType = parseResults.pageType } else if (!preparedDocument?.document) { // We have a URL but no document, so we try to send this to puppeteer // and return a dummy response. diff --git a/packages/api/src/services/save_email.ts b/packages/api/src/services/save_email.ts index e62175c47..95ee5c576 100644 --- a/packages/api/src/services/save_email.ts +++ b/packages/api/src/services/save_email.ts @@ -1,9 +1,5 @@ import { generateSlug, stringToHash, validatedDate } from '../utils/helpers' -import { - parseOriginalContent, - parsePreparedContent, - parseUrlMetadata, -} from '../utils/parser' +import { parsePreparedContent, parseUrlMetadata } from '../utils/parser' import normalizeUrl from 'normalize-url' import { PubsubClient } from '../datalayer/pubsub' import { ArticleSavingRequestStatus, Page } from '../elastic/types' @@ -44,7 +40,6 @@ export const saveEmail = async ( const content = parseResult.parsedContent?.content || input.originalContent const slug = generateSlug(title) - const pageType = parseOriginalContent(url, input.originalContent) const metadata = await parseUrlMetadata(url) const articleToSave: Page = { @@ -60,7 +55,7 @@ export const saveEmail = async ( stripHash: true, stripWWW: false, }), - pageType: pageType, + pageType: parseResult.pageType, hash: stringToHash(content), image: metadata?.previewImage || parseResult.parsedContent?.previewImage, publishedAt: validatedDate(parseResult.parsedContent?.publishedDate), diff --git a/packages/api/src/services/save_page.ts b/packages/api/src/services/save_page.ts index e6fd79110..f56f95884 100644 --- a/packages/api/src/services/save_page.ts +++ b/packages/api/src/services/save_page.ts @@ -3,7 +3,7 @@ import { homePageURL } from '../env' import { Maybe, SavePageInput, SaveResult } from '../generated/graphql' import { DataModels } from '../resolvers/types' import { generateSlug, stringToHash, validatedDate } from '../utils/helpers' -import { parseOriginalContent, parsePreparedContent } from '../utils/parser' +import { parsePreparedContent } from '../utils/parser' import normalizeUrl from 'normalize-url' import { createPageSaveRequest } from './create_page_save_request' @@ -72,8 +72,6 @@ export const savePage = async ( }, }) - const pageType = parseOriginalContent(input.url, input.originalContent) - const articleToSave: Page = { id: input.clientRequestId, slug, @@ -87,7 +85,7 @@ export const savePage = async ( stripHash: true, stripWWW: false, }), - pageType: pageType, + pageType: parseResult.pageType, hash: stringToHash(parseResult.parsedContent?.content || input.url), image: parseResult.parsedContent?.previewImage, publishedAt: validatedDate(parseResult.parsedContent?.publishedDate), diff --git a/packages/api/src/utils/parser.ts b/packages/api/src/utils/parser.ts index 3e7de931f..2472988b2 100644 --- a/packages/api/src/utils/parser.ts +++ b/packages/api/src/utils/parser.ts @@ -80,6 +80,7 @@ export type ParsedContentPuppeteer = { domContent: string parsedContent: Readability.ParseResult | null canonicalUrl?: string | null + pageType: PageType } /* eslint-disable @typescript-eslint/no-explicit-any */ @@ -101,9 +102,8 @@ type ArticleParseLogRecord = LogRecord & { const DEBUG_MODE = process.env.DEBUG === 'true' || false -export const parseOriginalContent = (url: string, html: string): PageType => { +const parseOriginalContent = (window: DOMWindow): PageType => { try { - const { window } = new JSDOM(html, { url }) const e = window.document.querySelector("head meta[property='og:type']") const content = e?.getAttribute('content') if (!content) { @@ -121,7 +121,7 @@ export const parseOriginalContent = (url: string, html: string): PageType => { return PageType.Website } } catch (error) { - logger.error('Error extracting og:type from content for url', url, error) + logger.error('Error extracting og:type from content', error) } return PageType.Unknown @@ -232,6 +232,7 @@ export const parsePreparedContent = async ( canonicalUrl: url, parsedContent: null, domContent: preparedDocument.document, + pageType: PageType.Unknown, } } @@ -253,9 +254,8 @@ export const parsePreparedContent = async ( // Format code blocks // TODO: we probably want to move this type of thing // to the handlers, and have some concept of postHandle - if (article?.content) { - const cWindow = new JSDOM(article?.content).window - cWindow.document.querySelectorAll('code').forEach((e) => { + if (article?.dom) { + article.dom.querySelectorAll('code').forEach((e) => { console.log(e.textContent) if (e.textContent) { const att = hljs.highlightAuto(e.textContent) @@ -270,7 +270,7 @@ export const parsePreparedContent = async ( e.replaceWith(code) } }) - article.content = cWindow.document.body.outerHTML + article.content = article.dom.outerHTML } const newWindow = new JSDOM('').window @@ -310,6 +310,7 @@ export const parsePreparedContent = async ( domContent: preparedDocument.document, parsedContent: article, canonicalUrl, + pageType: parseOriginalContent(window), } } diff --git a/packages/readabilityjs/Readability.js b/packages/readabilityjs/Readability.js index bdaa3f029..d1bce8bdb 100644 --- a/packages/readabilityjs/Readability.js +++ b/packages/readabilityjs/Readability.js @@ -2899,6 +2899,7 @@ Readability.prototype = { siteIcon: metadata.siteIcon, previewImage: metadata.previewImage, publishedDate: metadata.publishedDate || publishedAt || this._articlePublishedDate, + dom: articleContent, }; } };