Merge pull request #561 from omnivore-app/faster-save

Faster save by reusing JSDOM
This commit is contained in:
Hongbo Wu
2022-05-07 10:30:40 +08:00
committed by GitHub
6 changed files with 16 additions and 22 deletions

View File

@ -163,6 +163,7 @@ declare module '@omnivore/readability' {
previewImage?: string
/** Article published date */
publishedDate?: Date
dom?: Element
}
}

View File

@ -54,7 +54,6 @@ import {
} from '../../utils/helpers'
import {
ParsedContentPuppeteer,
parseOriginalContent,
parsePreparedContent,
} from '../../utils/parser'
import { isSiteBlockedForParse } from '../../utils/blocked'
@ -230,14 +229,13 @@ export const createArticleResolver = authorized<
const parseResults = await traceAs<Promise<ParsedContentPuppeteer>>(
{ spanName: 'article.parse' },
async (): Promise<ParsedContentPuppeteer> => {
return await parsePreparedContent(url, preparedDocument)
return parsePreparedContent(url, preparedDocument)
}
)
parsedContent = parseResults.parsedContent
canonicalUrl = parseResults.canonicalUrl
domContent = parseResults.domContent
pageType = parseOriginalContent(url, domContent)
pageType = parseResults.pageType
} else if (!preparedDocument?.document) {
// We have a URL but no document, so we try to send this to puppeteer
// and return a dummy response.

View File

@ -1,9 +1,5 @@
import { generateSlug, stringToHash, validatedDate } from '../utils/helpers'
import {
parseOriginalContent,
parsePreparedContent,
parseUrlMetadata,
} from '../utils/parser'
import { parsePreparedContent, parseUrlMetadata } from '../utils/parser'
import normalizeUrl from 'normalize-url'
import { PubsubClient } from '../datalayer/pubsub'
import { ArticleSavingRequestStatus, Page } from '../elastic/types'
@ -44,7 +40,6 @@ export const saveEmail = async (
const content = parseResult.parsedContent?.content || input.originalContent
const slug = generateSlug(title)
const pageType = parseOriginalContent(url, input.originalContent)
const metadata = await parseUrlMetadata(url)
const articleToSave: Page = {
@ -60,7 +55,7 @@ export const saveEmail = async (
stripHash: true,
stripWWW: false,
}),
pageType: pageType,
pageType: parseResult.pageType,
hash: stringToHash(content),
image: metadata?.previewImage || parseResult.parsedContent?.previewImage,
publishedAt: validatedDate(parseResult.parsedContent?.publishedDate),

View File

@ -3,7 +3,7 @@ import { homePageURL } from '../env'
import { Maybe, SavePageInput, SaveResult } from '../generated/graphql'
import { DataModels } from '../resolvers/types'
import { generateSlug, stringToHash, validatedDate } from '../utils/helpers'
import { parseOriginalContent, parsePreparedContent } from '../utils/parser'
import { parsePreparedContent } from '../utils/parser'
import normalizeUrl from 'normalize-url'
import { createPageSaveRequest } from './create_page_save_request'
@ -72,8 +72,6 @@ export const savePage = async (
},
})
const pageType = parseOriginalContent(input.url, input.originalContent)
const articleToSave: Page = {
id: input.clientRequestId,
slug,
@ -87,7 +85,7 @@ export const savePage = async (
stripHash: true,
stripWWW: false,
}),
pageType: pageType,
pageType: parseResult.pageType,
hash: stringToHash(parseResult.parsedContent?.content || input.url),
image: parseResult.parsedContent?.previewImage,
publishedAt: validatedDate(parseResult.parsedContent?.publishedDate),

View File

@ -80,6 +80,7 @@ export type ParsedContentPuppeteer = {
domContent: string
parsedContent: Readability.ParseResult | null
canonicalUrl?: string | null
pageType: PageType
}
/* eslint-disable @typescript-eslint/no-explicit-any */
@ -101,9 +102,8 @@ type ArticleParseLogRecord = LogRecord & {
const DEBUG_MODE = process.env.DEBUG === 'true' || false
export const parseOriginalContent = (url: string, html: string): PageType => {
const parseOriginalContent = (window: DOMWindow): PageType => {
try {
const { window } = new JSDOM(html, { url })
const e = window.document.querySelector("head meta[property='og:type']")
const content = e?.getAttribute('content')
if (!content) {
@ -121,7 +121,7 @@ export const parseOriginalContent = (url: string, html: string): PageType => {
return PageType.Website
}
} catch (error) {
logger.error('Error extracting og:type from content for url', url, error)
logger.error('Error extracting og:type from content', error)
}
return PageType.Unknown
@ -232,6 +232,7 @@ export const parsePreparedContent = async (
canonicalUrl: url,
parsedContent: null,
domContent: preparedDocument.document,
pageType: PageType.Unknown,
}
}
@ -253,9 +254,8 @@ export const parsePreparedContent = async (
// Format code blocks
// TODO: we probably want to move this type of thing
// to the handlers, and have some concept of postHandle
if (article?.content) {
const cWindow = new JSDOM(article?.content).window
cWindow.document.querySelectorAll('code').forEach((e) => {
if (article?.dom) {
article.dom.querySelectorAll('code').forEach((e) => {
console.log(e.textContent)
if (e.textContent) {
const att = hljs.highlightAuto(e.textContent)
@ -270,7 +270,7 @@ export const parsePreparedContent = async (
e.replaceWith(code)
}
})
article.content = cWindow.document.body.outerHTML
article.content = article.dom.outerHTML
}
const newWindow = new JSDOM('').window
@ -310,6 +310,7 @@ export const parsePreparedContent = async (
domContent: preparedDocument.document,
parsedContent: article,
canonicalUrl,
pageType: parseOriginalContent(window),
}
}

View File

@ -2899,6 +2899,7 @@ Readability.prototype = {
siteIcon: metadata.siteIcon,
previewImage: metadata.previewImage,
publishedDate: metadata.publishedDate || publishedAt || this._articlePublishedDate,
dom: articleContent,
};
}
};