Merge pull request #561 from omnivore-app/faster-save
Faster save by reusing JSDOM
This commit is contained in:
1
packages/api/src/readability.d.ts
vendored
1
packages/api/src/readability.d.ts
vendored
@@ -163,6 +163,7 @@ declare module '@omnivore/readability' {
|
||||
previewImage?: string
|
||||
/** Article published date */
|
||||
publishedDate?: Date
|
||||
dom?: Element
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -54,7 +54,6 @@ import {
|
||||
} from '../../utils/helpers'
|
||||
import {
|
||||
ParsedContentPuppeteer,
|
||||
parseOriginalContent,
|
||||
parsePreparedContent,
|
||||
} from '../../utils/parser'
|
||||
import { isSiteBlockedForParse } from '../../utils/blocked'
|
||||
@@ -230,14 +229,13 @@ export const createArticleResolver = authorized<
|
||||
const parseResults = await traceAs<Promise<ParsedContentPuppeteer>>(
|
||||
{ spanName: 'article.parse' },
|
||||
async (): Promise<ParsedContentPuppeteer> => {
|
||||
return await parsePreparedContent(url, preparedDocument)
|
||||
return parsePreparedContent(url, preparedDocument)
|
||||
}
|
||||
)
|
||||
parsedContent = parseResults.parsedContent
|
||||
canonicalUrl = parseResults.canonicalUrl
|
||||
domContent = parseResults.domContent
|
||||
|
||||
pageType = parseOriginalContent(url, domContent)
|
||||
pageType = parseResults.pageType
|
||||
} else if (!preparedDocument?.document) {
|
||||
// We have a URL but no document, so we try to send this to puppeteer
|
||||
// and return a dummy response.
|
||||
|
||||
@@ -1,9 +1,5 @@
|
||||
import { generateSlug, stringToHash, validatedDate } from '../utils/helpers'
|
||||
import {
|
||||
parseOriginalContent,
|
||||
parsePreparedContent,
|
||||
parseUrlMetadata,
|
||||
} from '../utils/parser'
|
||||
import { parsePreparedContent, parseUrlMetadata } from '../utils/parser'
|
||||
import normalizeUrl from 'normalize-url'
|
||||
import { PubsubClient } from '../datalayer/pubsub'
|
||||
import { ArticleSavingRequestStatus, Page } from '../elastic/types'
|
||||
@@ -44,7 +40,6 @@ export const saveEmail = async (
|
||||
const content = parseResult.parsedContent?.content || input.originalContent
|
||||
const slug = generateSlug(title)
|
||||
|
||||
const pageType = parseOriginalContent(url, input.originalContent)
|
||||
const metadata = await parseUrlMetadata(url)
|
||||
|
||||
const articleToSave: Page = {
|
||||
@@ -60,7 +55,7 @@ export const saveEmail = async (
|
||||
stripHash: true,
|
||||
stripWWW: false,
|
||||
}),
|
||||
pageType: pageType,
|
||||
pageType: parseResult.pageType,
|
||||
hash: stringToHash(content),
|
||||
image: metadata?.previewImage || parseResult.parsedContent?.previewImage,
|
||||
publishedAt: validatedDate(parseResult.parsedContent?.publishedDate),
|
||||
|
||||
@@ -3,7 +3,7 @@ import { homePageURL } from '../env'
|
||||
import { Maybe, SavePageInput, SaveResult } from '../generated/graphql'
|
||||
import { DataModels } from '../resolvers/types'
|
||||
import { generateSlug, stringToHash, validatedDate } from '../utils/helpers'
|
||||
import { parseOriginalContent, parsePreparedContent } from '../utils/parser'
|
||||
import { parsePreparedContent } from '../utils/parser'
|
||||
|
||||
import normalizeUrl from 'normalize-url'
|
||||
import { createPageSaveRequest } from './create_page_save_request'
|
||||
@@ -72,8 +72,6 @@ export const savePage = async (
|
||||
},
|
||||
})
|
||||
|
||||
const pageType = parseOriginalContent(input.url, input.originalContent)
|
||||
|
||||
const articleToSave: Page = {
|
||||
id: input.clientRequestId,
|
||||
slug,
|
||||
@@ -87,7 +85,7 @@ export const savePage = async (
|
||||
stripHash: true,
|
||||
stripWWW: false,
|
||||
}),
|
||||
pageType: pageType,
|
||||
pageType: parseResult.pageType,
|
||||
hash: stringToHash(parseResult.parsedContent?.content || input.url),
|
||||
image: parseResult.parsedContent?.previewImage,
|
||||
publishedAt: validatedDate(parseResult.parsedContent?.publishedDate),
|
||||
|
||||
@@ -80,6 +80,7 @@ export type ParsedContentPuppeteer = {
|
||||
domContent: string
|
||||
parsedContent: Readability.ParseResult | null
|
||||
canonicalUrl?: string | null
|
||||
pageType: PageType
|
||||
}
|
||||
|
||||
/* eslint-disable @typescript-eslint/no-explicit-any */
|
||||
@@ -101,9 +102,8 @@ type ArticleParseLogRecord = LogRecord & {
|
||||
|
||||
const DEBUG_MODE = process.env.DEBUG === 'true' || false
|
||||
|
||||
export const parseOriginalContent = (url: string, html: string): PageType => {
|
||||
const parseOriginalContent = (window: DOMWindow): PageType => {
|
||||
try {
|
||||
const { window } = new JSDOM(html, { url })
|
||||
const e = window.document.querySelector("head meta[property='og:type']")
|
||||
const content = e?.getAttribute('content')
|
||||
if (!content) {
|
||||
@@ -121,7 +121,7 @@ export const parseOriginalContent = (url: string, html: string): PageType => {
|
||||
return PageType.Website
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error('Error extracting og:type from content for url', url, error)
|
||||
logger.error('Error extracting og:type from content', error)
|
||||
}
|
||||
|
||||
return PageType.Unknown
|
||||
@@ -232,6 +232,7 @@ export const parsePreparedContent = async (
|
||||
canonicalUrl: url,
|
||||
parsedContent: null,
|
||||
domContent: preparedDocument.document,
|
||||
pageType: PageType.Unknown,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -253,9 +254,8 @@ export const parsePreparedContent = async (
|
||||
// Format code blocks
|
||||
// TODO: we probably want to move this type of thing
|
||||
// to the handlers, and have some concept of postHandle
|
||||
if (article?.content) {
|
||||
const cWindow = new JSDOM(article?.content).window
|
||||
cWindow.document.querySelectorAll('code').forEach((e) => {
|
||||
if (article?.dom) {
|
||||
article.dom.querySelectorAll('code').forEach((e) => {
|
||||
console.log(e.textContent)
|
||||
if (e.textContent) {
|
||||
const att = hljs.highlightAuto(e.textContent)
|
||||
@@ -270,7 +270,7 @@ export const parsePreparedContent = async (
|
||||
e.replaceWith(code)
|
||||
}
|
||||
})
|
||||
article.content = cWindow.document.body.outerHTML
|
||||
article.content = article.dom.outerHTML
|
||||
}
|
||||
|
||||
const newWindow = new JSDOM('').window
|
||||
@@ -310,6 +310,7 @@ export const parsePreparedContent = async (
|
||||
domContent: preparedDocument.document,
|
||||
parsedContent: article,
|
||||
canonicalUrl,
|
||||
pageType: parseOriginalContent(window),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -2899,6 +2899,7 @@ Readability.prototype = {
|
||||
siteIcon: metadata.siteIcon,
|
||||
previewImage: metadata.previewImage,
|
||||
publishedDate: metadata.publishedDate || publishedAt || this._articlePublishedDate,
|
||||
dom: articleContent,
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user