diff --git a/packages/api/package.json b/packages/api/package.json index 9060f0641..7d5420686 100644 --- a/packages/api/package.json +++ b/packages/api/package.json @@ -155,4 +155,4 @@ "volta": { "extends": "../../package.json" } -} \ No newline at end of file +} diff --git a/packages/api/src/readability.d.ts b/packages/api/src/readability.d.ts index 4722588cd..fcfcc1e55 100644 --- a/packages/api/src/readability.d.ts +++ b/packages/api/src/readability.d.ts @@ -166,6 +166,7 @@ declare module '@omnivore/readability' { /** Article published date */ publishedDate?: Date | null language?: string | null + documentElement: HTMLElement } } diff --git a/packages/api/src/utils/parser.ts b/packages/api/src/utils/parser.ts index ae57ee6d0..05fdcc3a2 100644 --- a/packages/api/src/utils/parser.ts +++ b/packages/api/src/utils/parser.ts @@ -231,9 +231,8 @@ export const parsePreparedContent = async ( labels: { source: 'parsePreparedContent' }, } - const { document, pageInfo } = preparedDocument - - if (!document) { + const { document: domContent, pageInfo } = preparedDocument + if (!domContent) { logger.info('No document') return { canonicalUrl: url, @@ -253,25 +252,30 @@ export const parsePreparedContent = async ( return { canonicalUrl: url, parsedContent: null, - domContent: document, + domContent, pageType: PageType.Unknown, } } const { title: pageInfoTitle, canonicalUrl } = pageInfo - let parsedContent = null + let parsedContent: Readability.ParseResult | null = null let pageType = PageType.Unknown let highlightData = undefined try { - const dom = parseHTML(document).document - pageType = parseOriginalContent(dom) + const document = parseHTML(domContent).document + pageType = parseOriginalContent(document) // Run readability - await preParseContent(url, dom) + await preParseContent(url, document) - parsedContent = await getReadabilityResult(url, document, dom, isNewsletter) + parsedContent = await getReadabilityResult( + url, + domContent, + document, + isNewsletter + ) if (!parsedContent || !parsedContent.content) { logger.info('No parsed content') @@ -281,7 +285,7 @@ export const parsePreparedContent = async ( const newDocument = { ...preparedDocument, - document: '
' + document + '', // wrap in body + document: '' + domContent + '', // wrap in body } return parsePreparedContent(url, newDocument, isNewsletter, false) } @@ -289,7 +293,7 @@ export const parsePreparedContent = async ( return { canonicalUrl, parsedContent, - domContent: document, + domContent, pageType, } } @@ -299,31 +303,29 @@ export const parsePreparedContent = async ( parsedContent.title = pageInfoTitle } + const newDocumentElement = parsedContent.documentElement // Format code blocks // TODO: we probably want to move this type of thing // to the handlers, and have some concept of postHandle - const newDom = parseHTML(parsedContent.content).document - const codeBlocks = newDom.querySelectorAll( - 'code, pre[class^="prism-"], pre[class^="language-"]' + const codeBlocks = newDocumentElement.querySelectorAll