856 lines
23 KiB
TypeScript
856 lines
23 KiB
TypeScript
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
|
|
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
|
|
/* eslint-disable @typescript-eslint/no-unused-vars */
|
|
/* eslint-disable @typescript-eslint/no-base-to-string */
|
|
|
|
import { preParseContent } from '@omnivore/content-handler'
|
|
import { Readability } from '@omnivore/readability'
|
|
import addressparser from 'addressparser'
|
|
import axios from 'axios'
|
|
import createDOMPurify, { SanitizeElementHookEvent } from 'dompurify'
|
|
import * as hljs from 'highlightjs'
|
|
import { decode } from 'html-entities'
|
|
import * as jwt from 'jsonwebtoken'
|
|
import { parseHTML } from 'linkedom'
|
|
import { NodeHtmlMarkdown, TranslatorConfigObject } from 'node-html-markdown'
|
|
import { ElementNode } from 'node-html-markdown/dist/nodes'
|
|
import Parser from 'rss-parser'
|
|
import { parser } from 'sax'
|
|
import { ILike } from 'typeorm'
|
|
import { promisify } from 'util'
|
|
import { v4 as uuid } from 'uuid'
|
|
import { Highlight } from '../entity/highlight'
|
|
import { StatusType } from '../entity/user'
|
|
import { env } from '../env'
|
|
import { PageType, PreparedDocumentInput } from '../generated/graphql'
|
|
import { userRepository } from '../repository/user'
|
|
import { ArticleFormat } from '../resolvers/article'
|
|
import {
|
|
EmbeddedHighlightData,
|
|
findEmbeddedHighlight,
|
|
getArticleTextNodes,
|
|
highlightIdAttribute,
|
|
makeHighlightNodeAttributes,
|
|
} from './highlightGenerator'
|
|
import { createImageProxyUrl } from './imageproxy'
|
|
import { buildLogger, LogRecord } from './logger'
|
|
|
|
/** A feed entry produced by the OPML, HTML and feed parsers in this module. */
interface Feed {
  /** Human-readable name of the feed. */
  title: string
  /** Address of the feed. */
  url: string
  /** Kind of feed, e.g. 'rss' or 'telegram'. */
  type: string
  /** Image associated with the feed, when one is available. */
  thumbnail?: string
  /** Short description of the feed, when one is available. */
  description?: string
}
|
|
|
|
// Logger shared by every helper in this module.
const logger = buildLogger('utils.parse')
// Promise-returning wrapper around the callback-style `jwt.sign`.
const signToken = promisify(jwt.sign)
|
|
|
|
// HTTP client used to download HTML pages: requests are aborted after
// 5 seconds and the response body is kept as plain text.
const axiosInstance = axios.create({
  responseType: 'text',
  timeout: 5000,
  headers: {
    Accept: 'text/html',
    'User-Agent': 'Mozilla/5.0',
  },
})
|
|
|
|
/**
 * Content types that `parsePreparedContent` is willing to parse; documents
 * reporting any other content type are returned unparsed.
 */
export const ALLOWED_CONTENT_TYPES = [
  'text/html',
  'application/octet-stream',
  'text/plain',
]
|
|
|
|
// Options handed to DOMPurify.sanitize: iframes (and the attributes used to
// embed them) are added to the allow-list, while the listed data-*
// attributes are always stripped.
const DOM_PURIFY_CONFIG = {
  ADD_TAGS: ['iframe'],
  ADD_ATTR: ['allow', 'allowfullscreen', 'frameborder', 'scrolling'],
  FORBID_ATTR: [
    'data-ml-dynamic',
    'data-ml-dynamic-type',
    'data-orig-url',
    'data-ml-id',
    'data-ml',
    'data-xid',
    'data-feature',
  ],
}
// Marker searched for in, and stripped from, email subjects.
const ARTICLE_PREFIX = 'omnivore:'
|
|
|
|
/** Prefix of the placeholder URLs built by `generateUniqueUrl`. */
export const FAKE_URL_PREFIX = 'https://omnivore.app/no_url?q='

/** Options passed to the rss-parser `Parser` constructor. */
export const RSS_PARSER_CONFIG = {
  // give up after 5 seconds
  timeout: 5000,
  headers: {
    // some rss feeds refuse requests that carry no browser-like user agent
    'User-Agent':
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
    // feed formats in order of preference, HTML last
    Accept:
      'application/rss+xml, application/rdf+xml;q=0.8, application/atom+xml;q=0.6, application/xml;q=0.4, text/xml;q=0.4, text/html;q=0.2',
  },
}
|
|
|
|
/**
 * DOMPurify `uponSanitizeElement` hook that decides the fate of iframes.
 *
 * An iframe survives only when its `src` (or, failing that, its `data-src`)
 * points at a YouTube embed; in the `data-src` case the value is copied into
 * `src`. Every other iframe is detached from its parent. Non-iframe elements
 * are left untouched.
 */
const domPurifySanitizeHook = (
  node: Element,
  data: SanitizeElementHookEvent
): void => {
  if (data.tagName !== 'iframe') {
    return
  }

  const youtubeEmbed = /^(https?:)?\/\/www\.youtube(-nocookie)?\.com\/embed\//i

  const directSrc = node.getAttribute('src') || ''
  if (youtubeEmbed.test(directSrc)) {
    return
  }

  const lazySrc = node.getAttribute('data-src') || ''
  if (youtubeEmbed.test(lazySrc)) {
    // promote the lazily-loaded address to a real src
    node.setAttribute('src', lazySrc)
    return
  }

  node.parentNode?.removeChild(node)
}
|
|
|
|
/** Result returned by `parsePreparedContent`. */
export type ParsedContentPuppeteer = {
  /** The HTML that was handed to the parser ('' when none was supplied). */
  domContent: string
  /** Readability output, or null when nothing could be parsed. */
  parsedContent: Readability.ParseResult | null
  /** Canonical URL of the page, when known. */
  canonicalUrl?: string | null
  /** Page classification derived from the document's og:type meta tag. */
  pageType: PageType
  /** Highlight data found embedded in the parsed article, if any. */
  highlightData?: EmbeddedHighlightData
}
|
|
|
|
/* eslint-disable @typescript-eslint/no-explicit-any */
/**
 * Structured log record describing one article parse attempt.
 * Only `url` is mandatory; every other field is filled in by whichever
 * stage has something to report.
 */
type ArticleParseLogRecord = LogRecord & {
  url: string
  userAgent?: string
  pageInfo?: Record<string, any>
  blockedByClient?: boolean
  parsedOrigin?: boolean
  origin?: string
  puppeteerSuccess?: boolean
  puppeteerError?: Record<string, any>
  parseSuccess?: boolean
  parseError?: Record<string, any>
  scrollError?: boolean
  isAllowedContentType?: boolean
}
/* eslint-enable @typescript-eslint/no-explicit-any */
|
|
|
|
// True only when the DEBUG environment variable is exactly 'true'.
const DEBUG_MODE = process.env.DEBUG === 'true'
|
|
|
|
/**
 * Classifies a page from the `og:type` meta tag in its head.
 *
 * @param document - document to inspect
 * @returns the matching PageType; any og:type starting with 'video' maps to
 * Video, and a missing tag, an unrecognised value or an error while reading
 * the document all map to Unknown
 */
const parseOriginalContent = (document: Document): PageType => {
  try {
    const ogType = document
      .querySelector("head meta[property='og:type']")
      ?.getAttribute('content')
    if (!ogType) {
      return PageType.Unknown
    }

    const normalized = ogType.toLowerCase()
    // a Map (rather than a plain object) so that values such as
    // 'constructor' can never match an inherited property
    const exactMatches = new Map<string, PageType>([
      ['article', PageType.Article],
      ['book', PageType.Book],
      ['profile', PageType.Profile],
      ['website', PageType.Website],
      ['tweet', PageType.Tweet],
      ['image', PageType.Image],
    ])

    const exact = exactMatches.get(normalized)
    if (exact !== undefined) {
      return exact
    }

    return normalized.startsWith('video') ? PageType.Video : PageType.Unknown
  } catch (error) {
    logger.error('Error extracting og:type from content', error)
    return PageType.Unknown
  }
}
|
|
|
|
/**
 * Sanitizes `html` with DOMPurify (using the module's config and iframe
 * hook) and parses the cleaned markup into a new document.
 */
const getPurifiedContent = (html: string): Document => {
  const purifier = createDOMPurify(parseHTML(''))
  purifier.addHook('uponSanitizeElement', domPurifySanitizeHook)

  const sanitizedHtml = purifier.sanitize(html, DOM_PURIFY_CONFIG)
  return parseHTML(sanitizedHtml).document
}
|
|
|
|
/**
 * Runs Readability, first on the supplied document and, if that yields
 * nothing (or no document was supplied), on a purified copy of `html`.
 *
 * @param url - address of the article, forwarded to Readability
 * @param html - raw markup used to build the purified fallback document
 * @param document - already-parsed document to try first
 * @param isNewsletter - forwarded as `keepTables` and `ignoreLinkDensity`
 * @returns the first non-empty parse result, or null when both attempts fail
 */
const getReadabilityResult = async (
  url: string,
  html: string,
  document?: Document,
  isNewsletter?: boolean
): Promise<Readability.ParseResult | null> => {
  // One Readability attempt; parse errors are logged and reported as null.
  const attempt = async (
    candidate: Document
  ): Promise<Readability.ParseResult | null> => {
    try {
      const article = await new Readability(candidate, {
        debug: DEBUG_MODE,
        createImageProxyUrl,
        keepTables: isNewsletter,
        ignoreLinkDensity: isNewsletter,
        url,
      }).parse()
      return article || null
    } catch (error) {
      logger.info('parsing error for url', { url, error })
      return null
    }
  }

  // First attempt to read the article as is.
  if (document) {
    const asIs = await attempt(document)
    if (asIs) {
      return asIs
    }
  }

  // That failed (or there was nothing to read): purify, then read again.
  return attempt(getPurifiedContent(html))
}
|
|
|
|
/**
 * Replaces every code block under `root` with a `<code>` element holding the
 * highlight.js rendering of its text. Blocks without text are removed.
 */
const highlightCodeBlocks = (
  root: HTMLElement,
  ownerDocument: Document
): void => {
  const codeBlocks = root.querySelectorAll(
    'pre[class^="prism-"], pre[class^="language-"], code'
  )
  codeBlocks.forEach((e) => {
    if (!e.textContent) {
      e.parentNode?.removeChild(e)
      return
    }

    // replace <br> or <p> or </p> with \n
    e.innerHTML = e.innerHTML.replace(/<(br|p|\/p)>/g, '\n')

    const att = hljs.highlightAuto(e.textContent)

    // highlightAuto leaves `language` unset when it detects nothing, so only
    // emit language-* classes for languages that were actually detected
    // (avoids a bogus "language-undefined" class)
    const classNames = ['hljs']
    if (att.language) {
      classNames.push(`language-${att.language}`)
    }
    if (att.second_best?.language) {
      classNames.push(`language-${att.second_best.language}`)
    }

    const code = ownerDocument.createElement('code')
    code.setAttribute('class', classNames.join(' '))
    code.innerHTML = att.value
    e.replaceWith(code)
  })
}

/**
 * Numbers the element descendants of `root` in document order through the
 * `data-omnivore-anchor-idx` attribute. Numbering starts at 1 (0 is reserved
 * for "anchor unknown"); `root` itself is not numbered, and elements carrying
 * a blocked attribute are skipped together with their subtree.
 */
const assignAnchorIndexes = (root: HTMLElement): void => {
  const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [
    'omnivore-highlight-id',
    'data-twitter-tweet-id',
    'data-instagram-id',
  ]

  const nodesToVisitStack: HTMLElement[] = [root]
  const visitedNodeList: HTMLElement[] = []

  while (nodesToVisitStack.length > 0) {
    const currentNode = nodesToVisitStack.pop()
    if (
      currentNode?.nodeType !== 1 ||
      // Avoiding dynamic elements from being counted as anchor-allowed elements
      ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES.some((attrib) =>
        currentNode.hasAttribute(attrib)
      )
    ) {
      continue
    }
    visitedNodeList.push(currentNode)

    // push children last-to-first so they are popped in document order;
    // non-element children are discarded by the nodeType check above
    for (const child of Array.from(currentNode.childNodes).reverse()) {
      nodesToVisitStack.push(child as HTMLElement)
    }
  }

  // the first visited node is the root itself, which gets no index
  visitedNodeList.shift()
  visitedNodeList.forEach((node, index) => {
    // start from index 1, index 0 reserved for anchor unknown.
    node.setAttribute('data-omnivore-anchor-idx', (index + 1).toString())
  })
}

/**
 * Parses a prepared document into readable article content.
 *
 * The document is classified via og:type, pre-processed by the content
 * handlers and run through Readability. On success the article's code blocks
 * are syntax highlighted, embedded highlight data is extracted, elements are
 * given anchor indexes and the final HTML is sanitized with DOMPurify.
 *
 * @param url - address of the page being parsed
 * @param preparedDocument - page HTML plus the page info reported with it
 * @param isNewsletter - forwarded to Readability to relax its heuristics
 * @param allowRetry - when true, an empty parse is retried once with the
 * content wrapped in `<html><body>`
 * @returns the parse result; `parsedContent` is null when there is no
 * document, the content type is not allowed, or parsing failed. Errors are
 * logged rather than thrown.
 */
export const parsePreparedContent = async (
  url: string,
  preparedDocument: PreparedDocumentInput,
  isNewsletter?: boolean,
  allowRetry = true
): Promise<ParsedContentPuppeteer> => {
  const logRecord: ArticleParseLogRecord = {
    url: url,
    labels: { source: 'parsePreparedContent' },
  }

  const { document: domContent, pageInfo } = preparedDocument
  if (!domContent) {
    logger.info('No document')
    return {
      canonicalUrl: url,
      parsedContent: null,
      domContent: '',
      pageType: PageType.Unknown,
    }
  }

  // Checking for content type acceptance or if there are no contentType
  // at all (backward extension versions compatibility)
  if (
    pageInfo.contentType &&
    !ALLOWED_CONTENT_TYPES.includes(pageInfo.contentType)
  ) {
    logger.info(`Not allowed content type: ${pageInfo.contentType}`)
    return {
      canonicalUrl: url,
      parsedContent: null,
      domContent,
      pageType: PageType.Unknown,
    }
  }

  const { title: pageInfoTitle, canonicalUrl } = pageInfo

  let parsedContent: Readability.ParseResult | null = null
  let pageType = PageType.Unknown
  let highlightData = undefined

  try {
    const document = parseHTML(domContent).document
    pageType = parseOriginalContent(document)

    // Run readability
    await preParseContent(url, document)

    parsedContent = await getReadabilityResult(
      url,
      domContent,
      document,
      isNewsletter
    )

    if (!parsedContent || !parsedContent.content) {
      logger.info('No parsed content')

      if (allowRetry) {
        logger.info('Retrying with content wrapped in html body')

        const newDocument = {
          ...preparedDocument,
          document: '<html><body>' + domContent + '</body></html>', // wrap in body
        }
        return parsePreparedContent(url, newDocument, isNewsletter, false)
      }

      return {
        canonicalUrl,
        parsedContent,
        domContent,
        pageType,
      }
    }

    // use title if not found after running readability
    if (!parsedContent.title && pageInfoTitle) {
      parsedContent.title = pageInfoTitle
    }

    const newDocumentElement = parsedContent.documentElement

    // Format code blocks
    // TODO: we probably want to move this type of thing
    // to the handlers, and have some concept of postHandle
    highlightCodeBlocks(newDocumentElement, document)

    highlightData = findEmbeddedHighlight(newDocumentElement)

    assignAnchorIndexes(newDocumentElement)

    // sanitize the post-processed article before handing it back
    const newHtml = newDocumentElement.outerHTML
    const newWindow = parseHTML('')
    const DOMPurify = createDOMPurify(newWindow)
    DOMPurify.addHook('uponSanitizeElement', domPurifySanitizeHook)
    const cleanHtml = DOMPurify.sanitize(newHtml, DOM_PURIFY_CONFIG)
    parsedContent.content = cleanHtml

    logRecord.parseSuccess = true
  } catch (error) {
    logger.error('Error parsing content', error)

    Object.assign(logRecord, {
      parseSuccess: false,
      parseError: error,
    })
  }

  logger.info('parse-article completed', logRecord)

  return {
    canonicalUrl,
    parsedContent,
    domContent,
    pageType,
    highlightData,
  }
}
|
|
|
|
/**
 * Follows the page's oEmbed JSON link, if it has one, and extracts article
 * metadata from the response.
 *
 * Example article: https://thoughtsofstone.com/the-great-feminization/
 *
 * JSONLD Link example: https://thoughtsofstone.com/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fthoughtsofstone.com%2Fthe-great-feminization%2F
 * @param document - document of the content to read the link from
 * @returns byline, preview image, site name and title taken from the linked
 * JSON; an empty object when the page has no such link, and whatever was
 * collected so far when fetching or decoding fails
 */
const getJSONLdLinkMetadata = async (
  document: Document
): Promise<Partial<Readability.ParseResult>> => {
  const result: Partial<Readability.ParseResult> = {}

  try {
    const href = document.querySelector<HTMLLinkElement>(
      "link[type='application/json+oembed']"
    )?.href
    if (!href) {
      return result
    }

    const response = await axios.get(href, { timeout: 5000 })
    const payload = response.data || {}

    // values may contain HTML entities, so decode each one
    result.byline = decode(payload['author_name'])
    result.previewImage = decode(payload['thumbnail_url'])
    result.siteName = decode(payload['provider_name'])
    result.title = decode(payload['title'])
  } catch (error) {
    logger.error('Unable to get JSONLD link of the article')
  }

  return result
}
|
|
|
|
/** Page metadata extracted from a document's head by `parsePageMetadata`. */
type Metadata = {
  /** og:title content, when present. */
  title?: string
  /** Content of the `author` meta tag, when present. */
  author?: string
  /** og:description content, or '' when absent. */
  description: string
  /** og:image content, or '' when absent. */
  previewImage: string
}
|
|
|
|
/**
 * Extracts Open Graph and author metadata from an HTML page.
 *
 * @param html - markup of the page
 * @returns the collected metadata (`description` and `previewImage` default
 * to '', `title` and `author` to undefined), or undefined when the page
 * cannot be parsed
 */
export const parsePageMetadata = (html: string): Metadata | undefined => {
  try {
    const { document } = parseHTML(html)

    // content of the first meta tag matching `selector`, if it has any
    const metaContent = (selector: string): string | undefined =>
      document.querySelector(selector)?.getAttribute('content') || undefined

    // get open graph metadata
    const description =
      metaContent("head meta[property='og:description']") || ''
    const previewImage = metaContent("head meta[property='og:image']") || ''
    const title = metaContent("head meta[property='og:title']")
    const author = metaContent("head meta[name='author']")

    // TODO: we should be able to apply the JSONLD metadata
    // here too

    return { title, author, description, previewImage }
  } catch (e) {
    logger.info('failed to parse page:', e)
    return undefined
  }
}
|
|
|
|
/**
 * Downloads a page and extracts its metadata with `parsePageMetadata`.
 *
 * @param url - address of the page to fetch
 * @returns the page metadata, or undefined when the request fails, times out
 * or the page cannot be parsed
 */
export const parseUrlMetadata = async (
  url: string
): Promise<Metadata | undefined> => {
  try {
    // axios applies no timeout by default, so cap the request at 5 seconds
    // (like every other outbound call in this module) rather than letting an
    // unresponsive host block the caller indefinitely
    const res = await axios.get(url, { timeout: 5000 })
    return parsePageMetadata(res.data)
  } catch (error) {
    if (axios.isAxiosError(error)) {
      logger.error(error.response)
    } else {
      logger.error(error)
    }
    return undefined
  }
}
|
|
|
|
/**
 * Decides whether an incoming email should be treated as an article.
 *
 * @param email - sender address, matched case-insensitively against active users
 * @param subject - subject line of the email
 * @returns true when the sender is an active user or the subject contains
 * the article prefix
 */
export const isProbablyArticle = async (
  email: string,
  subject: string
): Promise<boolean> => {
  // NOTE(review): ILike is a pattern match, so `_` or `%` inside the address
  // act as wildcards — confirm this is acceptable.
  const activeSender = await userRepository.findOneBy({
    email: ILike(email),
    status: StatusType.Active,
  })
  if (activeSender) {
    return true
  }

  return subject.includes(ARTICLE_PREFIX)
}
|
|
|
|
/** Builds a placeholder URL: the fake-URL prefix followed by a fresh v4 UUID. */
export const generateUniqueUrl = () => `${FAKE_URL_PREFIX}${uuid()}`
|
|
|
|
/**
 * Derives an article title from an email subject by dropping the first
 * occurrence of the article prefix and trimming surrounding whitespace.
 */
export const getTitleFromEmailSubject = (subject: string) =>
  subject.replace(ARTICLE_PREFIX, '').trim()
|
|
|
|
/**
 * Splits a `From` header value into a display name and an address,
 * e.g. 'Jackson Harper from Omnivore App <jacksonh@substack.com>'
 * or 'Mike Allen <mike@axios.com>'.
 *
 * @param from - raw header value
 * @returns the first address found by addressparser; when none is found,
 * an entry with an empty name and `from` itself as the address
 */
export const parseEmailAddress = (from: string): addressparser.EmailAddress => {
  const [firstAddress] = addressparser(from)
  return firstAddress ?? { name: '', address: from }
}
|
|
|
|
/**
 * Builds a favicon URL for the site that `url` resolves to.
 *
 * @param url - address of the page
 * @returns a faviconkit URL for the final (post-redirect) hostname, or
 * undefined for fake URLs and whenever the lookup fails
 */
export const fetchFavicon = async (
  url: string
): Promise<string | undefined> => {
  // don't fetch favicon for fake urls
  if (url.startsWith(FAKE_URL_PREFIX)) {
    return undefined
  }

  try {
    // a HEAD request is enough to learn where redirects end up
    const { request } = await axios.head(url, { timeout: 5000 })
    const { hostname } = new URL(request.res.responseUrl)
    return `https://api.faviconkit.com/${hostname}/128`
  } catch (e) {
    logger.info('failed to get favicon', axios.isAxiosError(e) ? e.response : e)
    return undefined
  }
}
|
|
|
|
// Custom translators that surround highlighted content (elements carrying
// the highlight id attribute) with the markdown highlight marker `==`.
export const highlightTranslators: TranslatorConfigObject = {
  /* Link */
  a: ({ node, options, visitor }) => {
    const href = node.getAttribute('href')
    if (!href) return {}

    // Percent-encode the symbols that can cause problems in markdown:
    // ( → %28, ) → %29, _ → %5F, * → %2A
    const encodedHref = href.replace(
      /[()_*]/g,
      (chr) => `%${chr.charCodeAt(0).toString(16).toUpperCase()}`
    )

    // A link counts as highlighted when any direct child element carries a
    // highlight id
    let hasHighlight = false
    node.childNodes.forEach((child) => {
      hasHighlight =
        hasHighlight ||
        (child.nodeType === 1 &&
          !!(child as ElementNode).getAttribute(highlightIdAttribute))
    })

    // Inline link, when possible
    // See: https://github.com/crosstype/node-html-markdown/issues/17
    if (node.textContent === href && options.useInlineLinks) {
      const marker = hasHighlight ? '==' : undefined
      return {
        prefix: marker,
        postfix: marker,
        content: `<${encodedHref}>`,
      }
    }

    const title = node.getAttribute('title')
    const highlightMark = hasHighlight ? '==' : ''
    const target = options.useLinkReferenceDefinitions
      ? `[${visitor.addOrGetUrlDefinition(encodedHref)}]`
      : `(${encodedHref}${title ? ` "${title}"` : ''})`

    return {
      // link text must stay on a single line
      postprocess: ({ content }) => content.replace(/(?:\r?\n)+/g, ' '),
      childTranslators: visitor.instance.aTagTranslators,
      prefix: `${highlightMark}[`,
      postfix: `]${target}${highlightMark}`,
    }
  },

  span: ({ node }) => {
    if (!node.getAttribute(highlightIdAttribute)) return {}

    const inner = node.innerHTML
    // keep a single leading/trailing space outside the markers, then strip
    // the surrounding whitespace from the highlighted content itself
    return {
      prefix: inner.startsWith(' ') ? ' ==' : '==',
      postfix: inner.endsWith(' ') ? '== ' : '==',
      content: inner.trim(),
    }
  },
}
|
|
|
|
/* ********************************************************* *
 * Shared HTML → Markdown translator.
 * A single instance is created once and re-used by every
 * converter below, which saves time over building one per call.
 * ********************************************************* */
const nhm = new NodeHtmlMarkdown(
  {}, // options: library defaults
  highlightTranslators, // custom translators: `==` highlight markers
  undefined // custom code block translators: none
)
|
|
|
|
/** Converts article HTML to another text format, optionally using highlights. */
type contentConverterFunc = (html: string, highlights?: Highlight[]) => string
|
|
|
|
/**
 * Looks up the converter for an article format.
 *
 * @param format - requested output format
 * @returns the matching converter, or undefined when the format has none
 */
export const contentConverter = (
  format: string
): contentConverterFunc | undefined => {
  if (format === ArticleFormat.Markdown) {
    return htmlToMarkdown
  }
  if (format === ArticleFormat.HighlightedMarkdown) {
    return htmlToHighlightedMarkdown
  }
  return undefined
}
|
|
|
|
/**
 * Converts HTML to markdown, marking the given highlights in the output.
 *
 * Falls back to a plain conversion when there are no highlights, the HTML
 * cannot be parsed, or no article text nodes are found. A highlight that
 * cannot be applied is logged and skipped.
 *
 * @param html - article markup
 * @param highlights - highlights to mark; only entries of type 'HIGHLIGHT'
 * that have a patch are applied
 * @returns the markdown rendering
 */
export const htmlToHighlightedMarkdown = (
  html: string,
  highlights?: Highlight[]
): string => {
  if (!highlights?.length) {
    return nhm.translate(html)
  }

  let document: Document
  try {
    document = parseHTML(html).document
    if (!document || !document.documentElement) {
      // the html is invalid
      throw new Error('Invalid html content')
    }
  } catch (err) {
    logger.info(err)
    return nhm.translate(html)
  }

  const articleTextNodes = getArticleTextNodes(document)
  if (!articleTextNodes) {
    return nhm.translate(html)
  }

  // wrap highlights in special tags
  const applicable = highlights.filter(
    (h) => h.highlightType == 'HIGHLIGHT' && h.patch
  )
  for (const highlight of applicable) {
    try {
      makeHighlightNodeAttributes(
        highlight.id,
        highlight.patch as string,
        articleTextNodes
      )
    } catch (err) {
      logger.info(err)
    }
  }

  // translate the document now that the highlights are marked in it
  return nhm.translate(document.documentElement.outerHTML)
}
|
|
|
|
/** Converts HTML to markdown with the shared translator. */
export const htmlToMarkdown = (html: string) => nhm.translate(html)
|
|
|
|
/**
 * Passes `markdown` through the shared translator.
 *
 * NOTE(review): this is the same `translate` call used for HTML → markdown,
 * so it does not appear to render markdown as HTML — confirm the intent.
 */
export const markdownToHtml = (markdown: string) => nhm.translate(markdown)
|
|
|
|
/**
 * Sends HTML to the distiller service and returns its response body.
 *
 * @param uid - user id placed in the signed authorization token
 * @param html - markup posted to the service
 * @returns the response data, or undefined when DISTILLER_URL is not set
 * or the request fails
 */
export const getDistillerResult = async (
  uid: string,
  html: string
): Promise<string | undefined> => {
  try {
    const distillerUrl = process.env.DISTILLER_URL
    if (!distillerUrl) {
      logger.info('No distiller url')
      return undefined
    }

    // token expires one hour from now (in seconds since the epoch)
    const expiresAt = Math.floor(Date.now() / 1000) + 60 * 60
    const auth = (await signToken(
      { uid, exp: expiresAt },
      env.server.jwtSecret
    )) as string

    logger.info(`Parsing by distiller: ${distillerUrl}`)
    const { data } = await axios.post<string>(distillerUrl, html, {
      headers: {
        Authorization: auth,
      },
      timeout: 5000,
    })
    return data
  } catch (error) {
    logger.error(axios.isAxiosError(error) ? error.response : error)
    return undefined
  }
}
|
|
|
|
/**
 * Downloads a page with the shared HTML client.
 *
 * @returns the response body, or null when the request fails
 */
const fetchHtml = async (url: string): Promise<string | null> => {
  try {
    const { data } = await axiosInstance.get(url)
    return data as string
  } catch (error) {
    logger.error('Error fetching html', error)
    return null
  }
}
|
|
|
|
/**
 * Extracts the feeds listed in an OPML document.
 *
 * Every `outline` element that has an `xmlUrl` attribute becomes one feed;
 * outlines without it (folders) are skipped and duplicate URLs are reported
 * once. The title falls back to the outline's `text` attribute and the type
 * falls back to 'rss'.
 *
 * @param opml - OPML document as a string
 * @returns the feeds in document order (possibly empty), or undefined when
 * the document cannot be parsed
 */
export const parseOpml = (opml: string): Feed[] | undefined => {
  const xmlParser = parser(true, { lowercase: true })
  const feeds: Feed[] = []
  const seenUrls = new Set<string>()

  xmlParser.onopentag = function (node) {
    if (node.name !== 'outline') {
      return
    }

    // folders also are outlines, make sure an xmlUrl is available
    const feedUrl = node.attributes.xmlUrl?.toString()
    if (!feedUrl || seenUrls.has(feedUrl)) {
      return
    }
    seenUrls.add(feedUrl)

    // OPML requires `text` on every outline while `title` is optional
    const title = node.attributes.title || node.attributes.text
    feeds.push({
      title: title?.toString() || '',
      url: feedUrl,
      type: node.attributes.type?.toString() || 'rss',
    })
  }

  try {
    // the parser invokes onopentag while write() runs, so `feeds` is
    // complete once close() returns
    xmlParser.write(opml).close()
    return feeds
  } catch (error) {
    logger.error('Error parsing opml', error)
    return undefined
  }
}
|
|
|
|
/**
 * Discovers the RSS feeds advertised by a web page.
 *
 * @param url - address of the page to fetch
 * @returns one feed per `link[type="application/rss+xml"]` element that has
 * an href, or undefined when the page cannot be fetched or parsed
 */
export const parseHtml = async (url: string): Promise<Feed[] | undefined> => {
  // fetch HTML and parse feeds
  const html = await fetchHtml(url)
  if (!html) {
    return undefined
  }

  try {
    const { document } = parseHTML(html)
    const rssLinks = document.querySelectorAll(
      'link[type="application/rss+xml"]'
    )

    const feeds: Feed[] = []
    Array.from(rssLinks).forEach((link) => {
      const href = link.getAttribute('href')
      if (!href) {
        // a link without an address is of no use
        return
      }
      feeds.push({
        url: href,
        title: link.getAttribute('title') || '',
        type: 'rss',
      })
    })

    return feeds
  } catch (error) {
    logger.error('Error parsing html', error)
    return undefined
  }
}
|
|
|
|
/**
 * Builds a feed description for a URL.
 *
 * Telegram channel URLs are described from the Open Graph tags of the
 * channel page; everything else is parsed as an RSS feed.
 *
 * @param url - address of the feed or Telegram channel
 * @param content - already-downloaded body; when absent it is fetched
 * @returns the feed description, or null when fetching or parsing fails
 */
export const parseFeed = async (
  url: string,
  content?: string | null
): Promise<Feed | null> => {
  try {
    // check if url is a telegram channel
    if (/https:\/\/t\.me\/([a-zA-Z0-9_]+)/.test(url)) {
      // fetch the channel page unless the caller already supplied it
      const html = content || (await fetchHtml(url))
      if (!html) {
        return null
      }

      const { document } = parseHTML(html)
      const ogContent = (property: string) =>
        document
          .querySelector(`meta[property="${property}"]`)
          ?.getAttribute('content')

      return {
        title: ogContent('og:title') || url,
        url,
        type: 'telegram',
        thumbnail: ogContent('og:image') || '',
        description: ogContent('og:description') || '',
      }
    }

    const rssParser = new Parser(RSS_PARSER_CONFIG)
    const feed = await (content
      ? rssParser.parseString(content)
      : rssParser.parseURL(url))

    // prefer the address the feed declares for itself
    const feedUrl = feed.feedUrl || url

    return {
      title: feed.title || feedUrl,
      url: feedUrl,
      thumbnail: feed.image?.url,
      type: 'rss',
      description: feed.description,
    }
  } catch (error) {
    logger.error('Error parsing feed', error)
    return null
  }
}
|