diff --git a/packages/api/src/utils/parser.ts b/packages/api/src/utils/parser.ts index fc6d722cc..eb69c8a87 100644 --- a/packages/api/src/utils/parser.ts +++ b/packages/api/src/utils/parser.ts @@ -723,13 +723,13 @@ export const getDistillerResult = async ( } } -const fetchHtml = async (url: string): Promise => { +const fetchHtml = async (url: string): Promise => { try { const response = await axiosInstance.get(url) return response.data as string } catch (error) { logger.error('Error fetching html', error) - return undefined + return null } } @@ -788,7 +788,7 @@ export const parseHtml = async (url: string): Promise => { } } -export const parseFeed = async (url: string): Promise => { +export const parseFeed = async (url: string): Promise => { try { // check if url is a telegram channel const telegramRegex = /https:\/\/t\.me\/([a-zA-Z0-9_]+)/ @@ -796,7 +796,7 @@ export const parseFeed = async (url: string): Promise => { if (telegramMatch) { // fetch HTML and parse feeds const html = await fetchHtml(url) - if (!html) return undefined + if (!html) return null const dom = parseHTML(html).document const title = dom.querySelector('meta[property="og:title"]') @@ -835,6 +835,6 @@ export const parseFeed = async (url: string): Promise => { } } catch (error) { logger.error('Error parsing feed', error) - return undefined + return null } } diff --git a/packages/rss-handler/package.json b/packages/rss-handler/package.json index 8bb55def0..49452e22e 100644 --- a/packages/rss-handler/package.json +++ b/packages/rss-handler/package.json @@ -28,6 +28,7 @@ "axios": "^1.4.0", "dotenv": "^16.0.1", "jsonwebtoken": "^8.5.1", + "linkedom": "^0.16.4", "rss-parser": "^3.13.0" }, "volta": { diff --git a/packages/rss-handler/src/index.ts b/packages/rss-handler/src/index.ts index 75a621ee5..76384720f 100644 --- a/packages/rss-handler/src/index.ts +++ b/packages/rss-handler/src/index.ts @@ -3,6 +3,7 @@ import axios from 'axios' import crypto from 'crypto' import * as dotenv from 'dotenv' // see https://github.com/motdotla/dotenv#how-do-i-use-dotenv-with-import import * as jwt from 'jsonwebtoken' +import { parseHTML } from 'linkedom' import Parser, { Item } from 'rss-parser' import { promisify } from 'util' import { CONTENT_FETCH_URL, createCloudTask } from './task' @@ -87,6 +88,48 @@ export const fetchAndChecksum = async (url: string) => { } } +const parseFeed = async (url: string, content: string) => { + try { + // check if url is a telegram channel + const telegramRegex = /https:\/\/t\.me\/([a-zA-Z0-9_]+)/ + const telegramMatch = url.match(telegramRegex) + if (telegramMatch) { + const dom = parseHTML(content).document + const title = dom.querySelector('meta[property="og:title"]') + // post has attribute data-post + const posts = dom.querySelectorAll('[data-post]') + const items = Array.from(posts) + .map((post) => { + const id = post.getAttribute('data-post') + if (!id) { + return null + } + + const url = `https://t.me/${telegramMatch[1]}/${id}` + // find the