diff --git a/packages/api/package.json b/packages/api/package.json index d4801f5ed..3418c2962 100644 --- a/packages/api/package.json +++ b/packages/api/package.json @@ -83,6 +83,7 @@ "private-ip": "^2.3.3", "rss-parser": "^3.13.0", "sanitize-html": "^2.3.2", + "sax": "^1.3.0", "search-query-parser": "^1.6.0", "snake-case": "^3.0.3", "supertest": "^6.2.2", @@ -122,6 +123,7 @@ "@types/oauth": "^0.9.1", "@types/private-ip": "^1.0.0", "@types/sanitize-html": "^1.27.1", + "@types/sax": "^1.2.7", "@types/sinon": "^10.0.13", "@types/sinon-chai": "^3.2.8", "@types/supertest": "^2.0.11", diff --git a/packages/api/src/generated/graphql.ts b/packages/api/src/generated/graphql.ts index fe6a395b8..49eacf09a 100644 --- a/packages/api/src/generated/graphql.ts +++ b/packages/api/src/generated/graphql.ts @@ -729,13 +729,14 @@ export type Feature = { export type Feed = { __typename?: 'Feed'; author?: Maybe; - createdAt: Scalars['Date']; + createdAt?: Maybe; description?: Maybe; - id: Scalars['ID']; + id?: Maybe; image?: Maybe; publishedAt?: Maybe; title: Scalars['String']; - updatedAt: Scalars['Date']; + type?: Maybe; + updatedAt?: Maybe; url: Scalars['String']; }; @@ -1807,6 +1808,7 @@ export type Query = { recentEmails: RecentEmailsResult; recentSearches: RecentSearchesResult; rules: RulesResult; + scanFeeds: ScanFeedsResult; search: SearchResult; sendInstallInstructions: SendInstallInstructionsResult; subscriptions: SubscriptionsResult; @@ -1843,6 +1845,11 @@ export type QueryRulesArgs = { }; +export type QueryScanFeedsArgs = { + input: ScanFeedsInput; +}; + + export type QuerySearchArgs = { after?: InputMaybe; first?: InputMaybe; @@ -2272,6 +2279,33 @@ export type SaveUrlInput = { url: Scalars['String']; }; +export type ScanFeedsError = { + __typename?: 'ScanFeedsError'; + errorCodes: Array; +}; + +export enum ScanFeedsErrorCode { + BadRequest = 'BAD_REQUEST' +} + +export type ScanFeedsInput = { + opml?: InputMaybe; + type: ScanFeedsType; + url?: InputMaybe; +}; + +export type 
ScanFeedsResult = ScanFeedsError | ScanFeedsSuccess; + +export type ScanFeedsSuccess = { + __typename?: 'ScanFeedsSuccess'; + feeds: Array; +}; + +export enum ScanFeedsType { + Html = 'HTML', + Opml = 'OPML' +} + export type SearchError = { __typename?: 'SearchError'; errorCodes: Array; @@ -3739,6 +3773,12 @@ export type ResolversTypes = { SaveResult: ResolversTypes['SaveError'] | ResolversTypes['SaveSuccess']; SaveSuccess: ResolverTypeWrapper; SaveUrlInput: SaveUrlInput; + ScanFeedsError: ResolverTypeWrapper; + ScanFeedsErrorCode: ScanFeedsErrorCode; + ScanFeedsInput: ScanFeedsInput; + ScanFeedsResult: ResolversTypes['ScanFeedsError'] | ResolversTypes['ScanFeedsSuccess']; + ScanFeedsSuccess: ResolverTypeWrapper; + ScanFeedsType: ScanFeedsType; SearchError: ResolverTypeWrapper; SearchErrorCode: SearchErrorCode; SearchItem: ResolverTypeWrapper; @@ -4190,6 +4230,10 @@ export type ResolversParentTypes = { SaveResult: ResolversParentTypes['SaveError'] | ResolversParentTypes['SaveSuccess']; SaveSuccess: SaveSuccess; SaveUrlInput: SaveUrlInput; + ScanFeedsError: ScanFeedsError; + ScanFeedsInput: ScanFeedsInput; + ScanFeedsResult: ResolversParentTypes['ScanFeedsError'] | ResolversParentTypes['ScanFeedsSuccess']; + ScanFeedsSuccess: ScanFeedsSuccess; SearchError: SearchError; SearchItem: SearchItem; SearchItemEdge: SearchItemEdge; @@ -4845,13 +4889,14 @@ export type FeatureResolvers = { author?: Resolver, ParentType, ContextType>; - createdAt?: Resolver; + createdAt?: Resolver, ParentType, ContextType>; description?: Resolver, ParentType, ContextType>; - id?: Resolver; + id?: Resolver, ParentType, ContextType>; image?: Resolver, ParentType, ContextType>; publishedAt?: Resolver, ParentType, ContextType>; title?: Resolver; - updatedAt?: Resolver; + type?: Resolver, ParentType, ContextType>; + updatedAt?: Resolver, ParentType, ContextType>; url?: Resolver; __isTypeOf?: IsTypeOfResolverFn; }; @@ -5436,6 +5481,7 @@ export type QueryResolvers; recentSearches?: Resolver; rules?: 
Resolver>; + scanFeeds?: Resolver>; search?: Resolver>; sendInstallInstructions?: Resolver; subscriptions?: Resolver>; @@ -5689,6 +5735,20 @@ export type SaveSuccessResolvers; }; +export type ScanFeedsErrorResolvers = { + errorCodes?: Resolver, ParentType, ContextType>; + __isTypeOf?: IsTypeOfResolverFn; +}; + +export type ScanFeedsResultResolvers = { + __resolveType: TypeResolveFn<'ScanFeedsError' | 'ScanFeedsSuccess', ParentType, ContextType>; +}; + +export type ScanFeedsSuccessResolvers = { + feeds?: Resolver, ParentType, ContextType>; + __isTypeOf?: IsTypeOfResolverFn; +}; + export type SearchErrorResolvers = { errorCodes?: Resolver, ParentType, ContextType>; __isTypeOf?: IsTypeOfResolverFn; @@ -6581,6 +6641,9 @@ export type Resolvers = { SaveFilterSuccess?: SaveFilterSuccessResolvers; SaveResult?: SaveResultResolvers; SaveSuccess?: SaveSuccessResolvers; + ScanFeedsError?: ScanFeedsErrorResolvers; + ScanFeedsResult?: ScanFeedsResultResolvers; + ScanFeedsSuccess?: ScanFeedsSuccessResolvers; SearchError?: SearchErrorResolvers; SearchItem?: SearchItemResolvers; SearchItemEdge?: SearchItemEdgeResolvers; diff --git a/packages/api/src/generated/schema.graphql b/packages/api/src/generated/schema.graphql index 2903d85fb..537a7e696 100644 --- a/packages/api/src/generated/schema.graphql +++ b/packages/api/src/generated/schema.graphql @@ -644,13 +644,14 @@ type Feature { type Feed { author: String - createdAt: Date! + createdAt: Date description: String - id: ID! + id: ID image: String publishedAt: Date title: String! - updatedAt: Date! + type: String + updatedAt: Date url: String! } @@ -1364,6 +1365,7 @@ type Query { recentEmails: RecentEmailsResult! recentSearches: RecentSearchesResult! rules(enabled: Boolean): RulesResult! + scanFeeds(input: ScanFeedsInput!): ScanFeedsResult! search(after: String, first: Int, format: String, includeContent: Boolean, query: String): SearchResult! sendInstallInstructions: SendInstallInstructionsResult! 
/**
 * Mutation resolver that moves a library item into a different folder.
 *
 * Flow, in order:
 *  1. records a `move_to_folder` analytics event;
 *  2. loads the item (with its `user` relation) through `authTrx`;
 *  3. answers `Unauthorized` when no item is found and `AlreadyExists` when
 *     the item already lives in the requested folder;
 *  4. if the item has readable content, updates its folder and `savedAt`;
 *     otherwise queues a high-priority page save request for its original URL.
 *
 * Both success paths return the result as an article saving request.
 */
export const moveToFolderResolver = authorized<
  MoveToFolderSuccess,
  MoveToFolderError,
  MutationMoveToFolderArgs
>(async (_, { id, folder }, { authTrx, pubsub, uid }) => {
  analytics.track({
    userId: uid,
    event: 'move_to_folder',
    properties: { id, folder },
  })

  const libraryItem = await authTrx((trx) => {
    const repo = trx.getRepository(LibraryItem)
    return repo.findOne({ where: { id }, relations: ['user'] })
  })

  if (!libraryItem) {
    return { errorCodes: [MoveToFolderErrorCode.Unauthorized] }
  }
  if (libraryItem.folder === folder) {
    return { errorCodes: [MoveToFolderErrorCode.AlreadyExists] }
  }

  // the same timestamp is used whichever path is taken below
  const movedAt = new Date()

  if (libraryItem.readableContent) {
    // content is already available: only the folder and savedAt change
    const moved = await updateLibraryItem(
      libraryItem.id,
      { folder, savedAt: movedAt },
      uid,
      pubsub
    )

    return {
      __typename: 'MoveToFolderSuccess',
      articleSavingRequest: libraryItemToArticleSavingRequest(
        moved.user,
        moved
      ),
    }
  }

  // content has not been fetched yet, so ask for the page to be saved
  // NOTE(review): `folder` is not forwarded to createPageSaveRequest here;
  // confirm the target folder is applied somewhere on this path.
  const articleSavingRequest = await createPageSaveRequest({
    userId: uid,
    url: libraryItem.originalUrl,
    articleSavingRequestId: id,
    priority: 'high',
    publishedAt: libraryItem.publishedAt || undefined,
    savedAt: movedAt,
    pubsub,
  })

  return {
    __typename: 'MoveToFolderSuccess',
    articleSavingRequest,
  }
})
Number(startCursor) : 0 - const first = Math.min(input.first || 10, 100) // cap at 100 - - const { feeds, count } = await feedRepository.searchFeeds( - input.query || '', - first + 1, // fetch one extra to check if there is a next page - start, - input.sort?.by, - input.sort?.order || undefined - ) - - const hasNextPage = feeds.length > first - const endCursor = String(start + feeds.length - (hasNextPage ? 1 : 0)) - - if (hasNextPage) { - // remove an extra if exists - feeds.pop() - } - - const edges: FeedEdge[] = feeds.map((feed) => ({ - node: feed, - cursor: endCursor, - })) - - return { - __typename: 'FeedsSuccess', - edges, - pageInfo: { - hasPreviousPage: start > 0, - hasNextPage, - startCursor, - endCursor, - totalCount: count, - }, - } - } catch (error) { - log.error('Error fetching feeds', error) - - return { - errorCodes: [FeedsErrorCode.BadRequest], - } - } -}) - -export const moveToFolderResolver = authorized< - MoveToFolderSuccess, - MoveToFolderError, - MutationMoveToFolderArgs ->(async (_, { id, folder }, { authTrx, pubsub, uid }) => { - analytics.track({ - userId: uid, - event: 'move_to_folder', - properties: { - id, - folder, - }, - }) - - const item = await authTrx((tx) => - tx.getRepository(LibraryItem).findOne({ - where: { - id, - }, - relations: ['user'], - }) - ) - - if (!item) { - return { - errorCodes: [MoveToFolderErrorCode.Unauthorized], - } - } - - if (item.folder === folder) { - return { - errorCodes: [MoveToFolderErrorCode.AlreadyExists], - } - } - - const savedAt = new Date() - - // if the content is not fetched yet, create a page save request - if (!item.readableContent) { - const articleSavingRequest = await createPageSaveRequest({ - userId: uid, - url: item.originalUrl, - articleSavingRequestId: id, - priority: 'high', - publishedAt: item.publishedAt || undefined, - savedAt, - pubsub, - }) - - return { - __typename: 'MoveToFolderSuccess', - articleSavingRequest, - } - } - - const updatedItem = await updateLibraryItem( - item.id, - { 
- folder, - savedAt, - }, - uid, - pubsub - ) - - return { - __typename: 'MoveToFolderSuccess', - articleSavingRequest: libraryItemToArticleSavingRequest( - updatedItem.user, - updatedItem - ), - } -}) diff --git a/packages/api/src/resolvers/function_resolvers.ts b/packages/api/src/resolvers/function_resolvers.ts index ff363e78f..909aeb121 100644 --- a/packages/api/src/resolvers/function_resolvers.ts +++ b/packages/api/src/resolvers/function_resolvers.ts @@ -29,7 +29,6 @@ import { generateUploadFilePathName, } from '../utils/uploads' import { optInFeatureResolver } from './features' -import { feedsResolver, moveToFolderResolver } from './following' import { uploadImportFileResolver } from './importers/uploadImportFileResolver' import { addPopularReadResolver, @@ -53,6 +52,7 @@ import { deleteRuleResolver, deleteWebhookResolver, deviceTokensResolver, + feedsResolver, filtersResolver, generateApiKeyResolver, getAllUsersResolver, @@ -76,6 +76,7 @@ import { mergeHighlightResolver, moveFilterResolver, moveLabelResolver, + moveToFolderResolver, newsletterEmailsResolver, recommendHighlightsResolver, recommendResolver, @@ -88,6 +89,7 @@ import { saveFilterResolver, savePageResolver, saveUrlResolver, + scanFeedsResolver, searchResolver, sendInstallInstructionsResolver, setBookmarkArticleResolver, @@ -249,6 +251,7 @@ export const functionResolvers = { groups: groupsResolver, recentEmails: recentEmailsResolver, feeds: feedsResolver, + scanFeeds: scanFeedsResolver, }, User: { async intercomHash( diff --git a/packages/api/src/resolvers/subscriptions/index.ts b/packages/api/src/resolvers/subscriptions/index.ts index 0680339ff..98e0d9c79 100644 --- a/packages/api/src/resolvers/subscriptions/index.ts +++ b/packages/api/src/resolvers/subscriptions/index.ts @@ -1,12 +1,24 @@ +import axios from 'axios' +import { parseHTML } from 'linkedom' import Parser from 'rss-parser' import { Brackets } from 'typeorm' import { Subscription } from '../../entity/subscription' import { env } from 
'../../env' import { + FeedEdge, + FeedsError, + FeedsErrorCode, + FeedsSuccess, MutationSubscribeArgs, MutationUnsubscribeArgs, MutationUpdateSubscriptionArgs, + QueryFeedsArgs, + QueryScanFeedsArgs, QuerySubscriptionsArgs, + ScanFeedsError, + ScanFeedsErrorCode, + ScanFeedsSuccess, + ScanFeedsType, SortBy, SortOrder, SubscribeError, @@ -25,11 +37,13 @@ import { UpdateSubscriptionSuccess, } from '../../generated/graphql' import { getRepository } from '../../repository' +import { feedRepository } from '../../repository/feed' import { unsubscribe } from '../../services/subscriptions' import { Merge } from '../../util' import { analytics } from '../../utils/analytics' import { enqueueRssFeedFetch } from '../../utils/createTask' import { authorized } from '../../utils/helpers' +import { parseFeed, parseOpml } from '../../utils/parser' type PartialSubscription = Omit @@ -175,7 +189,7 @@ export const subscribeResolver = authorized< SubscribeSuccessPartial, SubscribeError, MutationSubscribeArgs ->(async (_, { input }, { authTrx, uid, log }) => { +>(async (_, { input }, { uid, log }) => { try { analytics.track({ userId: uid, @@ -224,7 +238,12 @@ export const subscribeResolver = authorized< // create new rss subscription const MAX_RSS_SUBSCRIPTIONS = 150 // validate rss feed - const feed = await parser.parseURL(input.url) + const feed = await parseFeed(input.url) + if (!feed) { + return { + errorCodes: [SubscribeErrorCode.NotFound], + } + } // limit number of rss subscriptions to 150 const results = (await getRepository(Subscription).query( @@ -235,11 +254,11 @@ export const subscribeResolver = authorized< returning *;`, [ feed.title, - feed.feedUrl, + feed.url, feed.description || null, SubscriptionType.Rss, uid, - feed.image?.url || null, + feed.thumbnail || null, input.autoAddToLibrary ?? null, input.isPrivate ?? 
/**
 * Query resolver that searches feeds with offset-based pagination.
 *
 * `input.after` is read as a numeric offset (missing or non-numeric values
 * start from 0) and the page size defaults to 10, capped at 100. One extra
 * row is requested so a next page can be detected without a second query.
 * Each edge carries its own cursor (the offset just past that edge), so the
 * last edge's cursor equals `pageInfo.endCursor`.
 *
 * Any error thrown while searching is logged and reported as BAD_REQUEST.
 */
export const feedsResolver = authorized<
  FeedsSuccess,
  FeedsError,
  QueryFeedsArgs
>(async (_, { input }, { log }) => {
  try {
    const startCursor = input.after || ''
    const start =
      startCursor && !isNaN(Number(startCursor)) ? Number(startCursor) : 0
    const first = Math.min(input.first || 10, 100) // cap at 100

    const { feeds, count } = await feedRepository.searchFeeds(
      input.query || '',
      first + 1, // fetch one extra to check if there is a next page
      start,
      input.sort?.by,
      input.sort?.order || undefined
    )

    const hasNextPage = feeds.length > first
    if (hasNextPage) {
      // drop the extra row that was only fetched for look-ahead
      feeds.pop()
    }

    // previously every edge was given the page's endCursor; give each edge
    // the offset that follows it so it can be used as `after` on its own
    const edges: FeedEdge[] = feeds.map((feed, index) => ({
      node: feed,
      cursor: String(start + index + 1),
    }))
    const endCursor = String(start + feeds.length)

    return {
      __typename: 'FeedsSuccess',
      edges,
      pageInfo: {
        hasPreviousPage: start > 0,
        hasNextPage,
        startCursor,
        endCursor,
        totalCount: count,
      },
    }
  } catch (error) {
    log.error('Error fetching feeds', error)

    return {
      errorCodes: [FeedsErrorCode.BadRequest],
    }
  }
})

// <link> elements used for RSS and Atom feed autodiscovery
const FEED_LINK_SELECTOR =
  'link[type="application/rss+xml"], link[type="application/atom+xml"]'

/**
 * Collects the feed links advertised by an HTML page. Relative `href`s are
 * resolved against `pageUrl`; links without an `href`, or with one that
 * cannot be resolved to a URL, are skipped.
 */
const extractFeedLinks = (
  html: string,
  pageUrl: URL
): { url: string; title: string; type: string }[] => {
  const dom = parseHTML(html).document
  const feeds: { url: string; title: string; type: string }[] = []

  for (const link of Array.from(dom.querySelectorAll(FEED_LINK_SELECTOR))) {
    const href = link.getAttribute('href')
    if (!href) {
      continue
    }

    try {
      feeds.push({
        url: new URL(href, pageUrl).toString(),
        title: link.getAttribute('title') || '',
        // Atom links are reported as 'rss' too, as before only one type existed
        type: 'rss',
      })
    } catch {
      // href could not be resolved to a URL, ignore this link
    }
  }

  return feeds
}

/**
 * Query resolver that discovers feeds from either an OPML document or the
 * HTML page at a URL, depending on `input.type`.
 *
 * - OPML: `input.opml` is required and parsed with `parseOpml`; feeds without
 *   a type are reported as 'rss'.
 * - HTML: `input.url` is required, must be an http(s) URL, and is fetched
 *   with a 5 second timeout; the page's feed autodiscovery links are returned
 *   as absolute URLs.
 *
 * Missing input, an unparsable OPML/URL, or a failed fetch yields BAD_REQUEST.
 */
export const scanFeedsResolver = authorized<
  ScanFeedsSuccess,
  ScanFeedsError,
  QueryScanFeedsArgs
>(async (_, { input: { type, opml, url } }, { log, uid }) => {
  analytics.track({
    userId: uid,
    event: 'scan_feeds',
    properties: {
      type,
    },
  })

  if (type === ScanFeedsType.Opml) {
    if (!opml) {
      return {
        errorCodes: [ScanFeedsErrorCode.BadRequest],
      }
    }

    // parse opml
    const feeds = parseOpml(opml)
    if (!feeds) {
      return {
        errorCodes: [ScanFeedsErrorCode.BadRequest],
      }
    }

    return {
      __typename: 'ScanFeedsSuccess',
      feeds: feeds.map((feed) => ({
        url: feed.url,
        title: feed.title,
        type: feed.type || 'rss',
      })),
    }
  }

  if (!url) {
    return {
      errorCodes: [ScanFeedsErrorCode.BadRequest],
    }
  }

  // the URL comes from the client: reject anything that is not http(s)
  // NOTE(review): private/internal hosts are not filtered here; consider
  // reusing the project's URL validation before fetching.
  let pageUrl: URL
  try {
    pageUrl = new URL(url)
  } catch {
    return {
      errorCodes: [ScanFeedsErrorCode.BadRequest],
    }
  }
  if (pageUrl.protocol !== 'http:' && pageUrl.protocol !== 'https:') {
    return {
      errorCodes: [ScanFeedsErrorCode.BadRequest],
    }
  }

  try {
    // fetch HTML and parse feeds
    const response = await axios.get(pageUrl.toString(), {
      timeout: 5000,
      headers: {
        'User-Agent': 'Mozilla/5.0',
        Accept: 'text/html',
      },
      // keep the body as a string even if the server answers with JSON
      responseType: 'text',
    })
    const html = response.data as string

    return {
      __typename: 'ScanFeedsSuccess',
      feeds: extractFeedLinks(html, pageUrl),
    }
  } catch (error) {
    log.error('Error scanning HTML', error)

    return {
      errorCodes: [ScanFeedsErrorCode.BadRequest],
    }
  }
})
/** A feed discovered by one of the parsers below. */
interface Feed {
  title: string
  url: string
  type: string
  thumbnail?: string
  description?: string
}

// HTTP client used to download pages as plain text (5 second timeout)
const axiosInstance = axios.create({
  timeout: 5000,
  headers: {
    'User-Agent': 'Mozilla/5.0',
    Accept: 'text/html',
  },
  responseType: 'text',
})

// <link> elements used for RSS and Atom feed autodiscovery
const FEED_LINK_SELECTOR =
  'link[type="application/rss+xml"], link[type="application/atom+xml"]'

/**
 * Downloads the body at `url` as text.
 *
 * @returns the response body, or `null` when the request fails (the error is
 * logged, not thrown).
 */
const fetchHtml = async (url: string): Promise<string | null> => {
  try {
    const response = await axiosInstance.get(url)
    return response.data as string
  } catch (error) {
    logger.error('Error fetching html', error)
    return null
  }
}

/**
 * Extracts the feeds listed in an OPML document.
 *
 * Every `<outline>` that carries a non-empty `xmlUrl` becomes one feed;
 * outlines without it (e.g. folders) are ignored and duplicate URLs are
 * reported once. The title falls back from `title` to `text` to an empty
 * string, and the type defaults to 'rss'.
 *
 * @returns the feeds in document order, or `undefined` when the document
 * cannot be parsed (the error is logged).
 */
export const parseOpml = (opml: string): Feed[] | undefined => {
  // NOTE(review): per the sax docs `lowercase` only applies in loose mode, so
  // in strict mode the `xmlUrl` attribute is matched with its original case
  const xmlParser = parser(true, { lowercase: true })
  const feeds: Feed[] = []
  const seenUrls = new Set<string>()

  xmlParser.onopentag = (node) => {
    if (node.name !== 'outline') {
      return
    }

    // attributes may be missing; never call methods on them unguarded,
    // otherwise a single folder outline aborts the whole parse
    const attr = (name: string): string => {
      const value = node.attributes[name] as
        | string
        | { value: string }
        | undefined
      if (value === undefined) {
        return ''
      }
      return typeof value === 'string' ? value : value.value
    }

    // folders also are outlines, make sure an xmlUrl is available
    const feedUrl = attr('xmlUrl')
    if (!feedUrl || seenUrls.has(feedUrl)) {
      return
    }

    seenUrls.add(feedUrl)
    feeds.push({
      title: attr('title') || attr('text') || '',
      url: feedUrl,
      type: attr('type') || 'rss',
    })
  }

  try {
    // sax parses synchronously, so `feeds` is complete once close() returns
    xmlParser.write(opml).close()
    return feeds
  } catch (error) {
    logger.error('Error parsing opml', error)
    return undefined
  }
}

/**
 * Fetches the page at `url` and returns the feeds it advertises through
 * RSS/Atom autodiscovery `<link>` elements. Relative `href`s are resolved
 * against `url`; links without a usable `href` are skipped.
 *
 * @returns the discovered feeds (possibly empty), or `undefined` when the
 * page cannot be fetched or parsed.
 */
export const parseHtml = async (url: string): Promise<Feed[] | undefined> => {
  // fetch HTML and parse feeds
  const html = await fetchHtml(url)
  if (!html) return undefined

  try {
    const dom = parseHTML(html).document
    const feeds: Feed[] = []

    for (const link of Array.from(dom.querySelectorAll(FEED_LINK_SELECTOR))) {
      const href = link.getAttribute('href')
      if (!href) {
        continue
      }

      let feedUrl: string
      try {
        feedUrl = new URL(href, url).toString()
      } catch {
        // href could not be resolved to a URL, ignore this link
        continue
      }

      feeds.push({
        url: feedUrl,
        title: link.getAttribute('title') || '',
        type: 'rss',
      })
    }

    return feeds
  } catch (error) {
    logger.error('Error parsing html', error)
    return undefined
  }
}

/**
 * Resolves the metadata of a single feed URL.
 *
 * URLs starting with `https://t.me/<name>` are treated as Telegram channels:
 * the page is fetched and its Open Graph title, image and description are
 * used. Every other URL is parsed as an RSS/Atom feed with rss-parser.
 *
 * @returns the feed description, or `null` when the URL cannot be fetched or
 * parsed (the error is logged).
 */
export const parseFeed = async (url: string): Promise<Feed | null> => {
  try {
    // check if url is a telegram channel; anchored so that a t.me link
    // embedded in another site's path or query string does not match
    const telegramRegex = /^https:\/\/t\.me\/([a-zA-Z0-9_]+)/
    const telegramMatch = url.match(telegramRegex)
    if (telegramMatch) {
      // fetch HTML and parse feeds
      const html = await fetchHtml(url)
      if (!html) return null

      const dom = parseHTML(html).document
      const title = dom.querySelector('meta[property="og:title"]')
      const thumbnail = dom.querySelector('meta[property="og:image"]')
      const description = dom.querySelector('meta[property="og:description"]')

      return {
        title: title?.getAttribute('content') || url,
        url,
        type: 'telegram',
        thumbnail: thumbnail?.getAttribute('content') || '',
        description: description?.getAttribute('content') || '',
      }
    }

    // named rssParser so it does not shadow the `parser` import from sax
    const rssParser = new Parser({
      timeout: 5000, // 5 seconds
      headers: {
        // some rss feeds require user agent
        'User-Agent':
          'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
        Accept:
          'application/rss+xml, application/rdf+xml;q=0.8, application/atom+xml;q=0.6, application/xml;q=0.4, text/xml;q=0.4',
      },
    })

    const feed = await rssParser.parseURL(url)
    // fall back to the requested URL when the feed does not name its own
    const feedUrl = feed.feedUrl || url

    return {
      title: feed.title || feedUrl,
      url: feedUrl,
      thumbnail: feed.image?.url,
      type: 'rss',
      description: feed.description,
    }
  } catch (error) {
    logger.error('Error parsing feed', error)
    return null
  }
}
channel + const telegramRegex = /https:\/\/t\.me\/([a-zA-Z0-9_]+)/ + const telegramMatch = url.match(telegramRegex) + if (telegramMatch) { + const dom = parseHTML(content).document + const title = dom.querySelector('meta[property="og:title"]') + // post has attribute data-post + const posts = dom.querySelectorAll('[data-post]') + const items = Array.from(posts) + .map((post) => { + const id = post.getAttribute('data-post') + if (!id) { + return null + } + + const url = `https://t.me/${telegramMatch[1]}/${id}` + // find the