Merge pull request #3166 from omnivore-app/feature/scan-feeds-api

Add a scanFeeds API to discover feeds from an OPML file or an HTML page
Hongbo Wu authored 2023-11-28 11:27:36 +08:00; committed by GitHub
13 changed files with 600 additions and 167 deletions

View File

@@ -83,6 +83,7 @@
"private-ip": "^2.3.3",
"rss-parser": "^3.13.0",
"sanitize-html": "^2.3.2",
"sax": "^1.3.0",
"search-query-parser": "^1.6.0",
"snake-case": "^3.0.3",
"supertest": "^6.2.2",
@@ -122,6 +123,7 @@
"@types/oauth": "^0.9.1",
"@types/private-ip": "^1.0.0",
"@types/sanitize-html": "^1.27.1",
"@types/sax": "^1.2.7",
"@types/sinon": "^10.0.13",
"@types/sinon-chai": "^3.2.8",
"@types/supertest": "^2.0.11",

View File

@@ -729,13 +729,14 @@ export type Feature = {
export type Feed = {
__typename?: 'Feed';
author?: Maybe<Scalars['String']>;
createdAt: Scalars['Date'];
createdAt?: Maybe<Scalars['Date']>;
description?: Maybe<Scalars['String']>;
id: Scalars['ID'];
id?: Maybe<Scalars['ID']>;
image?: Maybe<Scalars['String']>;
publishedAt?: Maybe<Scalars['Date']>;
title: Scalars['String'];
updatedAt: Scalars['Date'];
type?: Maybe<Scalars['String']>;
updatedAt?: Maybe<Scalars['Date']>;
url: Scalars['String'];
};
@@ -1807,6 +1808,7 @@ export type Query = {
recentEmails: RecentEmailsResult;
recentSearches: RecentSearchesResult;
rules: RulesResult;
scanFeeds: ScanFeedsResult;
search: SearchResult;
sendInstallInstructions: SendInstallInstructionsResult;
subscriptions: SubscriptionsResult;
@@ -1843,6 +1845,11 @@ export type QueryRulesArgs = {
};
export type QueryScanFeedsArgs = {
input: ScanFeedsInput;
};
export type QuerySearchArgs = {
after?: InputMaybe<Scalars['String']>;
first?: InputMaybe<Scalars['Int']>;
@@ -2272,6 +2279,33 @@ export type SaveUrlInput = {
url: Scalars['String'];
};
export type ScanFeedsError = {
__typename?: 'ScanFeedsError';
errorCodes: Array<ScanFeedsErrorCode>;
};
export enum ScanFeedsErrorCode {
BadRequest = 'BAD_REQUEST'
}
export type ScanFeedsInput = {
opml?: InputMaybe<Scalars['String']>;
type: ScanFeedsType;
url?: InputMaybe<Scalars['String']>;
};
export type ScanFeedsResult = ScanFeedsError | ScanFeedsSuccess;
export type ScanFeedsSuccess = {
__typename?: 'ScanFeedsSuccess';
feeds: Array<Feed>;
};
export enum ScanFeedsType {
Html = 'HTML',
Opml = 'OPML'
}
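
For consumers of the generated types, both members of the ScanFeedsResult union carry a __typename discriminant, so results narrow cleanly in TypeScript. A minimal client-side sketch (the import path is an assumption, not part of this diff):

import { ScanFeedsResult } from './generated/graphql'

// Narrow the union on __typename; feeds exists only on the success branch.
const handleScanFeeds = (result: ScanFeedsResult): void => {
  if (result.__typename === 'ScanFeedsSuccess') {
    result.feeds.forEach((feed) => console.log(feed.title, feed.url, feed.type))
  } else if (result.__typename === 'ScanFeedsError') {
    console.error('scanFeeds failed:', result.errorCodes)
  }
}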
export type SearchError = {
__typename?: 'SearchError';
errorCodes: Array<SearchErrorCode>;
@@ -3739,6 +3773,12 @@ export type ResolversTypes = {
SaveResult: ResolversTypes['SaveError'] | ResolversTypes['SaveSuccess'];
SaveSuccess: ResolverTypeWrapper<SaveSuccess>;
SaveUrlInput: SaveUrlInput;
ScanFeedsError: ResolverTypeWrapper<ScanFeedsError>;
ScanFeedsErrorCode: ScanFeedsErrorCode;
ScanFeedsInput: ScanFeedsInput;
ScanFeedsResult: ResolversTypes['ScanFeedsError'] | ResolversTypes['ScanFeedsSuccess'];
ScanFeedsSuccess: ResolverTypeWrapper<ScanFeedsSuccess>;
ScanFeedsType: ScanFeedsType;
SearchError: ResolverTypeWrapper<SearchError>;
SearchErrorCode: SearchErrorCode;
SearchItem: ResolverTypeWrapper<SearchItem>;
@@ -4190,6 +4230,10 @@ export type ResolversParentTypes = {
SaveResult: ResolversParentTypes['SaveError'] | ResolversParentTypes['SaveSuccess'];
SaveSuccess: SaveSuccess;
SaveUrlInput: SaveUrlInput;
ScanFeedsError: ScanFeedsError;
ScanFeedsInput: ScanFeedsInput;
ScanFeedsResult: ResolversParentTypes['ScanFeedsError'] | ResolversParentTypes['ScanFeedsSuccess'];
ScanFeedsSuccess: ScanFeedsSuccess;
SearchError: SearchError;
SearchItem: SearchItem;
SearchItemEdge: SearchItemEdge;
@@ -4845,13 +4889,14 @@ export type FeatureResolvers<ContextType = ResolverContext, ParentType extends R
export type FeedResolvers<ContextType = ResolverContext, ParentType extends ResolversParentTypes['Feed'] = ResolversParentTypes['Feed']> = {
author?: Resolver<Maybe<ResolversTypes['String']>, ParentType, ContextType>;
createdAt?: Resolver<ResolversTypes['Date'], ParentType, ContextType>;
createdAt?: Resolver<Maybe<ResolversTypes['Date']>, ParentType, ContextType>;
description?: Resolver<Maybe<ResolversTypes['String']>, ParentType, ContextType>;
id?: Resolver<ResolversTypes['ID'], ParentType, ContextType>;
id?: Resolver<Maybe<ResolversTypes['ID']>, ParentType, ContextType>;
image?: Resolver<Maybe<ResolversTypes['String']>, ParentType, ContextType>;
publishedAt?: Resolver<Maybe<ResolversTypes['Date']>, ParentType, ContextType>;
title?: Resolver<ResolversTypes['String'], ParentType, ContextType>;
updatedAt?: Resolver<ResolversTypes['Date'], ParentType, ContextType>;
type?: Resolver<Maybe<ResolversTypes['String']>, ParentType, ContextType>;
updatedAt?: Resolver<Maybe<ResolversTypes['Date']>, ParentType, ContextType>;
url?: Resolver<ResolversTypes['String'], ParentType, ContextType>;
__isTypeOf?: IsTypeOfResolverFn<ParentType, ContextType>;
};
@@ -5436,6 +5481,7 @@ export type QueryResolvers<ContextType = ResolverContext, ParentType extends Res
recentEmails?: Resolver<ResolversTypes['RecentEmailsResult'], ParentType, ContextType>;
recentSearches?: Resolver<ResolversTypes['RecentSearchesResult'], ParentType, ContextType>;
rules?: Resolver<ResolversTypes['RulesResult'], ParentType, ContextType, Partial<QueryRulesArgs>>;
scanFeeds?: Resolver<ResolversTypes['ScanFeedsResult'], ParentType, ContextType, RequireFields<QueryScanFeedsArgs, 'input'>>;
search?: Resolver<ResolversTypes['SearchResult'], ParentType, ContextType, Partial<QuerySearchArgs>>;
sendInstallInstructions?: Resolver<ResolversTypes['SendInstallInstructionsResult'], ParentType, ContextType>;
subscriptions?: Resolver<ResolversTypes['SubscriptionsResult'], ParentType, ContextType, Partial<QuerySubscriptionsArgs>>;
@@ -5689,6 +5735,20 @@ export type SaveSuccessResolvers<ContextType = ResolverContext, ParentType exten
__isTypeOf?: IsTypeOfResolverFn<ParentType, ContextType>;
};
export type ScanFeedsErrorResolvers<ContextType = ResolverContext, ParentType extends ResolversParentTypes['ScanFeedsError'] = ResolversParentTypes['ScanFeedsError']> = {
errorCodes?: Resolver<Array<ResolversTypes['ScanFeedsErrorCode']>, ParentType, ContextType>;
__isTypeOf?: IsTypeOfResolverFn<ParentType, ContextType>;
};
export type ScanFeedsResultResolvers<ContextType = ResolverContext, ParentType extends ResolversParentTypes['ScanFeedsResult'] = ResolversParentTypes['ScanFeedsResult']> = {
__resolveType: TypeResolveFn<'ScanFeedsError' | 'ScanFeedsSuccess', ParentType, ContextType>;
};
export type ScanFeedsSuccessResolvers<ContextType = ResolverContext, ParentType extends ResolversParentTypes['ScanFeedsSuccess'] = ResolversParentTypes['ScanFeedsSuccess']> = {
feeds?: Resolver<Array<ResolversTypes['Feed']>, ParentType, ContextType>;
__isTypeOf?: IsTypeOfResolverFn<ParentType, ContextType>;
};
export type SearchErrorResolvers<ContextType = ResolverContext, ParentType extends ResolversParentTypes['SearchError'] = ResolversParentTypes['SearchError']> = {
errorCodes?: Resolver<Array<ResolversTypes['SearchErrorCode']>, ParentType, ContextType>;
__isTypeOf?: IsTypeOfResolverFn<ParentType, ContextType>;
@@ -6581,6 +6641,9 @@ export type Resolvers<ContextType = ResolverContext> = {
SaveFilterSuccess?: SaveFilterSuccessResolvers<ContextType>;
SaveResult?: SaveResultResolvers<ContextType>;
SaveSuccess?: SaveSuccessResolvers<ContextType>;
ScanFeedsError?: ScanFeedsErrorResolvers<ContextType>;
ScanFeedsResult?: ScanFeedsResultResolvers<ContextType>;
ScanFeedsSuccess?: ScanFeedsSuccessResolvers<ContextType>;
SearchError?: SearchErrorResolvers<ContextType>;
SearchItem?: SearchItemResolvers<ContextType>;
SearchItemEdge?: SearchItemEdgeResolvers<ContextType>;

View File

@@ -644,13 +644,14 @@ type Feature {
type Feed {
author: String
createdAt: Date!
createdAt: Date
description: String
id: ID!
id: ID
image: String
publishedAt: Date
title: String!
updatedAt: Date!
type: String
updatedAt: Date
url: String!
}
@@ -1364,6 +1365,7 @@ type Query {
recentEmails: RecentEmailsResult!
recentSearches: RecentSearchesResult!
rules(enabled: Boolean): RulesResult!
scanFeeds(input: ScanFeedsInput!): ScanFeedsResult!
search(after: String, first: Int, format: String, includeContent: Boolean, query: String): SearchResult!
sendInstallInstructions: SendInstallInstructionsResult!
subscriptions(sort: SortParams, type: SubscriptionType): SubscriptionsResult!
@@ -1729,6 +1731,31 @@ input SaveUrlInput {
url: String!
}
type ScanFeedsError {
errorCodes: [ScanFeedsErrorCode!]!
}
enum ScanFeedsErrorCode {
BAD_REQUEST
}
input ScanFeedsInput {
opml: String
type: ScanFeedsType!
url: String
}
union ScanFeedsResult = ScanFeedsError | ScanFeedsSuccess
type ScanFeedsSuccess {
feeds: [Feed!]!
}
enum ScanFeedsType {
HTML
OPML
}
type SearchError {
errorCodes: [SearchErrorCode!]!
}

View File

@@ -21,8 +21,12 @@ import {
CreateArticleError,
CreateArticleErrorCode,
CreateArticleSuccess,
MoveToFolderError,
MoveToFolderErrorCode,
MoveToFolderSuccess,
MutationBulkActionArgs,
MutationCreateArticleArgs,
MutationMoveToFolderArgs,
MutationSaveArticleReadingProgressArgs,
MutationSetBookmarkArticleArgs,
MutationSetFavoriteArticleArgs,
@@ -85,6 +89,7 @@ import {
generateSlug,
isParsingTimeout,
libraryItemToArticle,
libraryItemToArticleSavingRequest,
libraryItemToSearchItem,
titleForFilePath,
userDataToUser,
@@ -870,6 +875,80 @@ export const setFavoriteArticleResolver = authorized<
}
})
export const moveToFolderResolver = authorized<
MoveToFolderSuccess,
MoveToFolderError,
MutationMoveToFolderArgs
>(async (_, { id, folder }, { authTrx, pubsub, uid }) => {
analytics.track({
userId: uid,
event: 'move_to_folder',
properties: {
id,
folder,
},
})
const item = await authTrx((tx) =>
tx.getRepository(LibraryItem).findOne({
where: {
id,
},
relations: ['user'],
})
)
if (!item) {
return {
errorCodes: [MoveToFolderErrorCode.Unauthorized],
}
}
if (item.folder === folder) {
return {
errorCodes: [MoveToFolderErrorCode.AlreadyExists],
}
}
const savedAt = new Date()
// if the content is not fetched yet, create a page save request
if (!item.readableContent) {
const articleSavingRequest = await createPageSaveRequest({
userId: uid,
url: item.originalUrl,
articleSavingRequestId: id,
priority: 'high',
publishedAt: item.publishedAt || undefined,
savedAt,
pubsub,
})
return {
__typename: 'MoveToFolderSuccess',
articleSavingRequest,
}
}
const updatedItem = await updateLibraryItem(
item.id,
{
folder,
savedAt,
},
uid,
pubsub
)
return {
__typename: 'MoveToFolderSuccess',
articleSavingRequest: libraryItemToArticleSavingRequest(
updatedItem.user,
updatedItem
),
}
})
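
A usage sketch for this mutation (argument types are inferred from the resolver; the selection on ArticleSavingRequest is an assumption):

// Hypothetical client-side document for the moveToFolder mutation.
const MOVE_TO_FOLDER = `
  mutation MoveToFolder($id: ID!, $folder: String!) {
    moveToFolder(id: $id, folder: $folder) {
      ... on MoveToFolderSuccess { articleSavingRequest { id } }
      ... on MoveToFolderError { errorCodes }
    }
  }`

Note the two success paths in the resolver: items whose content has not been fetched yet are re-enqueued as a high-priority page save request, while already-fetched items simply have their folder and savedAt updated.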
const getUpdateReason = (libraryItem: LibraryItem, since: Date) => {
if (libraryItem.deletedAt) {
return UpdateReason.Deleted

View File

@@ -1,146 +0,0 @@
import { LibraryItem } from '../../entity/library_item'
import {
FeedEdge,
FeedsError,
FeedsErrorCode,
FeedsSuccess,
MoveToFolderError,
MoveToFolderErrorCode,
MoveToFolderSuccess,
MutationMoveToFolderArgs,
QueryFeedsArgs,
} from '../../generated/graphql'
import { feedRepository } from '../../repository/feed'
import { createPageSaveRequest } from '../../services/create_page_save_request'
import { updateLibraryItem } from '../../services/library_item'
import { analytics } from '../../utils/analytics'
import {
authorized,
libraryItemToArticleSavingRequest,
} from '../../utils/helpers'
export const feedsResolver = authorized<
FeedsSuccess,
FeedsError,
QueryFeedsArgs
>(async (_, { input }, { log }) => {
try {
const startCursor = input.after || ''
const start =
startCursor && !isNaN(Number(startCursor)) ? Number(startCursor) : 0
const first = Math.min(input.first || 10, 100) // cap at 100
const { feeds, count } = await feedRepository.searchFeeds(
input.query || '',
first + 1, // fetch one extra to check if there is a next page
start,
input.sort?.by,
input.sort?.order || undefined
)
const hasNextPage = feeds.length > first
const endCursor = String(start + feeds.length - (hasNextPage ? 1 : 0))
if (hasNextPage) {
// remove an extra if exists
feeds.pop()
}
const edges: FeedEdge[] = feeds.map((feed) => ({
node: feed,
cursor: endCursor,
}))
return {
__typename: 'FeedsSuccess',
edges,
pageInfo: {
hasPreviousPage: start > 0,
hasNextPage,
startCursor,
endCursor,
totalCount: count,
},
}
} catch (error) {
log.error('Error fetching feeds', error)
return {
errorCodes: [FeedsErrorCode.BadRequest],
}
}
})
export const moveToFolderResolver = authorized<
MoveToFolderSuccess,
MoveToFolderError,
MutationMoveToFolderArgs
>(async (_, { id, folder }, { authTrx, pubsub, uid }) => {
analytics.track({
userId: uid,
event: 'move_to_folder',
properties: {
id,
folder,
},
})
const item = await authTrx((tx) =>
tx.getRepository(LibraryItem).findOne({
where: {
id,
},
relations: ['user'],
})
)
if (!item) {
return {
errorCodes: [MoveToFolderErrorCode.Unauthorized],
}
}
if (item.folder === folder) {
return {
errorCodes: [MoveToFolderErrorCode.AlreadyExists],
}
}
const savedAt = new Date()
// if the content is not fetched yet, create a page save request
if (!item.readableContent) {
const articleSavingRequest = await createPageSaveRequest({
userId: uid,
url: item.originalUrl,
articleSavingRequestId: id,
priority: 'high',
publishedAt: item.publishedAt || undefined,
savedAt,
pubsub,
})
return {
__typename: 'MoveToFolderSuccess',
articleSavingRequest,
}
}
const updatedItem = await updateLibraryItem(
item.id,
{
folder,
savedAt,
},
uid,
pubsub
)
return {
__typename: 'MoveToFolderSuccess',
articleSavingRequest: libraryItemToArticleSavingRequest(
updatedItem.user,
updatedItem
),
}
})

View File

@@ -29,7 +29,6 @@ import {
generateUploadFilePathName,
} from '../utils/uploads'
import { optInFeatureResolver } from './features'
import { feedsResolver, moveToFolderResolver } from './following'
import { uploadImportFileResolver } from './importers/uploadImportFileResolver'
import {
addPopularReadResolver,
@@ -53,6 +52,7 @@ import {
deleteRuleResolver,
deleteWebhookResolver,
deviceTokensResolver,
feedsResolver,
filtersResolver,
generateApiKeyResolver,
getAllUsersResolver,
@@ -76,6 +76,7 @@ import {
mergeHighlightResolver,
moveFilterResolver,
moveLabelResolver,
moveToFolderResolver,
newsletterEmailsResolver,
recommendHighlightsResolver,
recommendResolver,
@@ -88,6 +89,7 @@ import {
saveFilterResolver,
savePageResolver,
saveUrlResolver,
scanFeedsResolver,
searchResolver,
sendInstallInstructionsResolver,
setBookmarkArticleResolver,
@@ -249,6 +251,7 @@ export const functionResolvers = {
groups: groupsResolver,
recentEmails: recentEmailsResolver,
feeds: feedsResolver,
scanFeeds: scanFeedsResolver,
},
User: {
async intercomHash(

View File

@@ -1,12 +1,24 @@
import axios from 'axios'
import { parseHTML } from 'linkedom'
import Parser from 'rss-parser'
import { Brackets } from 'typeorm'
import { Subscription } from '../../entity/subscription'
import { env } from '../../env'
import {
FeedEdge,
FeedsError,
FeedsErrorCode,
FeedsSuccess,
MutationSubscribeArgs,
MutationUnsubscribeArgs,
MutationUpdateSubscriptionArgs,
QueryFeedsArgs,
QueryScanFeedsArgs,
QuerySubscriptionsArgs,
ScanFeedsError,
ScanFeedsErrorCode,
ScanFeedsSuccess,
ScanFeedsType,
SortBy,
SortOrder,
SubscribeError,
@@ -25,11 +37,13 @@ import {
UpdateSubscriptionSuccess,
} from '../../generated/graphql'
import { getRepository } from '../../repository'
import { feedRepository } from '../../repository/feed'
import { unsubscribe } from '../../services/subscriptions'
import { Merge } from '../../util'
import { analytics } from '../../utils/analytics'
import { enqueueRssFeedFetch } from '../../utils/createTask'
import { authorized } from '../../utils/helpers'
import { parseFeed, parseOpml } from '../../utils/parser'
type PartialSubscription = Omit<Subscription, 'newsletterEmail'>
@@ -175,7 +189,7 @@ export const subscribeResolver = authorized<
SubscribeSuccessPartial,
SubscribeError,
MutationSubscribeArgs
>(async (_, { input }, { authTrx, uid, log }) => {
>(async (_, { input }, { uid, log }) => {
try {
analytics.track({
userId: uid,
@@ -224,7 +238,12 @@ export const subscribeResolver = authorized<
// create new rss subscription
const MAX_RSS_SUBSCRIPTIONS = 150
// validate rss feed
const feed = await parser.parseURL(input.url)
const feed = await parseFeed(input.url)
if (!feed) {
return {
errorCodes: [SubscribeErrorCode.NotFound],
}
}
// limit number of rss subscriptions to 150
const results = (await getRepository(Subscription).query(
@@ -235,11 +254,11 @@
returning *;`,
[
feed.title,
feed.feedUrl,
feed.url,
feed.description || null,
SubscriptionType.Rss,
uid,
feed.image?.url || null,
feed.thumbnail || null,
input.autoAddToLibrary ?? null,
input.isPrivate ?? null,
MAX_RSS_SUBSCRIPTIONS,
@@ -336,3 +355,132 @@ export const updateSubscriptionResolver = authorized<
}
}
})
export const feedsResolver = authorized<
FeedsSuccess,
FeedsError,
QueryFeedsArgs
>(async (_, { input }, { log }) => {
try {
const startCursor = input.after || ''
const start =
startCursor && !isNaN(Number(startCursor)) ? Number(startCursor) : 0
const first = Math.min(input.first || 10, 100) // cap at 100
const { feeds, count } = await feedRepository.searchFeeds(
input.query || '',
first + 1, // fetch one extra to check if there is a next page
start,
input.sort?.by,
input.sort?.order || undefined
)
const hasNextPage = feeds.length > first
const endCursor = String(start + feeds.length - (hasNextPage ? 1 : 0))
if (hasNextPage) {
// drop the extra row fetched to detect the next page
feeds.pop()
}
const edges: FeedEdge[] = feeds.map((feed) => ({
node: feed,
cursor: endCursor,
}))
return {
__typename: 'FeedsSuccess',
edges,
pageInfo: {
hasPreviousPage: start > 0,
hasNextPage,
startCursor,
endCursor,
totalCount: count,
},
}
} catch (error) {
log.error('Error fetching feeds', error)
return {
errorCodes: [FeedsErrorCode.BadRequest],
}
}
})
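
The fetch-one-extra cursor arithmetic above is easy to get wrong, so here is a self-contained sketch of the same pattern (function name hypothetical):

// Given an offset `start`, a page size `first`, and `fetched` rows
// (queried with a limit of first + 1), derive the page info computed above.
const pageInfoFor = (start: number, first: number, fetched: number) => {
  const hasNextPage = fetched > first // the extra row signals another page
  const returned = hasNextPage ? fetched - 1 : fetched
  return {
    hasPreviousPage: start > 0,
    hasNextPage,
    endCursor: String(start + returned), // offset where the next page starts
  }
}

// e.g. pageInfoFor(0, 10, 11) => { hasPreviousPage: false, hasNextPage: true, endCursor: '10' }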
export const scanFeedsResolver = authorized<
ScanFeedsSuccess,
ScanFeedsError,
QueryScanFeedsArgs
>(async (_, { input: { type, opml, url } }, { log, uid }) => {
analytics.track({
userId: uid,
event: 'scan_feeds',
properties: {
type,
},
})
if (type === ScanFeedsType.Opml) {
if (!opml) {
return {
errorCodes: [ScanFeedsErrorCode.BadRequest],
}
}
// parse opml
const feeds = parseOpml(opml)
if (!feeds) {
return {
errorCodes: [ScanFeedsErrorCode.BadRequest],
}
}
return {
__typename: 'ScanFeedsSuccess',
feeds: feeds.map((feed) => ({
url: feed.url,
title: feed.title,
type: feed.type || 'rss',
})),
}
}
if (!url) {
return {
errorCodes: [ScanFeedsErrorCode.BadRequest],
}
}
try {
// fetch HTML and parse feeds
const response = await axios.get(url, {
timeout: 5000,
headers: {
'User-Agent': 'Mozilla/5.0',
Accept: 'text/html',
},
})
const html = response.data as string
const dom = parseHTML(html).document
const links = dom.querySelectorAll('link[type="application/rss+xml"]')
const feeds = Array.from(links)
.map((link) => ({
url: link.getAttribute('href') || '',
title: link.getAttribute('title') || '',
type: 'rss',
}))
.filter((feed) => feed.url)
return {
__typename: 'ScanFeedsSuccess',
feeds,
}
} catch (error) {
log.error('Error scanning HTML', error)
return {
errorCodes: [ScanFeedsErrorCode.BadRequest],
}
}
})
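
The HTML branch relies on RSS autodiscovery <link> tags. A standalone sketch of the same extraction, runnable outside the resolver (sample markup is illustrative):

import { parseHTML } from 'linkedom'

const html = `<html><head>
  <link rel="alternate" type="application/rss+xml"
        title="Example Blog" href="https://example.com/rss.xml">
</head><body></body></html>`

const { document } = parseHTML(html)
const feeds = Array.from(
  document.querySelectorAll('link[type="application/rss+xml"]')
)
  .map((link) => ({
    url: link.getAttribute('href') || '',
    title: link.getAttribute('title') || '',
    type: 'rss',
  }))
  .filter((feed) => feed.url)

console.log(feeds)
// [{ url: 'https://example.com/rss.xml', title: 'Example Blog', type: 'rss' }]

Note that the selector matches only application/rss+xml, so a page that advertises its feed solely as application/atom+xml would come back empty.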

View File

@@ -2646,15 +2646,16 @@ const schema = gql`
}
type Feed {
id: ID!
id: ID
title: String!
url: String!
description: String
image: String
createdAt: Date!
updatedAt: Date!
createdAt: Date
updatedAt: Date
publishedAt: Date
author: String
type: String
}
union MoveToFolderResult = MoveToFolderSuccess | MoveToFolderError
@@ -2673,6 +2674,31 @@ const schema = gql`
ALREADY_EXISTS
}
input ScanFeedsInput {
type: ScanFeedsType!
url: String
opml: String
}
enum ScanFeedsType {
OPML
HTML
}
union ScanFeedsResult = ScanFeedsSuccess | ScanFeedsError
type ScanFeedsSuccess {
feeds: [Feed!]!
}
type ScanFeedsError {
errorCodes: [ScanFeedsErrorCode!]!
}
enum ScanFeedsErrorCode {
BAD_REQUEST
}
# Mutations
type Mutation {
googleLogin(input: GoogleLoginInput!): LoginResult!
@@ -2837,6 +2863,7 @@ const schema = gql`
groups: GroupsResult!
recentEmails: RecentEmailsResult!
feeds(input: FeedsInput!): FeedsResult!
scanFeeds(input: ScanFeedsInput!): ScanFeedsResult!
}
`
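
A minimal sketch of exercising the new query over HTTP (the endpoint URL and the Authorization header are assumptions, not taken from this diff):

const SCAN_FEEDS = `
  query ScanFeeds($input: ScanFeedsInput!) {
    scanFeeds(input: $input) {
      ... on ScanFeedsSuccess { feeds { title url type } }
      ... on ScanFeedsError { errorCodes }
    }
  }`

const scanFeeds = async (url: string) => {
  const res = await fetch('https://api-prod.omnivore.app/api/graphql', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      Authorization: '<api key>', // hypothetical credential
    },
    body: JSON.stringify({
      query: SCAN_FEEDS,
      variables: { input: { type: 'HTML', url } },
    }),
  })
  return res.json()
}

For the OPML path, pass { type: 'OPML', opml: '<opml file contents>' } instead; the resolver returns BAD_REQUEST when the field matching the chosen type is missing.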

View File

@@ -30,7 +30,6 @@ import { validateUrl } from '../services/create_page_save_request'
import { updateLibraryItem } from '../services/library_item'
import { Merge } from '../util'
import { logger } from './logger'
import { InFilter } from './search'
interface InputObject {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
[key: string]: any

View File

@@ -12,6 +12,8 @@ import * as jwt from 'jsonwebtoken'
import { parseHTML } from 'linkedom'
import { NodeHtmlMarkdown, TranslatorConfigObject } from 'node-html-markdown'
import { ElementNode } from 'node-html-markdown/dist/nodes'
import Parser from 'rss-parser'
import { parser } from 'sax'
import { ILike } from 'typeorm'
import { promisify } from 'util'
import { v4 as uuid } from 'uuid'
@@ -31,9 +33,26 @@ import {
import { createImageProxyUrl } from './imageproxy'
import { buildLogger, LogRecord } from './logger'
interface Feed {
title: string
url: string
type: string
thumbnail?: string
description?: string
}
const logger = buildLogger('utils.parse')
const signToken = promisify(jwt.sign)
const axiosInstance = axios.create({
timeout: 5000,
headers: {
'User-Agent': 'Mozilla/5.0',
Accept: 'text/html',
},
responseType: 'text',
})
export const ALLOWED_CONTENT_TYPES = [
'text/html',
'application/octet-stream',
@@ -703,3 +722,119 @@ export const getDistillerResult = async (
return undefined
}
}
const fetchHtml = async (url: string): Promise<string | null> => {
try {
const response = await axiosInstance.get(url)
return response.data as string
} catch (error) {
logger.error('Error fetching html', error)
return null
}
}
export const parseOpml = (opml: string): Feed[] | undefined => {
  const xmlParser = parser(true, { lowercase: true })
  const feeds: Feed[] = []
  const existingFeeds = new Map<string, boolean>()

  xmlParser.onopentag = function (node) {
    if (node.name === 'outline') {
      // folders are also outlines, so only keep nodes that carry an xmlUrl
      const feedUrl = node.attributes.xmlUrl?.toString()
      if (feedUrl && !existingFeeds.has(feedUrl)) {
        feeds.push({
          title: node.attributes.title?.toString() || '',
          url: feedUrl,
          type: node.attributes.type?.toString() || 'rss',
        })
        existingFeeds.set(feedUrl, true)
      }
    }
  }

  try {
    // sax parses synchronously, so feeds is fully populated once close() returns
    xmlParser.write(opml).close()
    return feeds
  } catch (error) {
    logger.error('Error parsing opml', error)
    return undefined
  }
}
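
For reference, a minimal OPML document that exercises the folder guard above (values illustrative):

const sampleOpml = `<?xml version="1.0" encoding="UTF-8"?>
<opml version="2.0">
  <body>
    <outline text="Tech">
      <outline type="rss" title="Example Feed"
               xmlUrl="https://example.com/feed.xml" />
    </outline>
  </body>
</opml>`

// The outer "Tech" outline has no xmlUrl and is skipped; the inner one yields:
// [{ title: 'Example Feed', url: 'https://example.com/feed.xml', type: 'rss' }]
console.log(parseOpml(sampleOpml))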
export const parseHtml = async (url: string): Promise<Feed[] | undefined> => {
// fetch HTML and parse feeds
const html = await fetchHtml(url)
if (!html) return undefined
try {
const dom = parseHTML(html).document
const links = dom.querySelectorAll('link[type="application/rss+xml"]')
const feeds = Array.from(links)
.map((link) => ({
url: link.getAttribute('href') || '',
title: link.getAttribute('title') || '',
type: 'rss',
}))
.filter((feed) => feed.url)
return feeds
} catch (error) {
logger.error('Error parsing html', error)
return undefined
}
}
export const parseFeed = async (url: string): Promise<Feed | null> => {
try {
// check if url is a telegram channel
const telegramRegex = /https:\/\/t\.me\/([a-zA-Z0-9_]+)/
const telegramMatch = url.match(telegramRegex)
if (telegramMatch) {
// fetch HTML and parse feeds
const html = await fetchHtml(url)
if (!html) return null
const dom = parseHTML(html).document
const title = dom.querySelector('meta[property="og:title"]')
const thumbnail = dom.querySelector('meta[property="og:image"]')
const description = dom.querySelector('meta[property="og:description"]')
return {
title: title?.getAttribute('content') || url,
url,
type: 'telegram',
thumbnail: thumbnail?.getAttribute('content') || '',
description: description?.getAttribute('content') || '',
}
}
const parser = new Parser({
timeout: 5000, // 5 seconds
headers: {
// some rss feeds require user agent
'User-Agent':
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
Accept:
'application/rss+xml, application/rdf+xml;q=0.8, application/atom+xml;q=0.6, application/xml;q=0.4, text/xml;q=0.4',
},
})
const feed = await parser.parseURL(url)
const feedUrl = feed.feedUrl || url
return {
title: feed.title || feedUrl,
url: feedUrl,
thumbnail: feed.image?.url,
type: 'rss',
description: feed.description,
}
} catch (error) {
logger.error('Error parsing feed', error)
return null
}
}
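
Usage sketch: parseFeed normalizes a Telegram channel page and an ordinary feed into the same Feed shape (URLs illustrative):

const demo = async () => {
  // Ordinary RSS/Atom feed: fetched and parsed with rss-parser.
  const rss = await parseFeed('https://example.com/feed.xml')
  // => { title, url, type: 'rss', thumbnail?, description? } or null on error

  // Telegram channel: built from the page's OpenGraph meta tags instead of XML.
  const tg = await parseFeed('https://t.me/examplechannel')
  console.log(rss, tg)
}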

View File

@@ -28,6 +28,7 @@
"axios": "^1.4.0",
"dotenv": "^16.0.1",
"jsonwebtoken": "^8.5.1",
"linkedom": "^0.16.4",
"rss-parser": "^3.13.0"
},
"volta": {

View File

@@ -3,6 +3,7 @@ import axios from 'axios'
import crypto from 'crypto'
import * as dotenv from 'dotenv' // see https://github.com/motdotla/dotenv#how-do-i-use-dotenv-with-import
import * as jwt from 'jsonwebtoken'
import { parseHTML } from 'linkedom'
import Parser, { Item } from 'rss-parser'
import { promisify } from 'util'
import { CONTENT_FETCH_URL, createCloudTask } from './task'
@@ -87,6 +88,48 @@ export const fetchAndChecksum = async (url: string) => {
}
}
const parseFeed = async (url: string, content: string) => {
try {
// check if url is a telegram channel
const telegramRegex = /https:\/\/t\.me\/([a-zA-Z0-9_]+)/
const telegramMatch = url.match(telegramRegex)
if (telegramMatch) {
const dom = parseHTML(content).document
const title = dom.querySelector('meta[property="og:title"]')
// each post element carries a data-post attribute
const posts = dom.querySelectorAll('[data-post]')
const items = Array.from(posts)
.map((post) => {
const id = post.getAttribute('data-post')
if (!id) {
return null
}
const url = `https://t.me/${telegramMatch[1]}/${id}`
// find the <time> element
const time = post.querySelector('time')
const dateTime = time?.getAttribute('datetime') || undefined
return {
link: url,
isoDate: dateTime,
}
})
.filter((item) => !!item) as RssFeedItem[]
return {
title: title?.getAttribute('content') || dom.title,
items,
}
}
return parser.parseString(content)
} catch (error) {
console.error('Error parsing feed content', error)
return null
}
}
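
A sketch of the markup this branch consumes (the exact t.me page structure is an assumption inferred from the selectors above):

import { parseHTML } from 'linkedom'

// Hypothetical fragment of a t.me channel preview page.
const sample = `<div data-post="123">
  <time datetime="2023-11-28T03:27:36+00:00"></time>
</div>`

const { document } = parseHTML(sample)
const post = document.querySelector('[data-post]')
console.log(
  post?.getAttribute('data-post'), // '123' becomes https://t.me/<channel>/123
  post?.querySelector('time')?.getAttribute('datetime') // mapped to isoDate
)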
const sendUpdateSubscriptionMutation = async (
userId: string,
subscriptionId: string,
@@ -477,7 +520,12 @@
console.log('Processing feed', feedUrl)
const fetchResult = await fetchAndChecksum(feedUrl)
const feed = await parser.parseString(fetchResult.content)
const feed = await parseFeed(feedUrl, fetchResult.content)
if (!feed) {
console.error('Failed to parse RSS feed', feedUrl)
return res.status(500).send('INVALID_RSS_FEED')
}
console.log('Fetched feed', feed.title, new Date())
await Promise.all(
@@ -498,7 +546,7 @@
res.send('ok')
} catch (e) {
console.error('Error while parsing RSS feed', e)
console.error('Error while saving RSS feeds', e)
res.status(500).send('INTERNAL_SERVER_ERROR')
}
}

View File

@@ -7663,6 +7663,13 @@
dependencies:
htmlparser2 "^4.1.0"
"@types/sax@^1.2.7":
version "1.2.7"
resolved "https://registry.yarnpkg.com/@types/sax/-/sax-1.2.7.tgz#ba5fe7df9aa9c89b6dff7688a19023dd2963091d"
integrity sha512-rO73L89PJxeYM3s3pPPjiPgVVcymqU490g0YO5n5By0k2Erzj6tay/4lr1CHAAU4JyOWd1rpQ8bCf6cZfHU96A==
dependencies:
"@types/node" "*"
"@types/scheduler@*":
version "0.16.2"
resolved "https://registry.yarnpkg.com/@types/scheduler/-/scheduler-0.16.2.tgz#1a62f89525723dde24ba1b01b092bf5df8ad4d39"
@@ -12574,7 +12581,7 @@ domhandler@^4.3.1:
dependencies:
domelementtype "^2.2.0"
domhandler@^5.0.1, domhandler@^5.0.2:
domhandler@^5.0.1, domhandler@^5.0.2, domhandler@^5.0.3:
version "5.0.3"
resolved "https://registry.yarnpkg.com/domhandler/-/domhandler-5.0.3.tgz#cc385f7f751f1d1fc650c21374804254538c7d31"
integrity sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==
@@ -12623,6 +12630,15 @@ domutils@^3.0.1:
domelementtype "^2.3.0"
domhandler "^5.0.1"
domutils@^3.1.0:
version "3.1.0"
resolved "https://registry.yarnpkg.com/domutils/-/domutils-3.1.0.tgz#c47f551278d3dc4b0b1ab8cbb42d751a6f0d824e"
integrity sha512-H78uMmQtI2AhgDJjWeQmHwJJ2bLPD3GMmO7Zja/ZZh84wkm+4ut+IUnUdRa8uCGX88DiVx1j6FRe1XfxEgjEZA==
dependencies:
dom-serializer "^2.0.0"
domelementtype "^2.3.0"
domhandler "^5.0.3"
dot-case@^2.1.0:
version "2.1.1"
resolved "https://registry.yarnpkg.com/dot-case/-/dot-case-2.1.1.tgz#34dcf37f50a8e93c2b3bca8bb7fb9155c7da3bee"
@@ -12936,6 +12952,11 @@ entities@^4.2.0, entities@^4.3.0:
resolved "https://registry.yarnpkg.com/entities/-/entities-4.3.0.tgz#62915f08d67353bb4eb67e3d62641a4059aec656"
integrity sha512-/iP1rZrSEJ0DTlPiX+jbzlA3eVkY/e8L8SozroF395fIqE3TYF/Nz7YOMAawta+vLmyJ/hkGNNPcSbMADCCXbg==
entities@^4.5.0:
version "4.5.0"
resolved "https://registry.yarnpkg.com/entities/-/entities-4.5.0.tgz#5d268ea5e7113ec74c4d033b79ea5a35a488fb48"
integrity sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==
entities@~2.1.0:
version "2.1.0"
resolved "https://registry.yarnpkg.com/entities/-/entities-2.1.0.tgz#992d3129cf7df6870b96c57858c249a120f8b8b5"
@@ -16138,6 +16159,16 @@ htmlparser2@^8.0.1:
domutils "^3.0.1"
entities "^4.3.0"
htmlparser2@^9.0.0:
version "9.0.0"
resolved "https://registry.yarnpkg.com/htmlparser2/-/htmlparser2-9.0.0.tgz#e431142b7eeb1d91672742dea48af8ac7140cddb"
integrity sha512-uxbSI98wmFT/G4P2zXx4OVx04qWUmyFPrD2/CNepa2Zo3GPNaCaaxElDgwUrwYWkK1nr9fft0Ya8dws8coDLLQ==
dependencies:
domelementtype "^2.3.0"
domhandler "^5.0.3"
domutils "^3.1.0"
entities "^4.5.0"
htmltidy2@^0.3.0:
version "0.3.0"
resolved "https://registry.yarnpkg.com/htmltidy2/-/htmltidy2-0.3.0.tgz#1edfb74b8cd530cdcdc29ef547c849a651f0870b"
@@ -18748,6 +18779,17 @@ linkedom@^0.14.9:
htmlparser2 "^8.0.1"
uhyphen "^0.1.0"
linkedom@^0.16.4:
version "0.16.4"
resolved "https://registry.yarnpkg.com/linkedom/-/linkedom-0.16.4.tgz#6ea2711d03196b58af01fa8acab26cb231f38baf"
integrity sha512-SykvDVh/jAnaO+WiPqH5vX3QpZrIRImuppzYhIHons3RXPhDwqN2dOyfopOVaHleqWtoS+3vWCqen+m8M3HToQ==
dependencies:
css-select "^5.1.0"
cssom "^0.5.0"
html-escaper "^3.0.3"
htmlparser2 "^9.0.0"
uhyphen "^0.2.0"
linkify-it@^3.0.1:
version "3.0.3"
resolved "https://registry.yarnpkg.com/linkify-it/-/linkify-it-3.0.3.tgz#a98baf44ce45a550efb4d49c769d07524cc2fa2e"
@@ -24939,6 +24981,11 @@ sax@>=0.6.0:
resolved "https://registry.yarnpkg.com/sax/-/sax-1.2.4.tgz#2816234e2378bddc4e5354fab5caa895df7100d9"
integrity sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw==
sax@^1.3.0:
version "1.3.0"
resolved "https://registry.yarnpkg.com/sax/-/sax-1.3.0.tgz#a5dbe77db3be05c9d1ee7785dbd3ea9de51593d0"
integrity sha512-0s+oAmw9zLl1V1cS9BtZN7JAd0cW5e0QH4W3LWEK6a4LaLEA2OTpGYWDY+6XasBLtz6wkm3u1xRw95mRuJ59WA==
saxes@^5.0.1:
version "5.0.1"
resolved "https://registry.yarnpkg.com/saxes/-/saxes-5.0.1.tgz#eebab953fa3b7608dbe94e5dadb15c888fa6696d"