diff --git a/packages/api/src/entity/subscription.ts b/packages/api/src/entity/subscription.ts index 90f7d593b..b63400d56 100644 --- a/packages/api/src/entity/subscription.ts +++ b/packages/api/src/entity/subscription.ts @@ -59,6 +59,9 @@ export class Subscription { @Column('timestamp', { nullable: true }) lastFetchedAt?: Date | null + @Column('text', { nullable: true }) + lastFetchedChecksum?: string | null + @CreateDateColumn({ default: () => 'CURRENT_TIMESTAMP' }) createdAt!: Date diff --git a/packages/api/src/generated/graphql.ts b/packages/api/src/generated/graphql.ts index f0f41ff68..0ad412b4f 100644 --- a/packages/api/src/generated/graphql.ts +++ b/packages/api/src/generated/graphql.ts @@ -2978,6 +2978,7 @@ export type UpdateSubscriptionInput = { description?: InputMaybe; id: Scalars['ID']; lastFetchedAt?: InputMaybe; + lastFetchedChecksum?: InputMaybe; name?: InputMaybe; status?: InputMaybe; }; diff --git a/packages/api/src/generated/schema.graphql b/packages/api/src/generated/schema.graphql index 3ef33f886..af9064bee 100644 --- a/packages/api/src/generated/schema.graphql +++ b/packages/api/src/generated/schema.graphql @@ -2391,6 +2391,7 @@ input UpdateSubscriptionInput { description: String id: ID! lastFetchedAt: Date + lastFetchedChecksum: String name: String status: SubscriptionStatus } diff --git a/packages/api/src/resolvers/subscriptions/index.ts b/packages/api/src/resolvers/subscriptions/index.ts index 74375441d..430347654 100644 --- a/packages/api/src/resolvers/subscriptions/index.ts +++ b/packages/api/src/resolvers/subscriptions/index.ts @@ -290,6 +290,7 @@ export const updateSubscriptionResolver = authorized< lastFetchedAt: input.lastFetchedAt ? new Date(input.lastFetchedAt) : undefined, + lastFetchedChecksum: input.lastFetchedChecksum || undefined, status: input.status || undefined, }) diff --git a/packages/api/src/routers/svc/rss_feed.ts b/packages/api/src/routers/svc/rss_feed.ts index 0834e5025..c06ec9f71 100644 --- a/packages/api/src/routers/svc/rss_feed.ts +++ b/packages/api/src/routers/svc/rss_feed.ts @@ -24,7 +24,7 @@ export function rssFeedRouter() { // get all active rss feed subscriptions const subscriptions = await getRepository(Subscription).find({ - select: ['id', 'url', 'user', 'lastFetchedAt'], + select: ['id', 'url', 'user', 'lastFetchedAt', 'lastFetchedChecksum'], where: { type: SubscriptionType.Rss, status: SubscriptionStatus.Active, diff --git a/packages/api/src/schema.ts b/packages/api/src/schema.ts index e3b2b3442..2cc4e8b58 100755 --- a/packages/api/src/schema.ts +++ b/packages/api/src/schema.ts @@ -2548,6 +2548,7 @@ const schema = gql` name: String description: String lastFetchedAt: Date + lastFetchedChecksum: String status: SubscriptionStatus } diff --git a/packages/api/src/utils/createTask.ts b/packages/api/src/utils/createTask.ts index bb3457e00..483921bc9 100644 --- a/packages/api/src/utils/createTask.ts +++ b/packages/api/src/utils/createTask.ts @@ -601,6 +601,7 @@ export const enqueueRssFeedFetch = async ( subscriptionId: rssFeedSubscription.id, feedUrl: rssFeedSubscription.url, lastFetchedAt: rssFeedSubscription.lastFetchedAt?.getTime() || 0, // unix timestamp in milliseconds + lastFetchedChecksum: rssFeedSubscription.lastFetchedChecksum || null, } const headers = { diff --git a/packages/db/migrations/0138.do.add_checksum_to_subscriptions_table.sql b/packages/db/migrations/0138.do.add_checksum_to_subscriptions_table.sql new file mode 100755 index 000000000..dfc37edfe --- /dev/null +++ b/packages/db/migrations/0138.do.add_checksum_to_subscriptions_table.sql @@ -0,0 +1,9 @@ +-- Type: DO +-- Name: add_checksum_to_subscriptions_table +-- Description: Add a last fetched checksum field to the subscriptions table + +BEGIN; + +ALTER TABLE omnivore.subscriptions ADD COLUMN last_fetched_checksum TEXT ; + +COMMIT; diff --git a/packages/db/migrations/0138.undo.add_checksum_to_subscriptions_table.sql b/packages/db/migrations/0138.undo.add_checksum_to_subscriptions_table.sql new file mode 100755 index 000000000..e138d501a --- /dev/null +++ b/packages/db/migrations/0138.undo.add_checksum_to_subscriptions_table.sql @@ -0,0 +1,9 @@ +-- Type: UNDO +-- Name: add_checksum_to_subscriptions_table +-- Description: Add a last fetched checksum field to the subscriptions table + +BEGIN; + +ALTER TABLE omnivore.subscriptions DROP COLUMN last_fetched_checksum ; + +COMMIT; diff --git a/packages/rss-handler/package.json b/packages/rss-handler/package.json index 2a6d40851..0db5d93fd 100644 --- a/packages/rss-handler/package.json +++ b/packages/rss-handler/package.json @@ -17,7 +17,8 @@ "devDependencies": { "chai": "^4.3.6", "eslint-plugin-prettier": "^4.0.0", - "mocha": "^10.0.0" + "mocha": "^10.0.0", + "nock": "^13.3.4" }, "dependencies": { "@google-cloud/functions-framework": "3.1.2", diff --git a/packages/rss-handler/src/index.ts b/packages/rss-handler/src/index.ts index 60e789e73..ca6afb04b 100644 --- a/packages/rss-handler/src/index.ts +++ b/packages/rss-handler/src/index.ts @@ -1,5 +1,6 @@ import * as Sentry from '@sentry/serverless' import axios from 'axios' +import crypto from 'crypto' import * as dotenv from 'dotenv' // see https://github.com/motdotla/dotenv#how-do-i-use-dotenv-with-import import * as jwt from 'jsonwebtoken' import Parser, { Item } from 'rss-parser' @@ -10,6 +11,7 @@ interface RssFeedRequest { subscriptionId: string feedUrl: string lastFetchedAt: number // unix timestamp in milliseconds + lastFetchedChecksum: string | undefined } // link can be a string or an object @@ -21,10 +23,37 @@ function isRssFeedRequest(body: any): body is RssFeedRequest { ) } +export const fetchAndChecksum = async (url: string) => { + try { + const response = await axios.get(url, { + responseType: 'arraybuffer', + timeout: 60_000, + maxRedirects: 10, + headers: { + 'User-Agent': + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', + Accept: + 'application/rss+xml, application/rdf+xml;q=0.8, application/atom+xml;q=0.6, application/xml;q=0.4, text/xml;q=0.4', + }, + }) + + const hash = crypto.createHash('sha256') + hash.update(response.data as Buffer) + + const dataStr = (response.data as Buffer).toString() + + return { url, content: dataStr, checksum: hash.digest('hex') } + } catch (error) { + console.log(error) + throw new Error(`Failed to fetch or hash content from ${url}.`) + } +} + const sendUpdateSubscriptionMutation = async ( userId: string, subscriptionId: string, - lastFetchedAt: Date + lastFetchedAt: Date, + lastFetchedChecksum: string ) => { const JWT_SECRET = process.env.JWT_SECRET const REST_BACKEND_ENDPOINT = process.env.REST_BACKEND_ENDPOINT @@ -51,6 +80,7 @@ const sendUpdateSubscriptionMutation = async ( input: { id: subscriptionId, lastFetchedAt, + lastFetchedChecksum, }, }, }) @@ -118,19 +148,15 @@ Sentry.GCPFunction.init({ const signToken = promisify(jwt.sign) const parser = new Parser({ - timeout: 60000, // 60 seconds - maxRedirects: 10, customFields: { - item: [['link', 'links', { keepArray: true }], 'published', 'updated'], + item: [ + ['link', 'links', { keepArray: true }], + 'published', + 'updated', + 'created', + ], feed: ['dc:date', 'lastBuildDate', 'pubDate'], }, - headers: { - // some rss feeds require user agent - 'User-Agent': - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', - Accept: - 'application/rss+xml, application/rdf+xml;q=0.8, application/atom+xml;q=0.6, application/xml;q=0.4, text/xml;q=0.4', - }, }) // get link following the order of preference: via, alternate, self @@ -190,16 +216,24 @@ export const rssHandler = Sentry.GCPFunction.wrapHttpFunction( return res.status(400).send('INVALID_REQUEST_BODY') } - const { feedUrl, subscriptionId, lastFetchedAt } = req.body + const { feedUrl, subscriptionId, lastFetchedAt, lastFetchedChecksum } = + req.body console.log('Processing feed', feedUrl, lastFetchedAt) let lastItemFetchedAt: Date | null = null let lastValidItem: Item | null = null + const fetchResult = await fetchAndChecksum(feedUrl) + if (fetchResult.checksum === lastFetchedChecksum) { + console.log('feed has not been updated', feedUrl, lastFetchedChecksum) + return res.status(200) + } + const updatedLastFetchedChecksum = fetchResult.checksum + // fetch feed let itemCount = 0 - const feed = await parser.parseURL(feedUrl) - console.log('Fetched feed', feed, new Date()) + const feed = await parser.parseString(fetchResult.content) + console.log('Fetched feed', feed.title, new Date()) const feedPubDate = (feed['dc:date'] || feed.pubDate || @@ -214,7 +248,10 @@ export const rssHandler = Sentry.GCPFunction.wrapHttpFunction( for (const item of feed.items) { // use published or updated if isoDate is not available for atom feeds item.isoDate = - item.isoDate || (item.published as string) || (item.updated as string) + item.isoDate || + (item.published as string) || + (item.updated as string) || + (item.created as string) console.log('Processing feed item', item.links, item.isoDate) if (!item.links || item.links.length === 0) { @@ -299,7 +336,8 @@ export const rssHandler = Sentry.GCPFunction.wrapHttpFunction( const updatedSubscription = await sendUpdateSubscriptionMutation( userId, subscriptionId, - lastItemFetchedAt + lastItemFetchedAt, + updatedLastFetchedChecksum ) console.log('Updated subscription', updatedSubscription) diff --git a/packages/rss-handler/test/checksum.test.ts b/packages/rss-handler/test/checksum.test.ts new file mode 100644 index 000000000..24eee7a0b --- /dev/null +++ b/packages/rss-handler/test/checksum.test.ts @@ -0,0 +1,14 @@ +import 'mocha' +import nock from 'nock' +import { expect } from 'chai' +import { fetchAndChecksum } from '../src/index' + +describe('fetchAndChecksum', () => { + it('should hash the content available', async () => { + nock('https://fake.com', {}).get('/rss.xml').reply(200, 'i am some content') + const result = await fetchAndChecksum('https://fake.com/rss.xml') + expect(result.checksum).to.eq( + 'd6bc10faec048d999d0cf4b2f7103d84557fb9cd94c3bccd17884b1288949375' + ) + }) +}) diff --git a/packages/rss-handler/test/stub.test.ts b/packages/rss-handler/test/stub.test.ts deleted file mode 100644 index 24ad25c8f..000000000 --- a/packages/rss-handler/test/stub.test.ts +++ /dev/null @@ -1,8 +0,0 @@ -import 'mocha' -import { expect } from 'chai' - -describe('stub test', () => { - it('should pass', () => { - expect(true).to.be.true - }) -})