From 322f736fe02d9a6f424d5e90917e16eca53fead5 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Wed, 31 Jul 2024 19:14:38 +0800 Subject: [PATCH 1/2] stop storing original html in the database --- packages/api/src/entity/library_item.ts | 3 - packages/api/src/jobs/save_page.ts | 1 - packages/api/src/jobs/upload_content.ts | 32 +------ packages/api/src/resolvers/article/index.ts | 22 +---- .../api/src/resolvers/function_resolvers.ts | 2 +- packages/api/src/routers/page_router.ts | 1 - packages/api/src/services/library_item.ts | 92 ++----------------- packages/api/src/services/recommendation.ts | 1 - packages/api/src/services/reports.ts | 2 +- packages/api/src/services/save_email.ts | 1 - packages/api/src/services/save_page.ts | 5 +- packages/api/test/db.ts | 1 - packages/api/test/resolvers/article.test.ts | 3 - packages/content-fetch/src/request_handler.ts | 18 ++-- 14 files changed, 26 insertions(+), 158 deletions(-) diff --git a/packages/api/src/entity/library_item.ts b/packages/api/src/entity/library_item.ts index a0e3718fd..46d056310 100644 --- a/packages/api/src/entity/library_item.ts +++ b/packages/api/src/entity/library_item.ts @@ -135,9 +135,6 @@ export class LibraryItem { @Column('enum', { enum: ContentReaderType, default: ContentReaderType.WEB }) contentReader!: ContentReaderType - @Column('text', { nullable: true }) - originalContent?: string | null - @Column('text') readableContent!: string diff --git a/packages/api/src/jobs/save_page.ts b/packages/api/src/jobs/save_page.ts index cd85a1690..850b36d8b 100644 --- a/packages/api/src/jobs/save_page.ts +++ b/packages/api/src/jobs/save_page.ts @@ -273,7 +273,6 @@ export const savePageJob = async (data: Data, attemptsMade: number) => { publishedAt: publishedAt ? 
new Date(publishedAt) : null, source, folder, - originalContentUploaded: true, }, user ) diff --git a/packages/api/src/jobs/upload_content.ts b/packages/api/src/jobs/upload_content.ts index 4ffb5bdb9..15116bfca 100644 --- a/packages/api/src/jobs/upload_content.ts +++ b/packages/api/src/jobs/upload_content.ts @@ -45,39 +45,15 @@ const CONTENT_TYPES = { readable: 'text/html', } -const getSelectOptions = ( - format: ContentFormat -): { column: 'readableContent' | 'originalContent'; highlights?: boolean } => { - switch (format) { - case 'markdown': - case 'readable': - return { - column: 'readableContent', - } - case 'highlightedMarkdown': - return { - column: 'readableContent', - highlights: true, - } - case 'original': - return { - column: 'originalContent', - } - default: - throw new Error('Unsupported format') - } -} - export const uploadContentJob = async (data: UploadContentJobData) => { logger.info('Uploading content to bucket', data) const { libraryItemId, userId, format, filePath } = data - const { column, highlights } = getSelectOptions(format) const libraryItem = await findLibraryItemById(libraryItemId, userId, { - select: ['id', column], // id is required for relations + select: ['id', 'readableContent'], // id is required for relations relations: { - highlights, + highlights: format === 'highlightedMarkdown', }, }) if (!libraryItem) { @@ -85,10 +61,10 @@ export const uploadContentJob = async (data: UploadContentJobData) => { throw new Error('Library item not found') } - const content = data.content || libraryItem[column] + const content = libraryItem.readableContent if (!content) { - logger.error(`${column} not found`) + logger.error(`content not found`) throw new Error('Content not found') } diff --git a/packages/api/src/resolvers/article/index.ts b/packages/api/src/resolvers/article/index.ts index b28ca00ad..938ee123f 100644 --- a/packages/api/src/resolvers/article/index.ts +++ b/packages/api/src/resolvers/article/index.ts @@ -4,7 +4,6 @@ /* 
eslint-disable @typescript-eslint/no-unsafe-member-access */ /* eslint-disable @typescript-eslint/no-floating-promises */ import { Readability } from '@omnivore/readability' -import graphqlFields from 'graphql-fields' import { ContentReaderType, LibraryItem, @@ -106,7 +105,6 @@ import { titleForFilePath, } from '../../utils/helpers' import { - getDistillerResult, htmlToMarkdown, ParsedContentPuppeteer, parsePreparedContent, @@ -376,15 +374,9 @@ export const getArticleResolver = authorized< Merge, ArticleError, QueryArticleArgs ->(async (_obj, { slug, format }, { uid, log }, info) => { +>(async (_obj, { slug, format }, { uid, log }) => { try { const selectColumns = getColumns(libraryItemRepository) - const includeOriginalHtml = - format === ArticleFormat.Distiller || - !!graphqlFields(info).article.originalHtml - if (!includeOriginalHtml) { - selectColumns.splice(selectColumns.indexOf('originalContent'), 1) - } const libraryItem = await authTrx( (tx) => { @@ -435,18 +427,6 @@ export const getArticleResolver = authorized< if (format === ArticleFormat.Markdown) { libraryItem.readableContent = htmlToMarkdown(libraryItem.readableContent) - } else if (format === ArticleFormat.Distiller) { - if (!libraryItem.originalContent) { - return { errorCodes: [ArticleErrorCode.BadData] } - } - const distillerResult = await getDistillerResult( - uid, - libraryItem.originalContent - ) - if (!distillerResult) { - return { errorCodes: [ArticleErrorCode.BadData] } - } - libraryItem.readableContent = distillerResult } return { diff --git a/packages/api/src/resolvers/function_resolvers.ts b/packages/api/src/resolvers/function_resolvers.ts index 55c5ac245..752bdf2e9 100644 --- a/packages/api/src/resolvers/function_resolvers.ts +++ b/packages/api/src/resolvers/function_resolvers.ts @@ -442,7 +442,7 @@ export const functionResolvers = { return article.originalUrl }, hasContent(article: LibraryItem) { - return !!article.originalContent && !!article.readableContent + return 
!!article.readableContent }, publishedAt(article: LibraryItem) { return validatedDate(article.publishedAt || undefined) diff --git a/packages/api/src/routers/page_router.ts b/packages/api/src/routers/page_router.ts index 504983482..81d23ad9e 100644 --- a/packages/api/src/routers/page_router.ts +++ b/packages/api/src/routers/page_router.ts @@ -108,7 +108,6 @@ export function pageRouter() { id: clientRequestId, user: { id: claims.uid }, title, - originalContent: '', itemType: PageType.File, uploadFile: { id: uploadFileData.id }, slug: generateSlug(uploadFilePathName), diff --git a/packages/api/src/services/library_item.ts b/packages/api/src/services/library_item.ts index ed133eaff..5fa422103 100644 --- a/packages/api/src/services/library_item.ts +++ b/packages/api/src/services/library_item.ts @@ -30,11 +30,9 @@ import { } from '../repository' import { libraryItemRepository } from '../repository/library_item' import { Merge, PickTuple } from '../util' -import { enqueueBulkUploadContentJob } from '../utils/createTask' import { deepDelete, setRecentlySavedItemInRedis } from '../utils/helpers' -import { logError, logger } from '../utils/logger' +import { logger } from '../utils/logger' import { parseSearchQuery } from '../utils/search' -import { contentFilePath, downloadFromBucket } from '../utils/uploads' import { HighlightEvent } from './highlights' import { addLabelsToLibraryItem, LabelEvent } from './labels' @@ -45,7 +43,6 @@ const columnsToDelete = [ 'links', 'textContentHash', 'readableContent', - 'originalContent', 'feedContent', ] as const type ColumnsToDeleteType = typeof columnsToDelete[number] @@ -136,7 +133,7 @@ const readingProgressDataSource = new ReadingProgressDataSource() export const batchGetLibraryItems = async (ids: readonly string[]) => { // select all columns except content const select = getColumns(libraryItemRepository).filter( - (select) => ['originalContent', 'readableContent'].indexOf(select) === -1 + (select) => select !== 'readableContent' ) 
const items = await authTrx( async (tx) => @@ -638,12 +635,10 @@ export const createSearchQueryBuilder = ( ) => { const queryBuilder = em.createQueryBuilder(LibraryItem, 'library_item') - // select all columns except content + // exclude content if not requested const selects: Select[] = getColumns(libraryItemRepository) .filter( - (select) => - select !== 'originalContent' && // exclude original content - (args.includeContent || select !== 'readableContent') // exclude content if not requested + (select) => args.includeContent || select !== 'readableContent' // ) .map((column) => ({ column: `library_item.${column}` })) @@ -762,9 +757,7 @@ export const findRecentLibraryItems = async ( offset?: number ) => { const selectColumns = getColumns(libraryItemRepository) - .filter( - (column) => column !== 'readableContent' && column !== 'originalContent' - ) + .filter((column) => column !== 'readableContent') .map((column) => `library_item.${column}`) return authTrx( @@ -797,11 +790,9 @@ export const findLibraryItemsByIds = async ( relations?: Array<'labels' | 'highlights'> } ) => { - const selectColumns = - options?.select?.map((column) => `library_item.${column}`) || - getColumns(libraryItemRepository) - .filter((column) => column !== 'originalContent') - .map((column) => `library_item.${column}`) + const selectColumns = ( + options?.select || getColumns(libraryItemRepository) + ).map((column) => `library_item.${column}`) return authTrx( async (tx) => { const qb = tx @@ -1061,17 +1052,8 @@ export const createOrUpdateLibraryItem = async ( libraryItem: CreateOrUpdateLibraryItemArgs, userId: string, pubsub = createPubSubClient(), - skipPubSub = false, - originalContentUploaded = false + skipPubSub = false ): Promise => { - let originalContent: string | null = null - if (libraryItem.originalContent) { - originalContent = libraryItem.originalContent - - // remove original content from the item - delete libraryItem.originalContent - } - const newLibraryItem = await authTrx( 
async (tx) => { const repo = tx.withRepository(libraryItemRepository) @@ -1146,24 +1128,6 @@ export const createOrUpdateLibraryItem = async ( const data = deepDelete(newLibraryItem, columnsToDelete) await pubsub.entityCreated(EntityType.ITEM, data, userId) - // upload original content to GCS in a job if it's not already uploaded - if (originalContent && !originalContentUploaded) { - try { - await enqueueUploadOriginalContent( - userId, - newLibraryItem.id, - newLibraryItem.savedAt, - originalContent - ) - - logger.info('Queued to upload original content in GCS', { - id: newLibraryItem.id, - }) - } catch (error) { - logError(error) - } - } - return newLibraryItem } @@ -1747,41 +1711,3 @@ export const filterItemEvents = ( throw new Error('Unexpected state.') } - -export const enqueueUploadOriginalContent = async ( - userId: string, - libraryItemId: string, - savedAt: Date, - originalContent: string -) => { - const filePath = contentFilePath({ - userId, - libraryItemId, - savedAt, - format: 'original', - }) - await enqueueBulkUploadContentJob([ - { - userId, - libraryItemId, - filePath, - format: 'original', - content: originalContent, - }, - ]) -} - -export const downloadOriginalContent = async ( - userId: string, - libraryItemId: string, - savedAt: Date -) => { - return downloadFromBucket( - contentFilePath({ - userId, - libraryItemId, - savedAt, - format: 'original', - }) - ) -} diff --git a/packages/api/src/services/recommendation.ts b/packages/api/src/services/recommendation.ts index bbf8ffdf1..7935f3f56 100644 --- a/packages/api/src/services/recommendation.ts +++ b/packages/api/src/services/recommendation.ts @@ -47,7 +47,6 @@ export const addRecommendation = async ( author: item.author, description: item.description, originalUrl: item.originalUrl, - originalContent: item.originalContent, contentReader: item.contentReader, directionality: item.directionality, itemLanguage: item.itemLanguage, diff --git a/packages/api/src/services/reports.ts 
b/packages/api/src/services/reports.ts index ee71f6807..b6761f3f8 100644 --- a/packages/api/src/services/reports.ts +++ b/packages/api/src/services/reports.ts @@ -12,7 +12,7 @@ export const saveContentDisplayReport = async ( input: ReportItemInput ): Promise => { const item = await findLibraryItemById(input.pageId, uid, { - select: ['id', 'readableContent', 'originalContent', 'originalUrl'], + select: ['id', 'readableContent', 'originalUrl'], }) if (!item) { logger.info('unable to submit report, item not found', input) diff --git a/packages/api/src/services/save_email.ts b/packages/api/src/services/save_email.ts index c7c096a33..4826a163d 100644 --- a/packages/api/src/services/save_email.ts +++ b/packages/api/src/services/save_email.ts @@ -89,7 +89,6 @@ export const saveEmail = async ( user: { id: input.userId }, slug, readableContent: content, - originalContent: input.originalContent, description: metadata?.description || parseResult.parsedContent?.excerpt, title: input.title, author: input.author, diff --git a/packages/api/src/services/save_page.ts b/packages/api/src/services/save_page.ts index c21a5db31..00b0392d1 100644 --- a/packages/api/src/services/save_page.ts +++ b/packages/api/src/services/save_page.ts @@ -71,7 +71,6 @@ export type SavePageArgs = Merge< feedContent?: string previewImage?: string author?: string - originalContentUploaded?: boolean } > @@ -149,8 +148,7 @@ export const savePage = async ( itemToSave, user.id, undefined, - isImported, - input.originalContentUploaded + isImported ) clientRequestId = newItem.id @@ -240,7 +238,6 @@ export const parsedContentToLibraryItem = ({ id: itemId || undefined, slug, user: { id: userId }, - originalContent: originalHtml, readableContent: parsedContent?.content || '', description: parsedContent?.excerpt, previewContent: parsedContent?.excerpt, diff --git a/packages/api/test/db.ts b/packages/api/test/db.ts index fdb5c5b3f..2b5fe6b7f 100644 --- a/packages/api/test/db.ts +++ b/packages/api/test/db.ts @@ -124,7 
+124,6 @@ export const createTestLibraryItem = async ( item, userId, undefined, - true, true ) if (labels) { diff --git a/packages/api/test/resolvers/article.test.ts b/packages/api/test/resolvers/article.test.ts index 5a64d06da..856bbaea3 100644 --- a/packages/api/test/resolvers/article.test.ts +++ b/packages/api/test/resolvers/article.test.ts @@ -430,7 +430,6 @@ describe('Article API', () => { before(async () => { const itemToCreate: CreateOrUpdateLibraryItemArgs = { title: 'test title', - originalContent: '

test

', slug: realSlug, readingProgressTopPercent: 100, user, @@ -441,7 +440,6 @@ describe('Article API', () => { itemToCreate, user.id, undefined, - true, true ) itemId = item.id @@ -1309,7 +1307,6 @@ describe('Article API', () => { item, user.id, undefined, - true, true ) items.push(savedItem) diff --git a/packages/content-fetch/src/request_handler.ts b/packages/content-fetch/src/request_handler.ts index fafe53ead..f8fea81f3 100644 --- a/packages/content-fetch/src/request_handler.ts +++ b/packages/content-fetch/src/request_handler.ts @@ -71,7 +71,7 @@ const uploadToBucket = async (filePath: string, data: string) => { await storage .bucket(bucketName) .file(filePath) - .save(data, { public: false, timeout: 30000 }) + .save(data, { public: false, timeout: 5000 }) } const uploadOriginalContent = async ( @@ -238,6 +238,14 @@ export const contentFetchRequestHandler: RequestHandler = async (req, res) => { }) try { + const domain = new URL(url).hostname + const isBlocked = await isDomainBlocked(redisDataSource, domain) + if (isBlocked) { + console.log('domain is blocked', domain) + + return res.sendStatus(200) + } + const key = cacheKey(url, locale, timezone) let fetchResult = await getCachedFetchResult(redisDataSource, key) if (!fetchResult) { @@ -246,14 +254,6 @@ export const contentFetchRequestHandler: RequestHandler = async (req, res) => { url ) - const domain = new URL(url).hostname - const isBlocked = await isDomainBlocked(redisDataSource, domain) - if (isBlocked) { - console.log('domain is blocked', domain) - - return res.sendStatus(200) - } - try { fetchResult = await fetchContent(url, locale, timezone) console.log('content has been fetched') From 8bbf2c07023b7370e1edfffe52b31d4a7ab21901 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Thu, 1 Aug 2024 11:56:36 +0800 Subject: [PATCH 2/2] add script to update original_content to NULL in db --- packages/db/remove_original_content.py | 49 ++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100755 
#!/usr/bin/python3
"""One-off maintenance script (packages/db/remove_original_content.py).

Sets ``omnivore.library_item.original_content`` to NULL for every row that
still has a value, working in small batches and committing after each batch.

Connection settings are read from the environment:
PG_HOST, PG_PORT, PG_USER, PG_PASSWORD, PG_DB, PG_TIMEOUT.

Exit status is 0 on success and 1 if the migration raised an error.
"""
import os
import sys

# os.getenv returns str when the variable is set, so the defaults are kept as
# str as well to give every setting a single, predictable type.
PG_HOST = os.getenv('PG_HOST', 'localhost')
PG_PORT = os.getenv('PG_PORT', '5432')
PG_USER = os.getenv('PG_USER', 'app_user')
PG_PASSWORD = os.getenv('PG_PASSWORD', 'app_pass')
PG_DB = os.getenv('PG_DB', 'omnivore')
PG_TIMEOUT = os.getenv('PG_TIMEOUT', '10')

DEFAULT_BATCH_SIZE = 100

# Rows are picked by ctid inside a LIMITed sub-select so that each statement
# touches at most one batch of rows.
_CLEAR_BATCH_SQL = """
    UPDATE omnivore.library_item
    SET original_content = NULL
    WHERE ctid IN (
        SELECT ctid
        FROM omnivore.library_item
        WHERE original_content IS NOT NULL
        LIMIT %s
    )
"""


def batch_update_library_items(conn, batch_size=DEFAULT_BATCH_SIZE):
    """Set original_content to NULL in batches, committing after each batch.

    Args:
        conn: an open DB-API connection (the script passes a psycopg2 one).
        batch_size: maximum number of rows updated per statement (positive).

    Returns:
        The total number of rows updated across all batches.

    Raises:
        ValueError: if batch_size is not a positive integer.
    """
    batch_size = int(batch_size)
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')

    total_updated = 0
    with conn.cursor() as cursor:
        while True:
            # The limit is bound as a query parameter instead of being
            # formatted into the SQL text.
            cursor.execute(_CLEAR_BATCH_SQL, (batch_size,))
            rows_updated = cursor.rowcount
            # Commit per batch so the work already done is kept if a later
            # batch fails.
            conn.commit()
            if rows_updated <= 0:
                break
            total_updated += rows_updated
    return total_updated


def main():
    """Connect to Postgres, run the migration and return a process exit code."""
    # Imported here so that importing this module (e.g. to reuse or test
    # batch_update_library_items) neither needs the driver nor opens a
    # connection.
    import psycopg2

    # Keyword arguments avoid hand-building a DSN string, which breaks when a
    # value (typically the password) contains spaces or quotes.
    conn = psycopg2.connect(
        host=PG_HOST,
        port=PG_PORT,
        dbname=PG_DB,
        user=PG_USER,
        password=PG_PASSWORD,
        connect_timeout=PG_TIMEOUT,
    )
    print('Postgres connection:', conn.info)

    try:
        print('Starting migration')
        total_updated = batch_update_library_items(conn)
        print('Migration complete')
        print('Rows updated:', total_updated)
        return 0
    except Exception as err:
        print('Migration error', err, file=sys.stderr)
        try:
            # Discard the failed, uncommitted batch before closing.
            conn.rollback()
        except Exception as rollback_err:
            print('Rollback error', rollback_err, file=sys.stderr)
        # Non-zero status so callers and CI can detect the failure.
        return 1
    finally:
        print('Closing connections')
        conn.close()


if __name__ == '__main__':
    sys.exit(main())