Merge pull request #4235 from omnivore-app/fix/original-html

stop storing original html in the database
This commit is contained in:
Hongbo Wu
2024-08-02 14:53:10 +08:00
committed by GitHub
15 changed files with 75 additions and 158 deletions

View File

@ -135,9 +135,6 @@ export class LibraryItem {
@Column('enum', { enum: ContentReaderType, default: ContentReaderType.WEB })
contentReader!: ContentReaderType
@Column('text', { nullable: true })
originalContent?: string | null
@Column('text')
readableContent!: string

View File

@ -273,7 +273,6 @@ export const savePageJob = async (data: Data, attemptsMade: number) => {
publishedAt: publishedAt ? new Date(publishedAt) : null,
source,
folder,
originalContentUploaded: true,
},
user
)

View File

@ -45,39 +45,15 @@ const CONTENT_TYPES = {
readable: 'text/html',
}
/**
 * Maps a requested content format to the library_item column to select and
 * whether the highlights relation must be joined in.
 *
 * NOTE(review): this is the pre-change version being removed by this hunk —
 * the 'original' format (and the 'originalContent' column) is dropped
 * elsewhere in this commit.
 *
 * @throws Error for any format not handled by a case above.
 */
const getSelectOptions = (
  format: ContentFormat
): { column: 'readableContent' | 'originalContent'; highlights?: boolean } => {
  switch (format) {
    // Both markdown and readable render from the readable HTML column.
    case 'markdown':
    case 'readable':
      return {
        column: 'readableContent',
      }
    case 'highlightedMarkdown':
      return {
        column: 'readableContent',
        highlights: true, // caller must join highlights to inline them
      }
    case 'original':
      return {
        column: 'originalContent',
      }
    default:
      throw new Error('Unsupported format')
  }
}
export const uploadContentJob = async (data: UploadContentJobData) => {
logger.info('Uploading content to bucket', data)
const { libraryItemId, userId, format, filePath } = data
const { column, highlights } = getSelectOptions(format)
const libraryItem = await findLibraryItemById(libraryItemId, userId, {
select: ['id', column], // id is required for relations
select: ['id', 'readableContent'], // id is required for relations
relations: {
highlights,
highlights: format === 'highlightedMarkdown',
},
})
if (!libraryItem) {
@ -85,10 +61,10 @@ export const uploadContentJob = async (data: UploadContentJobData) => {
throw new Error('Library item not found')
}
const content = data.content || libraryItem[column]
const content = libraryItem.readableContent
if (!content) {
logger.error(`${column} not found`)
logger.error(`content not found`)
throw new Error('Content not found')
}

View File

@ -4,7 +4,6 @@
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
/* eslint-disable @typescript-eslint/no-floating-promises */
import { Readability } from '@omnivore/readability'
import graphqlFields from 'graphql-fields'
import {
ContentReaderType,
LibraryItem,
@ -106,7 +105,6 @@ import {
titleForFilePath,
} from '../../utils/helpers'
import {
getDistillerResult,
htmlToMarkdown,
ParsedContentPuppeteer,
parsePreparedContent,
@ -376,15 +374,9 @@ export const getArticleResolver = authorized<
Merge<ArticleSuccess, { article: LibraryItem }>,
ArticleError,
QueryArticleArgs
>(async (_obj, { slug, format }, { uid, log }, info) => {
>(async (_obj, { slug, format }, { uid, log }) => {
try {
const selectColumns = getColumns(libraryItemRepository)
const includeOriginalHtml =
format === ArticleFormat.Distiller ||
!!graphqlFields(info).article.originalHtml
if (!includeOriginalHtml) {
selectColumns.splice(selectColumns.indexOf('originalContent'), 1)
}
const libraryItem = await authTrx(
(tx) => {
@ -435,18 +427,6 @@ export const getArticleResolver = authorized<
if (format === ArticleFormat.Markdown) {
libraryItem.readableContent = htmlToMarkdown(libraryItem.readableContent)
} else if (format === ArticleFormat.Distiller) {
if (!libraryItem.originalContent) {
return { errorCodes: [ArticleErrorCode.BadData] }
}
const distillerResult = await getDistillerResult(
uid,
libraryItem.originalContent
)
if (!distillerResult) {
return { errorCodes: [ArticleErrorCode.BadData] }
}
libraryItem.readableContent = distillerResult
}
return {

View File

@ -442,7 +442,7 @@ export const functionResolvers = {
return article.originalUrl
},
hasContent(article: LibraryItem) {
return !!article.originalContent && !!article.readableContent
return !!article.readableContent
},
publishedAt(article: LibraryItem) {
return validatedDate(article.publishedAt || undefined)

View File

@ -108,7 +108,6 @@ export function pageRouter() {
id: clientRequestId,
user: { id: claims.uid },
title,
originalContent: '',
itemType: PageType.File,
uploadFile: { id: uploadFileData.id },
slug: generateSlug(uploadFilePathName),

View File

@ -30,11 +30,9 @@ import {
} from '../repository'
import { libraryItemRepository } from '../repository/library_item'
import { Merge, PickTuple } from '../util'
import { enqueueBulkUploadContentJob } from '../utils/createTask'
import { deepDelete, setRecentlySavedItemInRedis } from '../utils/helpers'
import { logError, logger } from '../utils/logger'
import { logger } from '../utils/logger'
import { parseSearchQuery } from '../utils/search'
import { contentFilePath, downloadFromBucket } from '../utils/uploads'
import { HighlightEvent } from './highlights'
import { addLabelsToLibraryItem, LabelEvent } from './labels'
@ -45,7 +43,6 @@ const columnsToDelete = [
'links',
'textContentHash',
'readableContent',
'originalContent',
'feedContent',
] as const
type ColumnsToDeleteType = typeof columnsToDelete[number]
@ -136,7 +133,7 @@ const readingProgressDataSource = new ReadingProgressDataSource()
export const batchGetLibraryItems = async (ids: readonly string[]) => {
// select all columns except content
const select = getColumns(libraryItemRepository).filter(
(select) => ['originalContent', 'readableContent'].indexOf(select) === -1
(select) => select !== 'readableContent'
)
const items = await authTrx(
async (tx) =>
@ -638,12 +635,10 @@ export const createSearchQueryBuilder = (
) => {
const queryBuilder = em.createQueryBuilder(LibraryItem, 'library_item')
// select all columns except content
// exclude content if not requested
const selects: Select[] = getColumns(libraryItemRepository)
.filter(
(select) =>
select !== 'originalContent' && // exclude original content
(args.includeContent || select !== 'readableContent') // exclude content if not requested
(select) => args.includeContent || select !== 'readableContent' //
)
.map((column) => ({ column: `library_item.${column}` }))
@ -762,9 +757,7 @@ export const findRecentLibraryItems = async (
offset?: number
) => {
const selectColumns = getColumns(libraryItemRepository)
.filter(
(column) => column !== 'readableContent' && column !== 'originalContent'
)
.filter((column) => column !== 'readableContent')
.map((column) => `library_item.${column}`)
return authTrx(
@ -797,11 +790,9 @@ export const findLibraryItemsByIds = async (
relations?: Array<'labels' | 'highlights'>
}
) => {
const selectColumns =
options?.select?.map((column) => `library_item.${column}`) ||
getColumns(libraryItemRepository)
.filter((column) => column !== 'originalContent')
.map((column) => `library_item.${column}`)
const selectColumns = (
options?.select || getColumns(libraryItemRepository)
).map((column) => `library_item.${column}`)
return authTrx(
async (tx) => {
const qb = tx
@ -1061,17 +1052,8 @@ export const createOrUpdateLibraryItem = async (
libraryItem: CreateOrUpdateLibraryItemArgs,
userId: string,
pubsub = createPubSubClient(),
skipPubSub = false,
originalContentUploaded = false
skipPubSub = false
): Promise<LibraryItem> => {
let originalContent: string | null = null
if (libraryItem.originalContent) {
originalContent = libraryItem.originalContent
// remove original content from the item
delete libraryItem.originalContent
}
const newLibraryItem = await authTrx(
async (tx) => {
const repo = tx.withRepository(libraryItemRepository)
@ -1146,24 +1128,6 @@ export const createOrUpdateLibraryItem = async (
const data = deepDelete(newLibraryItem, columnsToDelete)
await pubsub.entityCreated<ItemEvent>(EntityType.ITEM, data, userId)
// upload original content to GCS in a job if it's not already uploaded
if (originalContent && !originalContentUploaded) {
try {
await enqueueUploadOriginalContent(
userId,
newLibraryItem.id,
newLibraryItem.savedAt,
originalContent
)
logger.info('Queued to upload original content in GCS', {
id: newLibraryItem.id,
})
} catch (error) {
logError(error)
}
}
return newLibraryItem
}
@ -1747,41 +1711,3 @@ export const filterItemEvents = (
throw new Error('Unexpected state.')
}
export const enqueueUploadOriginalContent = async (
userId: string,
libraryItemId: string,
savedAt: Date,
originalContent: string
) => {
const filePath = contentFilePath({
userId,
libraryItemId,
savedAt,
format: 'original',
})
await enqueueBulkUploadContentJob([
{
userId,
libraryItemId,
filePath,
format: 'original',
content: originalContent,
},
])
}
export const downloadOriginalContent = async (
userId: string,
libraryItemId: string,
savedAt: Date
) => {
return downloadFromBucket(
contentFilePath({
userId,
libraryItemId,
savedAt,
format: 'original',
})
)
}

View File

@ -47,7 +47,6 @@ export const addRecommendation = async (
author: item.author,
description: item.description,
originalUrl: item.originalUrl,
originalContent: item.originalContent,
contentReader: item.contentReader,
directionality: item.directionality,
itemLanguage: item.itemLanguage,

View File

@ -12,7 +12,7 @@ export const saveContentDisplayReport = async (
input: ReportItemInput
): Promise<boolean> => {
const item = await findLibraryItemById(input.pageId, uid, {
select: ['id', 'readableContent', 'originalContent', 'originalUrl'],
select: ['id', 'readableContent', 'originalUrl'],
})
if (!item) {
logger.info('unable to submit report, item not found', input)

View File

@ -89,7 +89,6 @@ export const saveEmail = async (
user: { id: input.userId },
slug,
readableContent: content,
originalContent: input.originalContent,
description: metadata?.description || parseResult.parsedContent?.excerpt,
title: input.title,
author: input.author,

View File

@ -71,7 +71,6 @@ export type SavePageArgs = Merge<
feedContent?: string
previewImage?: string
author?: string
originalContentUploaded?: boolean
}
>
@ -149,8 +148,7 @@ export const savePage = async (
itemToSave,
user.id,
undefined,
isImported,
input.originalContentUploaded
isImported
)
clientRequestId = newItem.id
@ -240,7 +238,6 @@ export const parsedContentToLibraryItem = ({
id: itemId || undefined,
slug,
user: { id: userId },
originalContent: originalHtml,
readableContent: parsedContent?.content || '',
description: parsedContent?.excerpt,
previewContent: parsedContent?.excerpt,

View File

@ -124,7 +124,6 @@ export const createTestLibraryItem = async (
item,
userId,
undefined,
true,
true
)
if (labels) {

View File

@ -430,7 +430,6 @@ describe('Article API', () => {
before(async () => {
const itemToCreate: CreateOrUpdateLibraryItemArgs = {
title: 'test title',
originalContent: '<p>test</p>',
slug: realSlug,
readingProgressTopPercent: 100,
user,
@ -441,7 +440,6 @@ describe('Article API', () => {
itemToCreate,
user.id,
undefined,
true,
true
)
itemId = item.id
@ -1309,7 +1307,6 @@ describe('Article API', () => {
item,
user.id,
undefined,
true,
true
)
items.push(savedItem)

View File

@ -71,7 +71,7 @@ const uploadToBucket = async (filePath: string, data: string) => {
await storage
.bucket(bucketName)
.file(filePath)
.save(data, { public: false, timeout: 30000 })
.save(data, { public: false, timeout: 5000 })
}
const uploadOriginalContent = async (
@ -238,6 +238,14 @@ export const contentFetchRequestHandler: RequestHandler = async (req, res) => {
})
try {
const domain = new URL(url).hostname
const isBlocked = await isDomainBlocked(redisDataSource, domain)
if (isBlocked) {
console.log('domain is blocked', domain)
return res.sendStatus(200)
}
const key = cacheKey(url, locale, timezone)
let fetchResult = await getCachedFetchResult(redisDataSource, key)
if (!fetchResult) {
@ -246,14 +254,6 @@ export const contentFetchRequestHandler: RequestHandler = async (req, res) => {
url
)
const domain = new URL(url).hostname
const isBlocked = await isDomainBlocked(redisDataSource, domain)
if (isBlocked) {
console.log('domain is blocked', domain)
return res.sendStatus(200)
}
try {
fetchResult = await fetchContent(url, locale, timezone)
console.log('content has been fetched')

View File

@ -0,0 +1,49 @@
#!/usr/bin/python3
import os
import psycopg2
# Postgres connection settings, each overridable via an environment variable.
PG_HOST = os.getenv('PG_HOST', 'localhost')
# NOTE(review): default is an int but an env override yields a str; both only
# feed the f-string DSN below, so this works — confirm no numeric use is added.
PG_PORT = os.getenv('PG_PORT', 5432)
PG_USER = os.getenv('PG_USER', 'app_user')
PG_PASSWORD = os.getenv('PG_PASSWORD', 'app_pass')
PG_DB = os.getenv('PG_DB', 'omnivore')
# connect_timeout in seconds; same str/int mix as PG_PORT when overridden.
PG_TIMEOUT = os.getenv('PG_TIMEOUT', 10)
def batch_update_library_items(conn, batch_size=100):
    """Null out omnivore.library_item.original_content in small batches.

    Batching through a ctid subselect keeps each UPDATE (and the locks/WAL it
    generates) small, so the migration can run against a live table. The
    transaction is committed after every batch so progress is never lost.

    :param conn: an open psycopg2 connection (caller owns its lifetime)
    :param batch_size: number of rows to clear per UPDATE/commit cycle
    :return: total number of rows updated across all batches
    """
    total_updated = 0
    with conn.cursor() as cursor:
        while True:
            # LIMIT is passed as a bound parameter instead of being
            # interpolated into the SQL text with an f-string.
            cursor.execute(
                """
                UPDATE omnivore.library_item
                SET original_content = NULL
                WHERE ctid IN (
                    SELECT ctid
                    FROM omnivore.library_item
                    WHERE original_content IS NOT NULL
                    LIMIT %s
                )
                """,
                (batch_size,),
            )
            rows_updated = cursor.rowcount
            conn.commit()
            total_updated += rows_updated
            # rowcount of 0 means no rows matched: migration is complete.
            if rows_updated == 0:
                break
    return total_updated
# Open the Postgres connection from the env-derived settings above.
# NOTE(review): a failed connect raises here, before the try block — that is
# fine, since there is nothing to clean up yet at this point.
conn = psycopg2.connect(
    f'host={PG_HOST} port={PG_PORT} dbname={PG_DB} user={PG_USER} \
password={PG_PASSWORD} connect_timeout={PG_TIMEOUT}')
print('Postgres connection:', conn.info)
try:
    print('Starting migration')
    batch_update_library_items(conn)
    print('Migration complete')
except Exception as err:
    # Log-and-continue: this is a one-off migration script, so no re-raise —
    # the finally clause below must still close the connection.
    print('Migration error', err)
finally:
    print('Closing connections')
    conn.close()