Merge pull request #4235 from omnivore-app/fix/original-html
stop storing original html in the database
This commit is contained in:
@ -135,9 +135,6 @@ export class LibraryItem {
|
||||
@Column('enum', { enum: ContentReaderType, default: ContentReaderType.WEB })
|
||||
contentReader!: ContentReaderType
|
||||
|
||||
@Column('text', { nullable: true })
|
||||
originalContent?: string | null
|
||||
|
||||
@Column('text')
|
||||
readableContent!: string
|
||||
|
||||
|
||||
@ -273,7 +273,6 @@ export const savePageJob = async (data: Data, attemptsMade: number) => {
|
||||
publishedAt: publishedAt ? new Date(publishedAt) : null,
|
||||
source,
|
||||
folder,
|
||||
originalContentUploaded: true,
|
||||
},
|
||||
user
|
||||
)
|
||||
|
||||
@ -45,39 +45,15 @@ const CONTENT_TYPES = {
|
||||
readable: 'text/html',
|
||||
}
|
||||
|
||||
/**
 * Maps a requested content format to the library-item column that holds the
 * text for that format, and says whether highlights must be loaded with it.
 *
 * - 'markdown' and 'readable' read from `readableContent`.
 * - 'highlightedMarkdown' reads from `readableContent` and needs highlights.
 * - 'original' reads from `originalContent`.
 *
 * @param format - the content format being requested
 * @returns the column to select; `highlights` is only set (to `true`) for
 *   'highlightedMarkdown'
 * @throws Error with message 'Unsupported format' for any other value
 */
const getSelectOptions = (
  format: ContentFormat
): { column: 'readableContent' | 'originalContent'; highlights?: boolean } => {
  switch (format) {
    // both formats are derived from the readable HTML
    case 'markdown':
    case 'readable':
      return {
        column: 'readableContent',
      }
    case 'highlightedMarkdown':
      return {
        column: 'readableContent',
        highlights: true,
      }
    case 'original':
      return {
        column: 'originalContent',
      }
    default:
      throw new Error('Unsupported format')
  }
}
|
||||
|
||||
export const uploadContentJob = async (data: UploadContentJobData) => {
|
||||
logger.info('Uploading content to bucket', data)
|
||||
|
||||
const { libraryItemId, userId, format, filePath } = data
|
||||
|
||||
const { column, highlights } = getSelectOptions(format)
|
||||
const libraryItem = await findLibraryItemById(libraryItemId, userId, {
|
||||
select: ['id', column], // id is required for relations
|
||||
select: ['id', 'readableContent'], // id is required for relations
|
||||
relations: {
|
||||
highlights,
|
||||
highlights: format === 'highlightedMarkdown',
|
||||
},
|
||||
})
|
||||
if (!libraryItem) {
|
||||
@ -85,10 +61,10 @@ export const uploadContentJob = async (data: UploadContentJobData) => {
|
||||
throw new Error('Library item not found')
|
||||
}
|
||||
|
||||
const content = data.content || libraryItem[column]
|
||||
const content = libraryItem.readableContent
|
||||
|
||||
if (!content) {
|
||||
logger.error(`${column} not found`)
|
||||
logger.error(`content not found`)
|
||||
throw new Error('Content not found')
|
||||
}
|
||||
|
||||
|
||||
@ -4,7 +4,6 @@
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
|
||||
/* eslint-disable @typescript-eslint/no-floating-promises */
|
||||
import { Readability } from '@omnivore/readability'
|
||||
import graphqlFields from 'graphql-fields'
|
||||
import {
|
||||
ContentReaderType,
|
||||
LibraryItem,
|
||||
@ -106,7 +105,6 @@ import {
|
||||
titleForFilePath,
|
||||
} from '../../utils/helpers'
|
||||
import {
|
||||
getDistillerResult,
|
||||
htmlToMarkdown,
|
||||
ParsedContentPuppeteer,
|
||||
parsePreparedContent,
|
||||
@ -376,15 +374,9 @@ export const getArticleResolver = authorized<
|
||||
Merge<ArticleSuccess, { article: LibraryItem }>,
|
||||
ArticleError,
|
||||
QueryArticleArgs
|
||||
>(async (_obj, { slug, format }, { uid, log }, info) => {
|
||||
>(async (_obj, { slug, format }, { uid, log }) => {
|
||||
try {
|
||||
const selectColumns = getColumns(libraryItemRepository)
|
||||
const includeOriginalHtml =
|
||||
format === ArticleFormat.Distiller ||
|
||||
!!graphqlFields(info).article.originalHtml
|
||||
if (!includeOriginalHtml) {
|
||||
selectColumns.splice(selectColumns.indexOf('originalContent'), 1)
|
||||
}
|
||||
|
||||
const libraryItem = await authTrx(
|
||||
(tx) => {
|
||||
@ -435,18 +427,6 @@ export const getArticleResolver = authorized<
|
||||
|
||||
if (format === ArticleFormat.Markdown) {
|
||||
libraryItem.readableContent = htmlToMarkdown(libraryItem.readableContent)
|
||||
} else if (format === ArticleFormat.Distiller) {
|
||||
if (!libraryItem.originalContent) {
|
||||
return { errorCodes: [ArticleErrorCode.BadData] }
|
||||
}
|
||||
const distillerResult = await getDistillerResult(
|
||||
uid,
|
||||
libraryItem.originalContent
|
||||
)
|
||||
if (!distillerResult) {
|
||||
return { errorCodes: [ArticleErrorCode.BadData] }
|
||||
}
|
||||
libraryItem.readableContent = distillerResult
|
||||
}
|
||||
|
||||
return {
|
||||
|
||||
@ -442,7 +442,7 @@ export const functionResolvers = {
|
||||
return article.originalUrl
|
||||
},
|
||||
hasContent(article: LibraryItem) {
|
||||
return !!article.originalContent && !!article.readableContent
|
||||
return !!article.readableContent
|
||||
},
|
||||
publishedAt(article: LibraryItem) {
|
||||
return validatedDate(article.publishedAt || undefined)
|
||||
|
||||
@ -108,7 +108,6 @@ export function pageRouter() {
|
||||
id: clientRequestId,
|
||||
user: { id: claims.uid },
|
||||
title,
|
||||
originalContent: '',
|
||||
itemType: PageType.File,
|
||||
uploadFile: { id: uploadFileData.id },
|
||||
slug: generateSlug(uploadFilePathName),
|
||||
|
||||
@ -30,11 +30,9 @@ import {
|
||||
} from '../repository'
|
||||
import { libraryItemRepository } from '../repository/library_item'
|
||||
import { Merge, PickTuple } from '../util'
|
||||
import { enqueueBulkUploadContentJob } from '../utils/createTask'
|
||||
import { deepDelete, setRecentlySavedItemInRedis } from '../utils/helpers'
|
||||
import { logError, logger } from '../utils/logger'
|
||||
import { logger } from '../utils/logger'
|
||||
import { parseSearchQuery } from '../utils/search'
|
||||
import { contentFilePath, downloadFromBucket } from '../utils/uploads'
|
||||
import { HighlightEvent } from './highlights'
|
||||
import { addLabelsToLibraryItem, LabelEvent } from './labels'
|
||||
|
||||
@ -45,7 +43,6 @@ const columnsToDelete = [
|
||||
'links',
|
||||
'textContentHash',
|
||||
'readableContent',
|
||||
'originalContent',
|
||||
'feedContent',
|
||||
] as const
|
||||
type ColumnsToDeleteType = typeof columnsToDelete[number]
|
||||
@ -136,7 +133,7 @@ const readingProgressDataSource = new ReadingProgressDataSource()
|
||||
export const batchGetLibraryItems = async (ids: readonly string[]) => {
|
||||
// select all columns except content
|
||||
const select = getColumns(libraryItemRepository).filter(
|
||||
(select) => ['originalContent', 'readableContent'].indexOf(select) === -1
|
||||
(select) => select !== 'readableContent'
|
||||
)
|
||||
const items = await authTrx(
|
||||
async (tx) =>
|
||||
@ -638,12 +635,10 @@ export const createSearchQueryBuilder = (
|
||||
) => {
|
||||
const queryBuilder = em.createQueryBuilder(LibraryItem, 'library_item')
|
||||
|
||||
// select all columns except content
|
||||
// exclude content if not requested
|
||||
const selects: Select[] = getColumns(libraryItemRepository)
|
||||
.filter(
|
||||
(select) =>
|
||||
select !== 'originalContent' && // exclude original content
|
||||
(args.includeContent || select !== 'readableContent') // exclude content if not requested
|
||||
(select) => args.includeContent || select !== 'readableContent' //
|
||||
)
|
||||
.map((column) => ({ column: `library_item.${column}` }))
|
||||
|
||||
@ -762,9 +757,7 @@ export const findRecentLibraryItems = async (
|
||||
offset?: number
|
||||
) => {
|
||||
const selectColumns = getColumns(libraryItemRepository)
|
||||
.filter(
|
||||
(column) => column !== 'readableContent' && column !== 'originalContent'
|
||||
)
|
||||
.filter((column) => column !== 'readableContent')
|
||||
.map((column) => `library_item.${column}`)
|
||||
|
||||
return authTrx(
|
||||
@ -797,11 +790,9 @@ export const findLibraryItemsByIds = async (
|
||||
relations?: Array<'labels' | 'highlights'>
|
||||
}
|
||||
) => {
|
||||
const selectColumns =
|
||||
options?.select?.map((column) => `library_item.${column}`) ||
|
||||
getColumns(libraryItemRepository)
|
||||
.filter((column) => column !== 'originalContent')
|
||||
.map((column) => `library_item.${column}`)
|
||||
const selectColumns = (
|
||||
options?.select || getColumns(libraryItemRepository)
|
||||
).map((column) => `library_item.${column}`)
|
||||
return authTrx(
|
||||
async (tx) => {
|
||||
const qb = tx
|
||||
@ -1061,17 +1052,8 @@ export const createOrUpdateLibraryItem = async (
|
||||
libraryItem: CreateOrUpdateLibraryItemArgs,
|
||||
userId: string,
|
||||
pubsub = createPubSubClient(),
|
||||
skipPubSub = false,
|
||||
originalContentUploaded = false
|
||||
skipPubSub = false
|
||||
): Promise<LibraryItem> => {
|
||||
let originalContent: string | null = null
|
||||
if (libraryItem.originalContent) {
|
||||
originalContent = libraryItem.originalContent
|
||||
|
||||
// remove original content from the item
|
||||
delete libraryItem.originalContent
|
||||
}
|
||||
|
||||
const newLibraryItem = await authTrx(
|
||||
async (tx) => {
|
||||
const repo = tx.withRepository(libraryItemRepository)
|
||||
@ -1146,24 +1128,6 @@ export const createOrUpdateLibraryItem = async (
|
||||
const data = deepDelete(newLibraryItem, columnsToDelete)
|
||||
await pubsub.entityCreated<ItemEvent>(EntityType.ITEM, data, userId)
|
||||
|
||||
// upload original content to GCS in a job if it's not already uploaded
|
||||
if (originalContent && !originalContentUploaded) {
|
||||
try {
|
||||
await enqueueUploadOriginalContent(
|
||||
userId,
|
||||
newLibraryItem.id,
|
||||
newLibraryItem.savedAt,
|
||||
originalContent
|
||||
)
|
||||
|
||||
logger.info('Queued to upload original content in GCS', {
|
||||
id: newLibraryItem.id,
|
||||
})
|
||||
} catch (error) {
|
||||
logError(error)
|
||||
}
|
||||
}
|
||||
|
||||
return newLibraryItem
|
||||
}
|
||||
|
||||
@ -1747,41 +1711,3 @@ export const filterItemEvents = (
|
||||
|
||||
throw new Error('Unexpected state.')
|
||||
}
|
||||
|
||||
/**
 * Enqueues a job that uploads a library item's original content.
 *
 * The destination path is built by `contentFilePath` from the user id, item
 * id and save time using the 'original' format; the content itself travels in
 * the job payload, so the job does not need to read it back from elsewhere.
 *
 * @param userId - id of the user the item belongs to
 * @param libraryItemId - id of the library item
 * @param savedAt - save time of the item, used when building the file path
 * @param originalContent - the original content to upload
 */
export const enqueueUploadOriginalContent = async (
  userId: string,
  libraryItemId: string,
  savedAt: Date,
  originalContent: string
) => {
  const filePath = contentFilePath({
    userId,
    libraryItemId,
    savedAt,
    format: 'original',
  })

  // the bulk API is used with a single-element batch
  await enqueueBulkUploadContentJob([
    {
      userId,
      libraryItemId,
      filePath,
      format: 'original',
      content: originalContent,
    },
  ])
}
|
||||
|
||||
/**
 * Downloads a library item's original content from the bucket.
 *
 * The object path is built by `contentFilePath` from the user id, item id and
 * save time using the 'original' format.
 *
 * @param userId - id of the user the item belongs to
 * @param libraryItemId - id of the library item
 * @param savedAt - save time of the item, used when building the file path
 * @returns whatever `downloadFromBucket` resolves to for that path
 */
export const downloadOriginalContent = async (
  userId: string,
  libraryItemId: string,
  savedAt: Date
) => {
  return downloadFromBucket(
    contentFilePath({
      userId,
      libraryItemId,
      savedAt,
      format: 'original',
    })
  )
}
|
||||
|
||||
@ -47,7 +47,6 @@ export const addRecommendation = async (
|
||||
author: item.author,
|
||||
description: item.description,
|
||||
originalUrl: item.originalUrl,
|
||||
originalContent: item.originalContent,
|
||||
contentReader: item.contentReader,
|
||||
directionality: item.directionality,
|
||||
itemLanguage: item.itemLanguage,
|
||||
|
||||
@ -12,7 +12,7 @@ export const saveContentDisplayReport = async (
|
||||
input: ReportItemInput
|
||||
): Promise<boolean> => {
|
||||
const item = await findLibraryItemById(input.pageId, uid, {
|
||||
select: ['id', 'readableContent', 'originalContent', 'originalUrl'],
|
||||
select: ['id', 'readableContent', 'originalUrl'],
|
||||
})
|
||||
if (!item) {
|
||||
logger.info('unable to submit report, item not found', input)
|
||||
|
||||
@ -89,7 +89,6 @@ export const saveEmail = async (
|
||||
user: { id: input.userId },
|
||||
slug,
|
||||
readableContent: content,
|
||||
originalContent: input.originalContent,
|
||||
description: metadata?.description || parseResult.parsedContent?.excerpt,
|
||||
title: input.title,
|
||||
author: input.author,
|
||||
|
||||
@ -71,7 +71,6 @@ export type SavePageArgs = Merge<
|
||||
feedContent?: string
|
||||
previewImage?: string
|
||||
author?: string
|
||||
originalContentUploaded?: boolean
|
||||
}
|
||||
>
|
||||
|
||||
@ -149,8 +148,7 @@ export const savePage = async (
|
||||
itemToSave,
|
||||
user.id,
|
||||
undefined,
|
||||
isImported,
|
||||
input.originalContentUploaded
|
||||
isImported
|
||||
)
|
||||
clientRequestId = newItem.id
|
||||
|
||||
@ -240,7 +238,6 @@ export const parsedContentToLibraryItem = ({
|
||||
id: itemId || undefined,
|
||||
slug,
|
||||
user: { id: userId },
|
||||
originalContent: originalHtml,
|
||||
readableContent: parsedContent?.content || '',
|
||||
description: parsedContent?.excerpt,
|
||||
previewContent: parsedContent?.excerpt,
|
||||
|
||||
@ -124,7 +124,6 @@ export const createTestLibraryItem = async (
|
||||
item,
|
||||
userId,
|
||||
undefined,
|
||||
true,
|
||||
true
|
||||
)
|
||||
if (labels) {
|
||||
|
||||
@ -430,7 +430,6 @@ describe('Article API', () => {
|
||||
before(async () => {
|
||||
const itemToCreate: CreateOrUpdateLibraryItemArgs = {
|
||||
title: 'test title',
|
||||
originalContent: '<p>test</p>',
|
||||
slug: realSlug,
|
||||
readingProgressTopPercent: 100,
|
||||
user,
|
||||
@ -441,7 +440,6 @@ describe('Article API', () => {
|
||||
itemToCreate,
|
||||
user.id,
|
||||
undefined,
|
||||
true,
|
||||
true
|
||||
)
|
||||
itemId = item.id
|
||||
@ -1309,7 +1307,6 @@ describe('Article API', () => {
|
||||
item,
|
||||
user.id,
|
||||
undefined,
|
||||
true,
|
||||
true
|
||||
)
|
||||
items.push(savedItem)
|
||||
|
||||
@ -71,7 +71,7 @@ const uploadToBucket = async (filePath: string, data: string) => {
|
||||
await storage
|
||||
.bucket(bucketName)
|
||||
.file(filePath)
|
||||
.save(data, { public: false, timeout: 30000 })
|
||||
.save(data, { public: false, timeout: 5000 })
|
||||
}
|
||||
|
||||
const uploadOriginalContent = async (
|
||||
@ -238,6 +238,14 @@ export const contentFetchRequestHandler: RequestHandler = async (req, res) => {
|
||||
})
|
||||
|
||||
try {
|
||||
const domain = new URL(url).hostname
|
||||
const isBlocked = await isDomainBlocked(redisDataSource, domain)
|
||||
if (isBlocked) {
|
||||
console.log('domain is blocked', domain)
|
||||
|
||||
return res.sendStatus(200)
|
||||
}
|
||||
|
||||
const key = cacheKey(url, locale, timezone)
|
||||
let fetchResult = await getCachedFetchResult(redisDataSource, key)
|
||||
if (!fetchResult) {
|
||||
@ -246,14 +254,6 @@ export const contentFetchRequestHandler: RequestHandler = async (req, res) => {
|
||||
url
|
||||
)
|
||||
|
||||
const domain = new URL(url).hostname
|
||||
const isBlocked = await isDomainBlocked(redisDataSource, domain)
|
||||
if (isBlocked) {
|
||||
console.log('domain is blocked', domain)
|
||||
|
||||
return res.sendStatus(200)
|
||||
}
|
||||
|
||||
try {
|
||||
fetchResult = await fetchContent(url, locale, timezone)
|
||||
console.log('content has been fetched')
|
||||
|
||||
49
packages/db/remove_original_content.py
Executable file
49
packages/db/remove_original_content.py
Executable file
@ -0,0 +1,49 @@
|
||||
#!/usr/bin/python3
|
||||
import os
|
||||
|
||||
import psycopg2
|
||||
|
||||
# Postgres connection settings, each overridable through the environment.
# os.getenv returns strings, so the defaults are strings too: every setting
# then has the same type whether or not the variable is set.
PG_HOST = os.getenv('PG_HOST', 'localhost')
PG_PORT = os.getenv('PG_PORT', '5432')
PG_USER = os.getenv('PG_USER', 'app_user')
PG_PASSWORD = os.getenv('PG_PASSWORD', 'app_pass')
PG_DB = os.getenv('PG_DB', 'omnivore')
# Passed to the connection as connect_timeout.
PG_TIMEOUT = os.getenv('PG_TIMEOUT', '10')
|
||||
|
||||
|
||||
def batch_update_library_items(conn, batch_size=100):
    """Set original_content to NULL on omnivore.library_item, in batches.

    Each iteration updates at most ``batch_size`` rows that still have a
    non-NULL original_content and commits immediately, so every transaction
    stays small. The loop ends when an UPDATE touches no rows.

    Args:
        conn: an open DB-API connection (the script passes a psycopg2 one).
        batch_size: maximum number of rows updated per transaction.

    Raises:
        ValueError: if ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    # update original_content to NULL in batches
    with conn.cursor() as cursor:
        while True:
            # Rows are picked by ctid (physical row id) in a sub-select,
            # because UPDATE itself has no LIMIT clause. The limit is bound as
            # a query parameter instead of being formatted into the SQL text.
            cursor.execute(
                """
                UPDATE omnivore.library_item
                SET original_content = NULL
                WHERE ctid IN (
                    SELECT ctid
                    FROM omnivore.library_item
                    WHERE original_content IS NOT NULL
                    LIMIT %s
                )
                """,
                (batch_size,),
            )
            rows_updated = cursor.rowcount
            # Commit per batch so progress is kept if a later batch fails.
            conn.commit()
            if rows_updated == 0:
                break
|
||||
|
||||
|
||||
# postgres connection
# Keyword arguments are used instead of an interpolated DSN string so that
# values containing spaces or quotes (e.g. the password) cannot break parsing.
conn = psycopg2.connect(
    host=PG_HOST,
    port=PG_PORT,
    dbname=PG_DB,
    user=PG_USER,
    password=PG_PASSWORD,
    connect_timeout=PG_TIMEOUT,
)
print('Postgres connection:', conn.info)

try:
    print('Starting migration')
    batch_update_library_items(conn)
    print('Migration complete')
except Exception as err:
    print('Migration error', err)
    # Drop the failed, uncommitted batch; batches committed earlier stay.
    conn.rollback()
    # Exit non-zero so callers (shell scripts, CI) can detect the failure.
    # The finally clause below still runs and closes the connection.
    raise SystemExit(1)
finally:
    print('Closing connections')
    conn.close()
|
||||
Reference in New Issue
Block a user