Merge pull request #3866 from omnivore-app/feature/data-loader

feat: use dataloader for labels and highlights
This commit is contained in:
Hongbo Wu
2024-04-29 15:20:53 +08:00
committed by GitHub
22 changed files with 224 additions and 118 deletions

View File

@ -20,3 +20,15 @@ webview_gen:
yarn workspace @omnivore/appreader build
cp packages/appreader/build/bundle.js apple/OmnivoreKit/Sources/Views/Resources/bundle.js
cp packages/appreader/build/bundle.js android/Omnivore/app/src/main/assets/bundle.js
api:
yarn workspace @omnivore/api dev
web:
yarn workspace @omnivore/web dev
qp:
yarn workspace @omnivore/api dev_qp
content:
yarn workspace @omnivore/content_fetch start

View File

@ -56,7 +56,7 @@
"cookie-parser": "^1.4.5",
"cors": "^2.8.5",
"csv-stringify": "^6.4.0",
"dataloader": "^2.0.0",
"dataloader": "^2.2.2",
"diff-match-patch": "^1.0.5",
"dompurify": "^2.0.17",
"dot-case": "^3.0.4",

View File

@ -15,6 +15,7 @@ import {
import { ApolloServer } from 'apollo-server-express'
import { ExpressContext } from 'apollo-server-express/dist/ApolloServer'
import { ApolloServerPlugin } from 'apollo-server-plugin-base'
import DataLoader from 'dataloader'
import { Express } from 'express'
import * as httpContext from 'express-http-context2'
import type http from 'http'
@ -30,10 +31,14 @@ import { functionResolvers } from './resolvers/function_resolvers'
import { ClaimsToSet, RequestContext, ResolverContext } from './resolvers/types'
import ScalarResolvers from './scalars'
import typeDefs from './schema'
import { batchGetHighlightsFromLibraryItemIds } from './services/highlights'
import { batchGetLabelsFromLibraryItemIds } from './services/labels'
import { batchGetRecommendationsFromLibraryItemIds } from './services/recommendation'
import {
countDailyServiceUsage,
createServiceUsage,
} from './services/service_usage'
import { batchGetUploadFilesByIds } from './services/upload_file'
import { tracer } from './tracing'
import { getClaimsByToken, setAuthInCookie } from './utils/auth'
import { SetClaimsRole } from './utils/dictionary'
@ -100,6 +105,14 @@ const contextFunc: ContextFunction<ExpressContext, ResolverContext> = async ({
dataSources: {
readingProgress: new ReadingProgressDataSource(),
},
dataLoaders: {
labels: new DataLoader(batchGetLabelsFromLibraryItemIds),
highlights: new DataLoader(batchGetHighlightsFromLibraryItemIds),
recommendations: new DataLoader(
batchGetRecommendationsFromLibraryItemIds
),
uploadFiles: new DataLoader(batchGetUploadFilesByIds),
},
}
return ctx

View File

@ -7,7 +7,6 @@ import {
PrimaryGeneratedColumn,
UpdateDateColumn,
} from 'typeorm'
import { User } from './user'
@Entity({ name: 'user_profile' })

View File

@ -23,6 +23,9 @@ export class Recommendation {
@JoinColumn({ name: 'library_item_id' })
libraryItem!: LibraryItem
@Column('uuid')
libraryItemId!: string
@ManyToOne(() => Group, { onDelete: 'CASCADE' })
@JoinColumn({ name: 'group_id' })
group!: Group

View File

@ -2748,6 +2748,7 @@ export type SearchItem = {
directionality?: Maybe<DirectionalityType>;
feedContent?: Maybe<Scalars['String']>;
folder: Scalars['String'];
format?: Maybe<Scalars['String']>;
highlights?: Maybe<Array<Highlight>>;
id: Scalars['ID'];
image?: Maybe<Scalars['String']>;
@ -6617,6 +6618,7 @@ export type SearchItemResolvers<ContextType = ResolverContext, ParentType extend
directionality?: Resolver<Maybe<ResolversTypes['DirectionalityType']>, ParentType, ContextType>;
feedContent?: Resolver<Maybe<ResolversTypes['String']>, ParentType, ContextType>;
folder?: Resolver<ResolversTypes['String'], ParentType, ContextType>;
format?: Resolver<Maybe<ResolversTypes['String']>, ParentType, ContextType>;
highlights?: Resolver<Maybe<Array<ResolversTypes['Highlight']>>, ParentType, ContextType>;
id?: Resolver<ResolversTypes['ID'], ParentType, ContextType>;
image?: Resolver<Maybe<ResolversTypes['String']>, ParentType, ContextType>;

View File

@ -2102,6 +2102,7 @@ type SearchItem {
directionality: DirectionalityType
feedContent: String
folder: String!
format: String
highlights: [Highlight!]
id: ID!
image: String

View File

@ -7,13 +7,11 @@ import {
enqueueProcessYouTubeVideo,
enqueueTriggerRuleJob,
} from './utils/createTask'
import { buildLogger } from './utils/logger'
import { logger } from './utils/logger'
import { isYouTubeVideoURL } from './utils/youtube'
export type EntityEvent = { id: string }
const logger = buildLogger('pubsub')
const client = new PubSub()
export const createPubSubClient = (): PubsubClient => {

View File

@ -16,7 +16,13 @@ export const MAX_RECORDS_LIMIT = 1000
export const userRepository = appDataSource.getRepository(User).extend({
findById(id: string) {
return this.findOneBy({ id, status: StatusType.Active })
return this.createQueryBuilder('user')
.leftJoinAndSelect('user.profile', 'profile')
.where('user.id = :id AND user.status = :status', {
id,
status: StatusType.Active,
})
.getOne()
},
findByEmail(email: string) {

View File

@ -5,7 +5,6 @@
/* eslint-disable @typescript-eslint/no-floating-promises */
import { Readability } from '@omnivore/readability'
import graphqlFields from 'graphql-fields'
import { IsNull } from 'typeorm'
import { LibraryItem, LibraryItemState } from '../../entity/library_item'
import { env } from '../../env'
import {
@ -64,7 +63,6 @@ import { libraryItemRepository } from '../../repository/library_item'
import { userRepository } from '../../repository/user'
import { clearCachedReadingPosition } from '../../services/cached_reading_position'
import { createPageSaveRequest } from '../../services/create_page_save_request'
import { findHighlightsByLibraryItemId } from '../../services/highlights'
import {
addLabelsToLibraryItem,
createAndSaveLabelsInLibraryItem,
@ -104,7 +102,6 @@ import {
userDataToUser,
} from '../../utils/helpers'
import {
contentConverter,
getDistillerResult,
htmlToMarkdown,
ParsedContentPuppeteer,
@ -385,31 +382,33 @@ export const getArticleResolver = authorized<
if (!includeOriginalHtml) {
selectColumns.splice(selectColumns.indexOf('originalContent'), 1)
}
// We allow the backend to use the ID instead of a slug to fetch the article
// query against id if slug is a uuid
const where = slug.match(/^[0-9a-f]{8}-([0-9a-f]{4}-){3}[0-9a-f]{12}$/i)
? { id: slug }
: { slug }
const libraryItem = await authTrx((tx) =>
tx.withRepository(libraryItemRepository).findOne({
select: selectColumns,
where: {
...where,
deletedAt: IsNull(),
},
relations: {
highlights: {
user: true,
labels: true,
},
uploadFile: true,
recommendations: {
recommender: true,
group: true,
},
},
})
)
const libraryItem = await authTrx((tx) => {
const qb = tx
.createQueryBuilder(LibraryItem, 'libraryItem')
.select(selectColumns.map((column) => `libraryItem.${column}`))
.leftJoinAndSelect('libraryItem.labels', 'labels')
.leftJoinAndSelect('libraryItem.highlights', 'highlights')
.leftJoinAndSelect('highlights.labels', 'highlights_labels')
.leftJoinAndSelect('highlights.user', 'highlights_user')
.leftJoinAndSelect('highlights_user.profile', 'highlights_user_profile')
.leftJoinAndSelect('libraryItem.uploadFile', 'uploadFile')
.leftJoinAndSelect('libraryItem.recommendations', 'recommendations')
.leftJoinAndSelect('recommendations.group', 'recommendations_group')
.leftJoinAndSelect(
'recommendations.recommender',
'recommendations_recommender'
)
.where('libraryItem.user_id = :uid', { uid })
// We allow the backend to use the ID instead of a slug to fetch the article
// query against id if slug is a uuid
slug.match(/^[0-9a-f]{8}-([0-9a-f]{4}-){3}[0-9a-f]{12}$/i)
? qb.andWhere('libraryItem.id = :id', { id: slug })
: qb.andWhere('libraryItem.slug = :slug', { slug })
return qb.andWhere('libraryItem.deleted_at IS NULL').getOne()
})
if (!libraryItem) {
return { errorCodes: [ArticleErrorCode.NotFound] }
@ -667,7 +666,7 @@ export const searchResolver = authorized<
SearchSuccess,
SearchError,
QuerySearchArgs
>(async (_obj, params, { log, uid }) => {
>(async (_obj, params, { uid }) => {
const startCursor = params.after || ''
const first = Math.min(params.first || 10, 100) // limit to 100 items
@ -699,38 +698,11 @@ export const searchResolver = authorized<
libraryItems.pop()
}
const edges = await Promise.all(
libraryItems.map(async (libraryItem) => {
libraryItem.highlights = await findHighlightsByLibraryItemId(
libraryItem.id,
uid
)
if (params.includeContent && libraryItem.readableContent) {
// convert html to the requested format
const format = params.format || ArticleFormat.Html
try {
const converter = contentConverter(format)
if (converter) {
libraryItem.readableContent = converter(
libraryItem.readableContent,
libraryItem.highlights
)
}
} catch (error) {
log.error('Error converting content', error)
}
}
return {
node: libraryItemToSearchItem(libraryItem),
cursor: endCursor,
}
})
)
return {
edges,
edges: libraryItems.map((item) => ({
node: libraryItemToSearchItem(item, params.format as ArticleFormat),
cursor: endCursor,
})),
pageInfo: {
hasPreviousPage: false,
startCursor,

View File

@ -4,6 +4,7 @@
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
/* eslint-disable @typescript-eslint/explicit-module-boundary-types */
import { createHmac } from 'crypto'
import { Highlight as HighlightEntity } from '../entity/highlight'
import {
EXISTING_NEWSLETTER_FOLDER,
NewsletterEmail,
@ -24,10 +25,6 @@ import {
} from '../generated/graphql'
import { getAISummary } from '../services/ai-summaries'
import { findUserFeatures } from '../services/features'
import { findHighlightsByLibraryItemId } from '../services/highlights'
import { findLabelsByLibraryItemId } from '../services/labels'
import { findRecommendationsByLibraryItemId } from '../services/recommendation'
import { findUploadFileById } from '../services/upload_file'
import {
highlightDataToHighlight,
isBase64Image,
@ -36,11 +33,16 @@ import {
wordsCount,
} from '../utils/helpers'
import { createImageProxyUrl } from '../utils/imageproxy'
import { contentConverter } from '../utils/parser'
import {
generateDownloadSignedUrl,
generateUploadFilePathName,
} from '../utils/uploads'
import { emptyTrashResolver, fetchContentResolver } from './article'
import {
ArticleFormat,
emptyTrashResolver,
fetchContentResolver,
} from './article'
import {
addDiscoverFeedResolver,
deleteDiscoverArticleResolver,
@ -404,7 +406,9 @@ export const functionResolvers = {
ctx.claims &&
article.uploadFileId
) {
const upload = await findUploadFileById(article.uploadFileId)
const upload = await ctx.dataLoaders.uploadFiles.load(
article.uploadFileId
)
if (!upload || !upload.fileName) {
return undefined
}
@ -439,7 +443,7 @@ export const functionResolvers = {
) {
if (article.labels) return article.labels
return findLabelsByLibraryItemId(article.id, ctx.uid)
return ctx.dataLoaders.labels.load(article.id)
},
...readingProgressHandlers,
},
@ -478,7 +482,7 @@ export const functionResolvers = {
ctx.claims &&
item.uploadFileId
) {
const upload = await findUploadFileById(item.uploadFileId)
const upload = await ctx.dataLoaders.uploadFiles.load(item.uploadFileId)
if (!upload || !upload.fileName) {
return undefined
}
@ -511,28 +515,22 @@ export const functionResolvers = {
) {
if (item.labels) return item.labels
return findLabelsByLibraryItemId(item.id, ctx.uid)
return ctx.dataLoaders.labels.load(item.id)
},
async recommendations(
item: {
id: string
recommendations?: Recommendation[]
recommenderNames?: string[] | null
},
_: unknown,
ctx: WithDataSourcesContext
) {
if (item.recommendations) return item.recommendations
if (item.recommenderNames && item.recommenderNames.length > 0) {
const recommendations = await findRecommendationsByLibraryItemId(
item.id,
ctx.uid
)
return recommendations.map(recommandationDataToRecommendation)
}
return []
const recommendations = await ctx.dataLoaders.recommendations.load(
item.id
)
return recommendations.map(recommandationDataToRecommendation)
},
async aiSummary(item: SearchItem, _: unknown, ctx: WithDataSourcesContext) {
return (
@ -553,10 +551,46 @@ export const functionResolvers = {
) {
if (item.highlights) return item.highlights
const highlights = await findHighlightsByLibraryItemId(item.id, ctx.uid)
const highlights = await ctx.dataLoaders.highlights.load(item.id)
return highlights.map(highlightDataToHighlight)
},
...readingProgressHandlers,
// Resolver for SearchItem.content: converts the stored HTML to the format
// requested via the search query (carried on the item as `format`), falling
// back to the raw content on any failure or when no conversion is needed.
async content(
item: {
id: string
content?: string
highlightAnnotations?: string[]
format?: ArticleFormat
},
_: unknown,
ctx: WithDataSourcesContext
) {
// convert html to the requested format if requested
// (no-op when format is absent, already Html, or there is no content)
if (item.format && item.format !== ArticleFormat.Html && item.content) {
let highlights: HighlightEntity[] = []
// load highlights if needed — only HighlightedMarkdown embeds them,
// and only when the item actually has annotations; uses the per-request
// DataLoader so sibling resolvers share one batched query
if (
item.format === ArticleFormat.HighlightedMarkdown &&
item.highlightAnnotations?.length
) {
highlights = await ctx.dataLoaders.highlights.load(item.id)
}
try {
ctx.log.info(`Converting content to: ${item.format}`)
// convert html to the requested format
const converter = contentConverter(item.format)
if (converter) {
return converter(item.content, highlights)
}
} catch (error) {
// conversion is best-effort: log and fall through to the raw content
ctx.log.error('Error converting content', error)
}
}
return item.content
},
},
Subscription: {
newsletterEmail(subscription: Subscription) {

View File

@ -1,11 +1,16 @@
/* eslint-disable @typescript-eslint/ban-types */
import { Span } from '@opentelemetry/api'
import { Context as ApolloContext } from 'apollo-server-core'
import DataLoader from 'dataloader'
import * as jwt from 'jsonwebtoken'
import { EntityManager } from 'typeorm'
import winston from 'winston'
import { PubsubClient } from '../pubsub'
import { ReadingProgressDataSource } from '../datasources/reading_progress_data_source'
import { Highlight } from '../entity/highlight'
import { Label } from '../entity/label'
import { Recommendation } from '../entity/recommendation'
import { UploadFile } from '../entity/upload_file'
import { PubsubClient } from '../pubsub'
export interface Claims {
uid: string
@ -41,6 +46,12 @@ export interface RequestContext {
dataSources: {
readingProgress: ReadingProgressDataSource
}
dataLoaders: {
labels: DataLoader<string, Label[]>
highlights: DataLoader<string, Highlight[]>
recommendations: DataLoader<string, Recommendation[]>
uploadFiles: DataLoader<string, UploadFile | undefined>
}
}
export type ResolverContext = ApolloContext<RequestContext>

View File

@ -1657,6 +1657,7 @@ const schema = gql`
folder: String!
aiSummary: String
directionality: DirectionalityType
format: String
}
type SearchItemEdge {

View File

@ -1,5 +1,5 @@
import { diff_match_patch } from 'diff-match-patch'
import { DeepPartial } from 'typeorm'
import { DeepPartial, In } from 'typeorm'
import { QueryDeepPartialEntity } from 'typeorm/query-builder/QueryPartialEntity'
import { EntityLabel } from '../entity/entity_label'
import { Highlight } from '../entity/highlight'
@ -20,6 +20,21 @@ export type HighlightEvent = Merge<
EntityEvent
>
/**
 * DataLoader batch function: fetches highlights (with their users) for a set
 * of library item ids in a single query, then regroups them so the output
 * array lines up 1:1 with the input ids, as the DataLoader contract requires.
 */
export const batchGetHighlightsFromLibraryItemIds = async (
  libraryItemIds: readonly string[]
): Promise<Highlight[][]> => {
  const rows = await authTrx(async (tx) =>
    tx.getRepository(Highlight).find({
      where: { libraryItem: { id: In(libraryItemIds as string[]) } },
      relations: ['user'],
    })
  )
  // Bucket rows by owning item id; DB row order within each bucket is kept.
  const byItemId = new Map<string, Highlight[]>()
  for (const row of rows) {
    const bucket = byItemId.get(row.libraryItemId)
    if (bucket) {
      bucket.push(row)
    } else {
      byItemId.set(row.libraryItemId, [row])
    }
  }
  return libraryItemIds.map((id) => byItemId.get(id) ?? [])
}
export const getHighlightLocation = (patch: string): number | undefined => {
const dmp = new diff_match_patch()
const patches = dmp.patch_fromText(patch)

View File

@ -22,20 +22,22 @@ export type LabelEvent = Merge<
EntityEvent
>
/**
 * DataLoader batch function: loads the labels attached to each of the given
 * library item ids in one query against the EntityLabel join table.
 *
 * Returns one Label[] per input id, in the same order as `libraryItemIds`
 * (the DataLoader contract); items with no labels get an empty array.
 */
export const batchGetLabelsFromLibraryItemIds = async (
  libraryItemIds: readonly string[]
): Promise<Label[][]> => {
  const labels = await authTrx(async (tx) =>
    tx.getRepository(EntityLabel).find({
      where: { libraryItemId: In(libraryItemIds as string[]) },
      relations: ['label'],
    })
  )
  // Regroup the flat result set per item id, unwrapping the join rows
  // down to the Label entities callers expect.
  return libraryItemIds.map((libraryItemId) =>
    labels
      .filter((label) => label.libraryItemId === libraryItemId)
      .map((label) => label.label)
  )
}
export const findOrCreateLabels = async (
labels: CreateLabelInput[],

View File

@ -1,5 +1,5 @@
import { nanoid } from 'nanoid'
import { DeepPartial } from 'typeorm'
import { DeepPartial, In } from 'typeorm'
import { LibraryItem } from '../entity/library_item'
import { Recommendation } from '../entity/recommendation'
import { authTrx } from '../repository'
@ -12,6 +12,23 @@ import {
updateLibraryItem,
} from './library_item'
/**
 * DataLoader batch function: fetches recommendations (with their group and
 * recommender relations) for every requested library item in one query and
 * returns them grouped per input id, preserving the input order as the
 * DataLoader contract requires.
 */
export const batchGetRecommendationsFromLibraryItemIds = async (
  libraryItemIds: readonly string[]
): Promise<Recommendation[][]> => {
  const rows = await authTrx(async (tx) =>
    tx.getRepository(Recommendation).find({
      where: { libraryItem: { id: In(libraryItemIds as string[]) } },
      relations: ['group', 'recommender'],
    })
  )
  // Group the flat result set by owning library item id.
  const grouped = new Map<string, Recommendation[]>()
  for (const row of rows) {
    const bucket = grouped.get(row.libraryItemId)
    if (bucket) {
      bucket.push(row)
    } else {
      grouped.set(row.libraryItemId, [row])
    }
  }
  return libraryItemIds.map((id) => grouped.get(id) ?? [])
}
export const addRecommendation = async (
item: LibraryItem,
recommendation: Recommendation,

View File

@ -1,5 +1,7 @@
import normalizeUrl from 'normalize-url'
import path from 'path'
import { In } from 'typeorm'
import { v4 as uuid } from 'uuid'
import { LibraryItemState } from '../entity/library_item'
import { UploadFile } from '../entity/upload_file'
import {
@ -18,7 +20,16 @@ import {
} from '../utils/uploads'
import { validateUrl } from './create_page_save_request'
import { createOrUpdateLibraryItem } from './library_item'
import { v4 as uuid } from 'uuid'
/**
 * DataLoader batch function: resolves UploadFile rows for the given ids in a
 * single query. The result array matches the input ids positionally; ids with
 * no matching row map to `undefined`, per the DataLoader contract.
 */
export const batchGetUploadFilesByIds = async (
  ids: readonly string[]
): Promise<(UploadFile | undefined)[]> => {
  const rows = await getRepository(UploadFile).findBy({
    id: In(ids as string[]),
  })
  // Index by primary key for O(1) positional lookup.
  const byId = new Map(rows.map((row) => [row.id, row]))
  return ids.map((id) => byId.get(id))
}
const isFileUrl = (url: string): boolean => {
const parsedUrl = new URL(url)

View File

@ -36,12 +36,13 @@ export const claimsFromApiKey = async (key: string): Promise<Claims> => {
const apiKeyRepo = getRepository(ApiKey)
const apiKey = await apiKeyRepo.findOne({
where: {
const apiKey = await apiKeyRepo
.createQueryBuilder('apiKey')
.innerJoinAndSelect('apiKey.user', 'user')
.where({
key: hashedKey,
},
relations: ['user'],
})
})
.getOne()
if (!apiKey) {
throw new Error('api key not found')
}

View File

@ -26,6 +26,7 @@ import {
SearchItem,
} from '../generated/graphql'
import { createPubSubClient } from '../pubsub'
import { ArticleFormat } from '../resolvers'
import { validateUrl } from '../services/create_page_save_request'
import { updateLibraryItem } from '../services/library_item'
import { Merge } from '../util'
@ -230,7 +231,10 @@ export const libraryItemToArticle = (item: LibraryItem): Article => ({
directionality: item.directionality as unknown as DirectionalityType,
})
export const libraryItemToSearchItem = (item: LibraryItem): SearchItem => ({
export const libraryItemToSearchItem = (
item: LibraryItem,
format?: ArticleFormat
): SearchItem => ({
...item,
url: item.originalUrl,
state: item.state as unknown as ArticleSavingRequestStatus,
@ -247,6 +251,7 @@ export const libraryItemToSearchItem = (item: LibraryItem): SearchItem => ({
highlights: item.highlights?.map(highlightDataToHighlight),
wordsCount: item.wordCount,
directionality: item.directionality as unknown as DirectionalityType,
format,
})
export const isParsingTimeout = (libraryItem: LibraryItem): boolean => {

View File

@ -31,7 +31,7 @@ export class CustomTypeOrmLogger
constructor(options?: TypeOrmLoggerOptions) {
super(options)
this.logger = buildLogger('typeorm')
this.logger = logger
}
logQuery(query: string, parameters?: any[], queryRunner?: QueryRunner) {
@ -132,8 +132,8 @@ const truncateObjectDeep = (object: any, length: number): any => {
class GcpLoggingTransport extends LoggingWinston {
log(info: any, callback: (err: Error | null, apiResponse?: any) => void) {
// reduce the size of the log entry by truncating any string values to 500 characters
info = truncateObjectDeep(info, 500) as never
// reduce the size of the log entry by truncating any string values to 10000 characters
info = truncateObjectDeep(info, 10000) as never
super.log(info, callback)
}
}

View File

@ -33,7 +33,7 @@ import {
makeHighlightNodeAttributes,
} from './highlightGenerator'
import { createImageProxyUrl } from './imageproxy'
import { buildLogger, LogRecord } from './logger'
import { logger, LogRecord } from './logger'
interface Feed {
title: string
@ -43,7 +43,6 @@ interface Feed {
description?: string
}
const logger = buildLogger('utils.parse')
const signToken = promisify(jwt.sign)
const axiosInstance = axios.create({
@ -647,7 +646,6 @@ export const contentConverter = (
return htmlToMarkdown
case ArticleFormat.HighlightedMarkdown:
return htmlToHighlightedMarkdown
case ArticleFormat.Html:
default:
return undefined
}
@ -671,7 +669,7 @@ export const htmlToHighlightedMarkdown = (
throw new Error('Invalid html content')
}
} catch (err) {
logger.info(err)
logger.error(err)
return nhm.translate(/* html */ html)
}
@ -691,7 +689,7 @@ export const htmlToHighlightedMarkdown = (
articleTextNodes
)
} catch (err) {
logger.info(err)
logger.error(err)
}
})
html = document.documentElement.outerHTML

View File

@ -13348,11 +13348,16 @@ data-urls@^2.0.0:
whatwg-mimetype "^2.3.0"
whatwg-url "^8.0.0"
dataloader@2.1.0, dataloader@^2.0.0:
dataloader@2.1.0:
version "2.1.0"
resolved "https://registry.yarnpkg.com/dataloader/-/dataloader-2.1.0.tgz#c69c538235e85e7ac6c6c444bae8ecabf5de9df7"
integrity sha512-qTcEYLen3r7ojZNgVUaRggOI+KM7jrKxXeSHhogh/TWxYMeONEMqY+hmkobiYQozsGIyg9OYVzO4ZIfoB4I0pQ==
dataloader@^2.2.2:
version "2.2.2"
resolved "https://registry.yarnpkg.com/dataloader/-/dataloader-2.2.2.tgz#216dc509b5abe39d43a9b9d97e6e5e473dfbe3e0"
integrity sha512-8YnDaaf7N3k/q5HnTJVuzSyLETjoZjVmHc4AeKAzOvKHEFQKcn64OKBfzHYtE9zGjctNM7V9I0MfnUVLpi7M5g==
date-fns@2.x:
version "2.29.3"
resolved "https://registry.yarnpkg.com/date-fns/-/date-fns-2.29.3.tgz#27402d2fc67eb442b511b70bbdf98e6411cd68a8"