Digest job

This commit is contained in:
Jackson Harper
2024-04-16 16:55:46 +08:00
committed by Hongbo Wu
parent 47150c2991
commit fbaa9ce663

View File

@ -0,0 +1,167 @@
import { logger } from '../../utils/logger'
import { OpenAI } from '@langchain/openai'
import { PromptTemplate } from '@langchain/core/prompts'
import { LibraryItem } from '../../entity/library_item'
import { CreateDigestJobData } from '../../services/digest'
// Prompt used to turn a list of item titles into a user profile.
// `{titles}` is the template variable filled in when the chain is invoked.
const USER_PROFILE_PROMPT = [
  'Create a user profile based on the supplied titles',
  '',
  'titles:',
  '{titles}',
].join('\n')

// Prompt used to summarize a single article.
// Template variables: `{title}`, `{author}` and `{content}`.
const SUMMARIZE_PROMPT = [
  'Summarize the supplied article.',
  '',
  'title: {title}',
  'author: {author}',
  'content: {content}',
].join('\n')
// TODO: run several DB queries and merge their results.
//
// Placeholder for now: always resolves to an empty list.
//
// The queries are meant to come from the digest definitions; until those are
// wired up, the plan is to hardcode the following:
//   - query:  "in:all is:read OR has:highlights sort:updated-desc wordsCount:>=20"
//     count:  21
//     reason: "recently read or highlighted items that are not part of the digest"
//   - query:  "in:all is:read OR has:highlights sort:saved-asc wordsCount:>=20"
//     count:  4
//     reason: "some older items that were interacted with"
const getPreferencesList = (userId: string): Promise<LibraryItem[]> => {
  const preferenceItems: LibraryItem[] = []
  return Promise.resolve(preferenceItems)
}
// TODO: run several DB queries and merge their results.
//
// Placeholder for now: always resolves to an empty list.
//
// The queries are meant to come from the digest definitions and select the
// candidate items for the digest; until those are wired up, the plan is to
// hardcode the following:
//   - query:  "in:all is:unread saved:last24hrs sort:saved-desc wordsCount:>=500"
//     count:  100
//     reason: "most recent 100 items saved over 500 words"
const getCandidatesList = (userId: string): Promise<LibraryItem[]> => {
  const candidateItems: LibraryItem[] = []
  return Promise.resolve(candidateItems)
}
/**
 * Builds a text representation of a user profile from a list of library
 * items by sending their titles to the LLM with USER_PROFILE_PROMPT.
 *
 * @param preferences library items whose titles describe the user's interests
 * @returns the string produced by invoking the prompt -> LLM chain
 */
const createUserProfile = async (
  preferences: LibraryItem[]
): Promise<string> => {
  const llm = new OpenAI({
    modelName: 'gpt-4-0125-preview',
    configuration: {
      apiKey: process.env.OPENAI_API_KEY,
    },
  })
  const contextualTemplate = PromptTemplate.fromTemplate(USER_PROFILE_PROMPT)
  const chain = contextualTemplate.pipe(llm)
  // The prompt asks for titles, so interpolate `item.title`. Interpolating the
  // item itself stringifies the whole entity, which (unless LibraryItem
  // overrides toString) yields "[object Object]" for every bullet.
  const titles = preferences.map((item) => `* ${item.title}`).join('\n')
  const result = await chain.invoke({ titles })
  return result
}
// TODO: look the profile up in redis first and only build a new one on a
// miss, writing the freshly built profile back to redis.
//
// Until the cache exists, every call rebuilds the profile from the user's
// preference items.
const findOrCreateUserProfile = async (userId: string): Promise<string> => {
  // TODO: check redis for a cached profile and return it when found

  // cache miss path: derive the profile from the preference items
  const preferenceItems = await getPreferencesList(userId)
  const userProfile = await createUserProfile(preferenceItems)

  // TODO: write to redis here
  return userProfile
}
// A library item that has been through the ranking step of the digest job.
type RankedItem = {
// topic label; chooseRankedSelections groups and caps selections by this value
topic: string
// summary text; absent until summarizeItems fills it in
summary?: string
// the underlying library item (its title, author and readableContent are
// read when the summary prompt is built)
libraryItem: LibraryItem
}
// TODO: use OpenAI to rank all the candidate titles against the user profile.
//
// Placeholder for now: ignores both arguments and resolves to an empty list.
const rankCandidates = async (
  candidates: LibraryItem[],
  userProfile: string
): Promise<RankedItem[]> => {
  const ranked: RankedItem[] = []
  return ranked
}
/**
 * Picks the digest items from a list that is already in rank order, adding
 * some basic topic diversity.
 *
 * Candidates are walked in the given order. An item is taken unless its topic
 * already has `maxPerTopic` picks, and the walk stops once `maxSelections`
 * items have been taken. The picks are returned grouped by topic: topics
 * appear in the order of their first pick, and items inside a topic keep
 * their original relative order.
 *
 * @param rankedCandidates candidates in rank order
 * @param maxSelections upper bound on the number of returned items
 * @param maxPerTopic upper bound on returned items sharing one topic
 * @returns the chosen items, grouped by topic
 */
const chooseRankedSelections = (
  rankedCandidates: RankedItem[],
  maxSelections = 5,
  maxPerTopic = 2
): RankedItem[] => {
  const selected: RankedItem[] = []
  // A Map rather than a plain object: topics are free-form strings, and a
  // topic such as "constructor" or "toString" would otherwise collide with an
  // Object.prototype member and never be counted correctly. Map iteration
  // follows insertion order, which gives the topic ranking for free.
  const itemsByTopic = new Map<string, RankedItem[]>()

  for (const item of rankedCandidates) {
    if (selected.length >= maxSelections) {
      break
    }

    const topicItems = itemsByTopic.get(item.topic) ?? []
    if (topicItems.length >= maxPerTopic) {
      continue
    }

    topicItems.push(item)
    itemsByTopic.set(item.topic, topicItems)
    selected.push(item)
  }

  console.log('rankedTopics: ', [...itemsByTopic.keys()])
  console.log('selected: ', selected)

  const finalSelections: RankedItem[] = []
  for (const topicItems of itemsByTopic.values()) {
    finalSelections.push(...topicItems)
  }

  console.log('finalSelections: ', finalSelections)
  return finalSelections
}
/**
 * Adds an LLM-generated summary to every ranked item.
 *
 * A single prompt -> LLM chain is built up front and reused for all items.
 * The summaries are requested concurrently and written to `item.summary` in
 * place.
 *
 * @param rankedCandidates items to summarize; each element is mutated
 * @returns the same array instance that was passed in
 * @throws rejects as soon as any summary request rejects
 */
const summarizeItems = async (
  rankedCandidates: RankedItem[]
): Promise<RankedItem[]> => {
  const llm = new OpenAI({
    modelName: 'gpt-4-0125-preview',
    configuration: {
      apiKey: process.env.OPENAI_API_KEY,
    },
  })
  // The template and chain do not depend on the item, so build them once
  // instead of once per loop iteration.
  const chain = PromptTemplate.fromTemplate(SUMMARIZE_PROMPT).pipe(llm)

  // Each request is independent, so send them all at once rather than
  // waiting for one summary before asking for the next.
  await Promise.all(
    rankedCandidates.map(async (item) => {
      item.summary = await chain.invoke({
        title: item.libraryItem.title,
        author: item.libraryItem.author ?? '',
        content: item.libraryItem.readableContent, // markdown content
      })
    })
  )

  return rankedCandidates
}
// TODO: write the digest to redis here
//
// Placeholder for now: accepts the user id and the final selections but does
// nothing with them.
const writeDigest = (userId: string, selections: RankedItem[]): void => {
  // intentionally empty until redis persistence is implemented
}
/**
 * Entry point of the digest job.
 *
 * Runs the pipeline for the user named in `jobData`: fetch candidates, find
 * or create the user profile, rank the candidates against it, choose the
 * final selections, summarize them and write the digest.
 *
 * Any error raised along the way is logged and swallowed, so the returned
 * promise does not reject for failures inside the pipeline.
 */
export const CreateDigestJob = async (jobData: CreateDigestJobData) => {
  try {
    const { userId } = jobData

    const candidateItems = await getCandidatesList(userId)
    const profile = await findOrCreateUserProfile(userId)
    const ranked = await rankCandidates(candidateItems, profile)
    const chosen = chooseRankedSelections(ranked)
    const summarized = await summarizeItems(chosen)

    // TODO: we should have a QA step here that does some
    // basic checks to make sure the summaries are good.

    writeDigest(userId, summarized)
  } catch (err) {
    console.log('error creating summary: ', err)
  }
}