@ -22,6 +22,7 @@
|
||||
"@google-cloud/storage": "^7.0.1",
|
||||
"@google-cloud/tasks": "^4.0.0",
|
||||
"@graphql-tools/utils": "^9.1.1",
|
||||
"@langchain/anthropic": "^0.1.16",
|
||||
"@langchain/openai": "^0.0.14",
|
||||
"@notionhq/client": "^2.2.14",
|
||||
"@omnivore/content-handler": "1.0.0",
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
import { ChatAnthropic } from '@langchain/anthropic'
|
||||
import { JsonOutputParser } from '@langchain/core/output_parsers'
|
||||
import { PromptTemplate } from '@langchain/core/prompts'
|
||||
import { ChatPromptTemplate, PromptTemplate } from '@langchain/core/prompts'
|
||||
import { OpenAI } from '@langchain/openai'
|
||||
import {
|
||||
htmlToSpeechFile,
|
||||
@ -8,6 +9,7 @@ import {
|
||||
} from '@omnivore/text-to-speech-handler'
|
||||
import axios from 'axios'
|
||||
import showdown from 'showdown'
|
||||
import { v4 as uuid } from 'uuid'
|
||||
import yaml from 'yaml'
|
||||
import { LibraryItem } from '../../entity/library_item'
|
||||
import { TaskState } from '../../generated/graphql'
|
||||
@ -18,14 +20,15 @@ import {
|
||||
searchLibraryItems,
|
||||
} from '../../services/library_item'
|
||||
import { findDeviceTokensByUserId } from '../../services/user_device_tokens'
|
||||
import { wordsCount } from '../../utils/helpers'
|
||||
import { logger } from '../../utils/logger'
|
||||
import { htmlToMarkdown } from '../../utils/parser'
|
||||
import { sendMulticastPushNotifications } from '../../utils/sendNotification'
|
||||
|
||||
export type CreateDigestJobSchedule = 'daily' | 'weekly'
|
||||
|
||||
export interface CreateDigestJobData {
|
||||
id: string
|
||||
export interface CreateDigestData {
|
||||
id?: string
|
||||
userId: string
|
||||
voices?: string[]
|
||||
language?: string
|
||||
@ -113,18 +116,17 @@ const getPreferencesList = async (userId: string): Promise<LibraryItem[]> => {
|
||||
// reason: "some older items that were interacted with"
|
||||
|
||||
const preferences = await Promise.all(
|
||||
digestDefinition.preferenceSelectors.map(async (selector) => {
|
||||
// use the selector to fetch items
|
||||
const results = await searchLibraryItems(
|
||||
{
|
||||
query: selector.query,
|
||||
size: selector.count,
|
||||
},
|
||||
userId
|
||||
)
|
||||
|
||||
return results.libraryItems
|
||||
})
|
||||
digestDefinition.preferenceSelectors.map(
|
||||
async (selector) =>
|
||||
// use the selector to fetch items
|
||||
await searchLibraryItems(
|
||||
{
|
||||
query: selector.query,
|
||||
size: selector.count,
|
||||
},
|
||||
userId
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
// deduplicate and flatten the items
|
||||
@ -137,11 +139,17 @@ const getPreferencesList = async (userId: string): Promise<LibraryItem[]> => {
|
||||
return dedupedPreferences
|
||||
}
|
||||
|
||||
const randomSelectCandidates = (candidates: LibraryItem[]): LibraryItem[] => {
|
||||
// randomly choose at most 25 candidates
|
||||
return candidates.sort(() => 0.5 - Math.random()).slice(0, 25)
|
||||
}
|
||||
|
||||
// Makes multiple DB queries and combines the results
|
||||
const getCandidatesList = async (
|
||||
userId: string,
|
||||
selectedLibraryItemIds?: string[]
|
||||
): Promise<LibraryItem[]> => {
|
||||
console.time('getCandidatesList')
|
||||
// use the queries from the digest definitions to lookup preferences
|
||||
// There should be a list of multiple queries we use. For now we can
|
||||
// hardcode these queries:
|
||||
@ -153,28 +161,28 @@ const getCandidatesList = async (
|
||||
return findLibraryItemsByIds(selectedLibraryItemIds, userId)
|
||||
}
|
||||
|
||||
// get the existing candidate ids from cache
|
||||
const key = `digest:${userId}:existingCandidateIds`
|
||||
const existingCandidateIds = await redisDataSource.redisClient?.get(key)
|
||||
// // get the existing candidate ids from cache
|
||||
// const key = `digest:${userId}:existingCandidateIds`
|
||||
// const existingCandidateIds = await redisDataSource.redisClient?.get(key)
|
||||
|
||||
logger.info('existingCandidateIds: ', { existingCandidateIds })
|
||||
// logger.info('existingCandidateIds: ', { existingCandidateIds })
|
||||
|
||||
const candidates = await Promise.all(
|
||||
digestDefinition.candidateSelectors.map(async (selector) => {
|
||||
// use the selector to fetch items
|
||||
const results = await searchLibraryItems(
|
||||
{
|
||||
includeContent: true,
|
||||
query: existingCandidateIds
|
||||
? `(${selector.query}) -includes:${existingCandidateIds}` // exclude the existing candidates
|
||||
: selector.query,
|
||||
size: selector.count,
|
||||
},
|
||||
userId
|
||||
)
|
||||
|
||||
return results.libraryItems
|
||||
})
|
||||
digestDefinition.candidateSelectors.map(
|
||||
async (selector) =>
|
||||
// use the selector to fetch items
|
||||
await searchLibraryItems(
|
||||
{
|
||||
includeContent: true,
|
||||
// query: existingCandidateIds
|
||||
// ? `(${selector.query}) -includes:${existingCandidateIds}` // exclude the existing candidates
|
||||
// : selector.query,
|
||||
query: selector.query,
|
||||
size: selector.count,
|
||||
},
|
||||
userId
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
// deduplicate and flatten the items
|
||||
@ -188,24 +196,38 @@ const getCandidatesList = async (
|
||||
readableContent: htmlToMarkdown(item.readableContent),
|
||||
})) // convert the html content to markdown
|
||||
|
||||
logger.info(
|
||||
'dedupedCandidates: ',
|
||||
dedupedCandidates.map((item) => item.title)
|
||||
)
|
||||
|
||||
console.timeEnd('getCandidatesList')
|
||||
|
||||
if (dedupedCandidates.length === 0) {
|
||||
logger.info('No new candidates found')
|
||||
|
||||
if (existingCandidateIds) {
|
||||
// reuse the existing candidates
|
||||
const existingIds = existingCandidateIds.split(',')
|
||||
return findLibraryItemsByIds(existingIds, userId)
|
||||
}
|
||||
// if (existingCandidateIds) {
|
||||
// // reuse the existing candidates
|
||||
// const existingIds = existingCandidateIds.split(',')
|
||||
// return findLibraryItemsByIds(existingIds, userId)
|
||||
// }
|
||||
|
||||
// return empty array if no existing candidates
|
||||
return []
|
||||
}
|
||||
|
||||
// store the ids in cache
|
||||
const candidateIds = dedupedCandidates.map((item) => item.id).join(',')
|
||||
await redisDataSource.redisClient?.set(key, candidateIds)
|
||||
const selectedCandidates = randomSelectCandidates(dedupedCandidates)
|
||||
|
||||
return dedupedCandidates
|
||||
logger.info(
|
||||
'selectedCandidates: ',
|
||||
selectedCandidates.map((item) => item.title)
|
||||
)
|
||||
|
||||
// // store the ids in cache
|
||||
// const candidateIds = selectedCandidates.map((item) => item.id).join(',')
|
||||
// await redisDataSource.redisClient?.set(key, candidateIds)
|
||||
|
||||
return selectedCandidates
|
||||
}
|
||||
|
||||
// Takes a list of library items, and uses a prompt to generate
|
||||
@ -220,7 +242,7 @@ const createUserProfile = async (
|
||||
},
|
||||
})
|
||||
|
||||
const contextualTemplate = PromptTemplate.fromTemplate(
|
||||
const contextualTemplate = ChatPromptTemplate.fromTemplate(
|
||||
digestDefinition.zeroShot.userPreferencesProfilePrompt
|
||||
)
|
||||
|
||||
@ -347,31 +369,58 @@ const chooseRankedSelections = (rankedCandidates: RankedItem[]) => {
|
||||
const summarizeItems = async (
|
||||
rankedCandidates: RankedItem[]
|
||||
): Promise<RankedItem[]> => {
|
||||
const llm = new OpenAI({
|
||||
modelName: 'gpt-4-0125-preview',
|
||||
configuration: {
|
||||
apiKey: process.env.OPENAI_API_KEY,
|
||||
},
|
||||
console.time('summarizeItems')
|
||||
// const llm = new OpenAI({
|
||||
// modelName: 'gpt-4-0125-preview',
|
||||
// configuration: {
|
||||
// apiKey: process.env.OPENAI_API_KEY,
|
||||
// },
|
||||
// })
|
||||
|
||||
const llm = new ChatAnthropic({
|
||||
apiKey: process.env.CLAUDE_API_KEY,
|
||||
model: 'claude-3-sonnet-20240229',
|
||||
})
|
||||
|
||||
const contextualTemplate = PromptTemplate.fromTemplate(
|
||||
const contextualTemplate = ChatPromptTemplate.fromTemplate(
|
||||
digestDefinition.summaryPrompt
|
||||
)
|
||||
const chain = contextualTemplate.pipe(llm)
|
||||
|
||||
// send all the ranked candidates to openAI at once in a batch
|
||||
const summaries = await chain.batch(
|
||||
rankedCandidates.map((item) => ({
|
||||
title: item.libraryItem.title,
|
||||
author: item.libraryItem.author ?? '',
|
||||
content: item.libraryItem.readableContent, // markdown content
|
||||
}))
|
||||
// // send all the ranked candidates to openAI at once in a batch
|
||||
// const summaries = await chain.batch(
|
||||
// rankedCandidates.map((item) => ({
|
||||
// title: item.libraryItem.title,
|
||||
// author: item.libraryItem.author ?? '',
|
||||
// content: item.libraryItem.readableContent, // markdown content
|
||||
// }))
|
||||
// )
|
||||
|
||||
const prompts = await Promise.all(
|
||||
rankedCandidates.map(async (item) => {
|
||||
try {
|
||||
return await contextualTemplate.format({
|
||||
title: item.libraryItem.title,
|
||||
author: item.libraryItem.author ?? '',
|
||||
content: item.libraryItem.readableContent, // markdown content
|
||||
})
|
||||
} catch (error) {
|
||||
logger.error('summarizeItems error', error)
|
||||
return ''
|
||||
}
|
||||
})
|
||||
)
|
||||
logger.info('prompts: ', prompts)
|
||||
|
||||
const summaries = await llm.batch(prompts)
|
||||
logger.info('summaries: ', summaries)
|
||||
|
||||
summaries.forEach(
|
||||
(summary, index) => (rankedCandidates[index].summary = summary)
|
||||
(summary, index) =>
|
||||
(rankedCandidates[index].summary = summary.content.toString())
|
||||
)
|
||||
|
||||
console.timeEnd('summarizeItems')
|
||||
|
||||
return rankedCandidates
|
||||
}
|
||||
|
||||
@ -380,6 +429,7 @@ const generateSpeechFiles = (
|
||||
rankedItems: RankedItem[],
|
||||
options: SSMLOptions
|
||||
): SpeechFile[] => {
|
||||
console.time('generateSpeechFiles')
|
||||
// convert the summaries from markdown to HTML
|
||||
const converter = new showdown.Converter({
|
||||
backslashEscapesHTMLTags: true,
|
||||
@ -398,6 +448,8 @@ const generateSpeechFiles = (
|
||||
})
|
||||
})
|
||||
|
||||
console.timeEnd('generateSpeechFiles')
|
||||
|
||||
return speechFiles
|
||||
}
|
||||
|
||||
@ -405,7 +457,10 @@ const generateSpeechFiles = (
|
||||
// basic checks to make sure the summaries are good.
|
||||
const filterSummaries = (summaries: RankedItem[]): RankedItem[] => {
|
||||
return summaries.filter(
|
||||
(item) => item.summary.length < item.libraryItem.readableContent.length
|
||||
(item) =>
|
||||
wordsCount(item.summary) > 100 &&
|
||||
wordsCount(item.summary) < 1000 &&
|
||||
item.summary.length < item.libraryItem.readableContent.length
|
||||
)
|
||||
}
|
||||
|
||||
@ -434,7 +489,11 @@ const generateByline = (summaries: RankedItem[]): string =>
|
||||
.map((item) => item.libraryItem.author)
|
||||
.join(', ')
|
||||
|
||||
export const createDigestJob = async (jobData: CreateDigestJobData) => {
|
||||
export const createDigest = async (jobData: CreateDigestData) => {
|
||||
console.time('createDigestJob')
|
||||
|
||||
// generate a unique id for the digest if not provided for scheduled jobs
|
||||
const digestId = jobData.id ?? uuid()
|
||||
try {
|
||||
digestDefinition = await fetchDigestDefinition()
|
||||
|
||||
@ -445,18 +504,23 @@ export const createDigestJob = async (jobData: CreateDigestJobData) => {
|
||||
if (candidates.length === 0) {
|
||||
logger.info('No candidates found')
|
||||
return writeDigest(jobData.userId, {
|
||||
id: jobData.id,
|
||||
id: digestId,
|
||||
jobState: TaskState.Succeeded,
|
||||
title: 'No articles found',
|
||||
})
|
||||
}
|
||||
|
||||
const userProfile = await findOrCreateUserProfile(jobData.userId)
|
||||
const rankedCandidates = await rankCandidates(candidates, userProfile)
|
||||
const { finalSelections, rankedTopics } =
|
||||
chooseRankedSelections(rankedCandidates)
|
||||
// const userProfile = await findOrCreateUserProfile(jobData.userId)
|
||||
// const rankedCandidates = await rankCandidates(candidates, userProfile)
|
||||
// const { finalSelections, rankedTopics } =
|
||||
// chooseRankedSelections(rankedCandidates)
|
||||
|
||||
const summaries = await summarizeItems(finalSelections)
|
||||
const selections = candidates.map((item) => ({
|
||||
topic: '',
|
||||
libraryItem: item,
|
||||
summary: '',
|
||||
}))
|
||||
const summaries = await summarizeItems(selections)
|
||||
|
||||
const filteredSummaries = filterSummaries(summaries)
|
||||
|
||||
@ -467,7 +531,7 @@ export const createDigestJob = async (jobData: CreateDigestJobData) => {
|
||||
})
|
||||
const title = generateTitle(summaries)
|
||||
const digest: Digest = {
|
||||
id: jobData.id,
|
||||
id: digestId,
|
||||
title,
|
||||
content: generateContent(summaries),
|
||||
jobState: TaskState.Succeeded,
|
||||
@ -480,7 +544,8 @@ export const createDigestJob = async (jobData: CreateDigestJobData) => {
|
||||
wordCount: speechFiles[index].wordCount,
|
||||
})),
|
||||
createdAt: new Date(),
|
||||
description: generateDescription(summaries, rankedTopics),
|
||||
description: '',
|
||||
// description: generateDescription(summaries, rankedTopics),
|
||||
byline: generateByline(summaries),
|
||||
urlsToAudio: [],
|
||||
}
|
||||
@ -490,7 +555,7 @@ export const createDigestJob = async (jobData: CreateDigestJobData) => {
|
||||
logger.error('createDigestJob error', error)
|
||||
|
||||
await writeDigest(jobData.userId, {
|
||||
id: jobData.id,
|
||||
id: digestId,
|
||||
jobState: TaskState.Failed,
|
||||
})
|
||||
} finally {
|
||||
@ -507,5 +572,7 @@ export const createDigestJob = async (jobData: CreateDigestJobData) => {
|
||||
|
||||
await sendMulticastPushNotifications(jobData.userId, message, 'reminder')
|
||||
}
|
||||
|
||||
console.timeEnd('createDigestJob')
|
||||
}
|
||||
}
|
||||
|
||||
@ -243,14 +243,13 @@ const triggerActions = async (
|
||||
} catch (error) {
|
||||
if (error instanceof RequiresSearchQueryError) {
|
||||
logger.info('Failed to filter items by metadata, running search query')
|
||||
const searchResult = await searchLibraryItems(
|
||||
results = await searchLibraryItems(
|
||||
{
|
||||
query: `includes:${data.id} AND (${rule.filter})`,
|
||||
size: 1,
|
||||
},
|
||||
userId
|
||||
)
|
||||
results = searchResult.libraryItems
|
||||
} else {
|
||||
logger.error('Error filtering item events', error)
|
||||
await markRuleAsFailed(rule.id, userId)
|
||||
|
||||
@ -16,7 +16,7 @@ import { appDataSource } from './data_source'
|
||||
import { env } from './env'
|
||||
import { TaskState } from './generated/graphql'
|
||||
import { aiSummarize, AI_SUMMARIZE_JOB_NAME } from './jobs/ai-summarize'
|
||||
import { createDigestJob, CREATE_DIGEST_JOB } from './jobs/ai/create_digest'
|
||||
import { createDigest, CREATE_DIGEST_JOB } from './jobs/ai/create_digest'
|
||||
import { bulkAction, BULK_ACTION_JOB_NAME } from './jobs/bulk_action'
|
||||
import { callWebhook, CALL_WEBHOOK_JOB_NAME } from './jobs/call_webhook'
|
||||
import {
|
||||
@ -181,7 +181,7 @@ export const createWorker = (connection: ConnectionOptions) =>
|
||||
case FORWARD_EMAIL_JOB:
|
||||
return forwardEmailJob(job.data)
|
||||
case CREATE_DIGEST_JOB:
|
||||
return createDigestJob(job.data)
|
||||
return createDigest(job.data)
|
||||
default:
|
||||
logger.warning(`[queue-processor] unhandled job: ${job.name}`)
|
||||
}
|
||||
|
||||
@ -74,7 +74,7 @@ import {
|
||||
countLibraryItems,
|
||||
createOrUpdateLibraryItem,
|
||||
findLibraryItemsByPrefix,
|
||||
searchLibraryItems,
|
||||
searchAndCountLibraryItems,
|
||||
softDeleteLibraryItem,
|
||||
sortParamsToSort,
|
||||
updateLibraryItem,
|
||||
@ -675,7 +675,7 @@ export const searchResolver = authorized<
|
||||
return { errorCodes: [SearchErrorCode.QueryTooLong] }
|
||||
}
|
||||
|
||||
const { libraryItems, count } = await searchLibraryItems(
|
||||
const { libraryItems, count } = await searchAndCountLibraryItems(
|
||||
{
|
||||
from: Number(startCursor),
|
||||
size: first + 1, // fetch one more item to get next cursor
|
||||
@ -752,7 +752,7 @@ export const updatesSinceResolver = authorized<
|
||||
folder ? ' in:' + folder : ''
|
||||
} sort:${sort.by}-${sort.order}`
|
||||
|
||||
const { libraryItems, count } = await searchLibraryItems(
|
||||
const { libraryItems, count } = await searchAndCountLibraryItems(
|
||||
{
|
||||
from: Number(startCursor),
|
||||
size: size + 1, // fetch one more item to get next cursor
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
import cors from 'cors'
|
||||
import express from 'express'
|
||||
import { v4 as uuid } from 'uuid'
|
||||
import { env } from '../env'
|
||||
import { TaskState } from '../generated/graphql'
|
||||
import { CreateDigestJobSchedule } from '../jobs/ai/create_digest'
|
||||
@ -66,12 +65,6 @@ export function digestRouter() {
|
||||
}
|
||||
|
||||
try {
|
||||
const user = await findActiveUser(userId)
|
||||
if (!user) {
|
||||
logger.info(`User not found: ${userId}`)
|
||||
return res.sendStatus(401)
|
||||
}
|
||||
|
||||
const feature = await findGrantedFeatureByName(
|
||||
FeatureName.AIDigest,
|
||||
userId
|
||||
@ -95,9 +88,11 @@ export function digestRouter() {
|
||||
// enqueue job and return job id
|
||||
const result = await enqueueCreateDigest(
|
||||
{
|
||||
id: uuid(), // generate job id
|
||||
userId,
|
||||
...data,
|
||||
voices: data.voices,
|
||||
language: data.language,
|
||||
rate: data.rate,
|
||||
libraryItemIds: data.libraryItemIds,
|
||||
},
|
||||
data.schedule
|
||||
)
|
||||
@ -131,12 +126,6 @@ export function digestRouter() {
|
||||
}
|
||||
|
||||
try {
|
||||
const user = await findActiveUser(userId)
|
||||
if (!user) {
|
||||
logger.info(`User not found: ${userId}`)
|
||||
return res.sendStatus(401)
|
||||
}
|
||||
|
||||
const feature = await findGrantedFeatureByName(
|
||||
FeatureName.AIDigest,
|
||||
userId
|
||||
|
||||
@ -6,10 +6,10 @@ import {
|
||||
EntityManager,
|
||||
FindOptionsWhere,
|
||||
ObjectLiteral,
|
||||
SelectQueryBuilder,
|
||||
} from 'typeorm'
|
||||
import { QueryDeepPartialEntity } from 'typeorm/query-builder/QueryPartialEntity'
|
||||
import { ReadingProgressDataSource } from '../datasources/reading_progress_data_source'
|
||||
import { appDataSource } from '../data_source'
|
||||
import { EntityLabel } from '../entity/entity_label'
|
||||
import { Highlight } from '../entity/highlight'
|
||||
import { Label } from '../entity/label'
|
||||
@ -18,12 +18,7 @@ import { env } from '../env'
|
||||
import { BulkActionType, InputMaybe, SortParams } from '../generated/graphql'
|
||||
import { createPubSubClient, EntityEvent, EntityType } from '../pubsub'
|
||||
import { redisDataSource } from '../redis_data_source'
|
||||
import {
|
||||
authTrx,
|
||||
getColumns,
|
||||
getRepository,
|
||||
queryBuilderToRawSql,
|
||||
} from '../repository'
|
||||
import { authTrx, getColumns, queryBuilderToRawSql } from '../repository'
|
||||
import { libraryItemRepository } from '../repository/library_item'
|
||||
import { Merge, PickTuple } from '../util'
|
||||
import { deepDelete, setRecentlySavedItemInRedis } from '../utils/helpers'
|
||||
@ -619,11 +614,13 @@ export const buildQueryString = (
|
||||
return serialize(searchQuery)
|
||||
}
|
||||
|
||||
export const buildQuery = (
|
||||
queryBuilder: SelectQueryBuilder<LibraryItem>,
|
||||
export const createSearchQueryBuilder = (
|
||||
args: SearchArgs,
|
||||
userId: string
|
||||
userId: string,
|
||||
em = appDataSource.manager
|
||||
) => {
|
||||
const queryBuilder = em.createQueryBuilder(LibraryItem, 'library_item')
|
||||
|
||||
// select all columns except content
|
||||
const selects: Select[] = getColumns(libraryItemRepository)
|
||||
.filter(
|
||||
@ -688,44 +685,54 @@ export const buildQuery = (
|
||||
orders.forEach((order) => {
|
||||
queryBuilder.addOrderBy(order.by, order.order, order.nulls)
|
||||
})
|
||||
|
||||
return queryBuilder
|
||||
}
|
||||
|
||||
export const countLibraryItems = async (args: SearchArgs, userId: string) => {
|
||||
const queryBuilder =
|
||||
getRepository(LibraryItem).createQueryBuilder('library_item')
|
||||
|
||||
buildQuery(queryBuilder, args, userId)
|
||||
|
||||
return queryBuilder.getCount()
|
||||
return authTrx(
|
||||
async (tx) => createSearchQueryBuilder(args, userId, tx).getCount(),
|
||||
undefined,
|
||||
userId
|
||||
)
|
||||
}
|
||||
|
||||
export const searchLibraryItems = async (
|
||||
args: SearchArgs,
|
||||
userId: string
|
||||
): Promise<{ libraryItems: LibraryItem[]; count: number }> => {
|
||||
): Promise<LibraryItem[]> => {
|
||||
const { from = 0, size = 10 } = args
|
||||
|
||||
if (size === 0) {
|
||||
// return only count if size is 0 because limit 0 is not allowed in typeorm
|
||||
return []
|
||||
}
|
||||
|
||||
return authTrx(
|
||||
async (tx) => {
|
||||
const queryBuilder = tx.createQueryBuilder(LibraryItem, 'library_item')
|
||||
buildQuery(queryBuilder, args, userId)
|
||||
|
||||
const count = await queryBuilder.getCount()
|
||||
if (size === 0) {
|
||||
// return only count if size is 0 because limit 0 is not allowed in typeorm
|
||||
return { libraryItems: [], count }
|
||||
}
|
||||
|
||||
// add pagination
|
||||
const libraryItems = await queryBuilder.skip(from).take(size).getMany()
|
||||
|
||||
return { libraryItems, count }
|
||||
},
|
||||
async (tx) =>
|
||||
createSearchQueryBuilder(args, userId, tx)
|
||||
.skip(from)
|
||||
.take(size)
|
||||
.getMany(),
|
||||
undefined,
|
||||
userId
|
||||
)
|
||||
}
|
||||
|
||||
export const searchAndCountLibraryItems = async (
|
||||
args: SearchArgs,
|
||||
userId: string
|
||||
): Promise<{ libraryItems: LibraryItem[]; count: number }> => {
|
||||
const count = await countLibraryItems(args, userId)
|
||||
if (count === 0) {
|
||||
return { libraryItems: [], count }
|
||||
}
|
||||
|
||||
const libraryItems = await searchLibraryItems(args, userId)
|
||||
|
||||
return { libraryItems, count }
|
||||
}
|
||||
|
||||
export const findRecentLibraryItems = async (
|
||||
userId: string,
|
||||
limit = 1000,
|
||||
|
||||
@ -18,7 +18,7 @@ import {
|
||||
} from '../generated/graphql'
|
||||
import { AISummarizeJobData, AI_SUMMARIZE_JOB_NAME } from '../jobs/ai-summarize'
|
||||
import {
|
||||
CreateDigestJobData,
|
||||
CreateDigestData,
|
||||
CreateDigestJobResponse,
|
||||
CreateDigestJobSchedule,
|
||||
CREATE_DIGEST_JOB,
|
||||
@ -856,7 +856,7 @@ export const enqueueSendEmail = async (jobData: SendEmailJobData) => {
|
||||
}
|
||||
|
||||
export const enqueueCreateDigest = async (
|
||||
data: CreateDigestJobData,
|
||||
data: CreateDigestData,
|
||||
schedule?: CreateDigestJobSchedule
|
||||
): Promise<CreateDigestJobResponse> => {
|
||||
const queue = await getBackendQueue()
|
||||
@ -864,6 +864,8 @@ export const enqueueCreateDigest = async (
|
||||
throw new Error('No queue found')
|
||||
}
|
||||
|
||||
// generate unique id for the digest
|
||||
data.id = uuid()
|
||||
// enqueue create digest job immediately
|
||||
const jobId = `${CREATE_DIGEST_JOB}_${data.userId}`
|
||||
const job = await queue.add(CREATE_DIGEST_JOB, data, {
|
||||
@ -890,9 +892,9 @@ export const enqueueCreateDigest = async (
|
||||
await writeDigest(data.userId, digest)
|
||||
|
||||
if (schedule) {
|
||||
// remove existing repeated job if any
|
||||
await Promise.all(
|
||||
Object.keys(CRON_PATTERNS).map(async (key) => {
|
||||
// remove existing repeated job if any
|
||||
const isDeleted = await queue.removeRepeatable(
|
||||
CREATE_DIGEST_JOB,
|
||||
{
|
||||
@ -909,6 +911,9 @@ export const enqueueCreateDigest = async (
|
||||
)
|
||||
|
||||
// schedule repeated job
|
||||
// delete the digest id to avoid duplication
|
||||
delete data.id
|
||||
|
||||
const job = await queue.add(CREATE_DIGEST_JOB, data, {
|
||||
attempts: 1,
|
||||
priority: getJobPriority(CREATE_DIGEST_JOB),
|
||||
|
||||
@ -6,7 +6,7 @@ import { userRepository } from '../../src/repository/user'
|
||||
import { isValidSignupRequest } from '../../src/routers/auth/auth_router'
|
||||
import { AuthProvider } from '../../src/routers/auth/auth_types'
|
||||
import { createPendingUserToken } from '../../src/routers/auth/jwt_helpers'
|
||||
import { searchLibraryItems } from '../../src/services/library_item'
|
||||
import { searchAndCountLibraryItems } from '../../src/services/library_item'
|
||||
import { deleteUser, updateUser } from '../../src/services/user'
|
||||
import {
|
||||
comparePassword,
|
||||
@ -528,7 +528,7 @@ describe('auth router', () => {
|
||||
'web'
|
||||
).expect(200)
|
||||
const user = await userRepository.findOneByOrFail({ name })
|
||||
const { count } = await searchLibraryItems(
|
||||
const { count } = await searchAndCountLibraryItems(
|
||||
{ query: 'in:inbox sort:read-desc is:reading' },
|
||||
user.id
|
||||
)
|
||||
@ -552,7 +552,10 @@ describe('auth router', () => {
|
||||
'ios'
|
||||
).expect(200)
|
||||
const user = await userRepository.findOneByOrFail({ name })
|
||||
const { count } = await searchLibraryItems({ query: 'in:all' }, user.id)
|
||||
const { count } = await searchAndCountLibraryItems(
|
||||
{ query: 'in:all' },
|
||||
user.id
|
||||
)
|
||||
|
||||
expect(count).to.eql(4)
|
||||
})
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
{
|
||||
"extension": ["ts"],
|
||||
"spec": "test/**/*.test.ts"
|
||||
"spec": "test/**/*.test.ts",
|
||||
"timeout": 10000
|
||||
}
|
||||
|
||||
@ -5,6 +5,7 @@ import {
|
||||
SentenceTokenizerNew,
|
||||
WordPunctTokenizer,
|
||||
} from 'natural'
|
||||
import { isElement } from 'underscore'
|
||||
|
||||
// this code needs to be kept in sync with the
|
||||
// frontend code in: useReadingProgressAnchor
|
||||
@ -49,6 +50,7 @@ const DEFAULT_LANGUAGE = 'en-US'
|
||||
const DEFAULT_VOICE = 'en-US-JennyNeural'
|
||||
const DEFAULT_SECONDARY_VOICE = 'en-US-GuyNeural'
|
||||
const DEFAULT_RATE = '1.1'
|
||||
const ELEMENT_INDEX_ATTRIBUTE = 'data-omnivore-anchor-idx'
|
||||
|
||||
const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [
|
||||
'omnivore-highlight-id',
|
||||
@ -74,6 +76,19 @@ const TOP_LEVEL_TAGS = [
|
||||
'LI',
|
||||
]
|
||||
|
||||
const SKIP_TAGS = [
|
||||
'SCRIPT',
|
||||
'STYLE',
|
||||
'IMG',
|
||||
'FIGURE',
|
||||
'FIGCAPTION',
|
||||
'IFRAME',
|
||||
'CODE',
|
||||
]
|
||||
|
||||
const getAnchorIndex = (element: Element): number =>
|
||||
Number(element.getAttribute(ELEMENT_INDEX_ATTRIBUTE))
|
||||
|
||||
function parseDomTree(pageNode: Element) {
|
||||
if (!pageNode || pageNode.childNodes.length == 0) {
|
||||
console.log('no child nodes found')
|
||||
@ -108,7 +123,7 @@ function parseDomTree(pageNode: Element) {
|
||||
visitedNodeList.forEach((node, index) => {
|
||||
// We start at index 3, because the frontend starts two nodes above us
|
||||
// on the #readability-page-1 element that wraps the entire content.
|
||||
node.setAttribute('data-omnivore-anchor-idx', (index + 3).toString())
|
||||
node.setAttribute(ELEMENT_INDEX_ATTRIBUTE, (index + 3).toString())
|
||||
})
|
||||
return visitedNodeList
|
||||
}
|
||||
@ -146,18 +161,8 @@ function emitElement(
|
||||
element: Element,
|
||||
isTopLevel: boolean
|
||||
) {
|
||||
const SKIP_TAGS = [
|
||||
'SCRIPT',
|
||||
'STYLE',
|
||||
'IMG',
|
||||
'FIGURE',
|
||||
'FIGCAPTION',
|
||||
'IFRAME',
|
||||
'CODE',
|
||||
]
|
||||
|
||||
const topLevelTags = ssmlTagsForTopLevelElement()
|
||||
const idx = element.getAttribute('data-omnivore-anchor-idx')
|
||||
const idx = getAnchorIndex(element)
|
||||
let maxVisitedIdx = Number(idx)
|
||||
|
||||
if (isTopLevel) {
|
||||
@ -166,6 +171,11 @@ function emitElement(
|
||||
|
||||
for (const child of Array.from(element.childNodes)) {
|
||||
if (SKIP_TAGS.indexOf(child.nodeName) >= 0) {
|
||||
// Skip unwanted tags and update the index
|
||||
if (isElement(child)) {
|
||||
const childIdx = getAnchorIndex(child)
|
||||
maxVisitedIdx = Math.max(maxVisitedIdx, childIdx)
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
@ -180,8 +190,8 @@ function emitElement(
|
||||
}
|
||||
emitTextNode(textItems, cleanedText, child)
|
||||
}
|
||||
if (child.nodeType == 1 /* Node.ELEMENT_NODE */) {
|
||||
maxVisitedIdx = emitElement(textItems, child as HTMLElement, false)
|
||||
if (isElement(child)) {
|
||||
maxVisitedIdx = emitElement(textItems, child, false)
|
||||
if (child.nodeName === 'LI') {
|
||||
// add a new line after each list item
|
||||
emit(textItems, '\n')
|
||||
@ -265,6 +275,11 @@ export const stripEmojis = (text: string): string => {
|
||||
return text.replace(emojiRegex, '').replace(/\s+/g, ' ')
|
||||
}
|
||||
|
||||
const filterUtterances = (utterances: Utterance[]): Utterance[] => {
|
||||
const punctuationOrSpaceOnly = /^[\s.,;:!?]+$/
|
||||
return utterances.filter((u) => !punctuationOrSpaceOnly.test(u.text))
|
||||
}
|
||||
|
||||
const textToUtterances = ({
|
||||
wordTokenizer,
|
||||
idx,
|
||||
@ -309,7 +324,10 @@ const textToUtterances = ({
|
||||
const sentenceTokenizer = new SentenceTokenizerNew()
|
||||
sentences = sentenceTokenizer.tokenize(text)
|
||||
} catch (err) {
|
||||
console.debug('Unable to tokenize sentences')
|
||||
console.log(
|
||||
'Unable to tokenize sentences, falling back to old tokenizer',
|
||||
text
|
||||
)
|
||||
// fallback to old sentence tokenizer
|
||||
const sentenceTokenizer = new SentenceTokenizer()
|
||||
sentences = sentenceTokenizer.tokenize(text)
|
||||
@ -369,6 +387,18 @@ const replaceSmartQuotes = (text: string): string => {
|
||||
return text.replace(/[\u2018\u2019]/g, "'").replace(/[\u201C\u201D]/g, '"')
|
||||
}
|
||||
|
||||
// get the max idx of the element and its children
|
||||
const getMaxVisitedIdx = (element: Element): number => {
|
||||
let maxVisitedIdx = getAnchorIndex(element)
|
||||
for (const child of Array.from(element.childNodes)) {
|
||||
if (isElement(child)) {
|
||||
maxVisitedIdx = Math.max(maxVisitedIdx, getMaxVisitedIdx(child))
|
||||
}
|
||||
}
|
||||
|
||||
return maxVisitedIdx
|
||||
}
|
||||
|
||||
export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
|
||||
const { title, content, options } = htmlInput
|
||||
console.log('creating speech file with options:', options)
|
||||
@ -421,6 +451,12 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
|
||||
const textItems: string[] = []
|
||||
const node = parsedNodes[i - 3]
|
||||
|
||||
// skip unwanted tags and update the index
|
||||
if (SKIP_TAGS.includes(node.nodeName)) {
|
||||
i = getMaxVisitedIdx(node)
|
||||
continue
|
||||
}
|
||||
|
||||
if (TOP_LEVEL_TAGS.includes(node.nodeName) || hasSignificantText(node)) {
|
||||
// use paragraph as anchor
|
||||
const idx = i.toString()
|
||||
@ -441,10 +477,12 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
|
||||
}
|
||||
}
|
||||
|
||||
const filteredUtterances = filterUtterances(utterances)
|
||||
|
||||
return {
|
||||
wordCount: wordOffset,
|
||||
language,
|
||||
defaultVoice,
|
||||
utterances,
|
||||
utterances: filteredUtterances,
|
||||
}
|
||||
}
|
||||
|
||||
@ -350,6 +350,41 @@ describe('convert HTML to Speech file', () => {
|
||||
options: TEST_OPTIONS,
|
||||
})
|
||||
expect(speechFile.utterances).to.have.lengthOf(2)
|
||||
expect(speechFile.utterances[1].text).to.eql('McMeekin makes an extended case that Stalin was preparing to attack Nazi Germany when Hitler attacked him, that the two dictators were basically in a race to see who could mobilize to betray the other first — and that the initial Soviet debacle in 1941 happened in part because Stalin was also pushing his military toward an offensive alignment, and they were caught in a "mid-mobilization limbo."')
|
||||
expect(speechFile.utterances[1].text).to.eql(
|
||||
'McMeekin makes an extended case that Stalin was preparing to attack Nazi Germany when Hitler attacked him, that the two dictators were basically in a race to see who could mobilize to betray the other first — and that the initial Soviet debacle in 1941 happened in part because Stalin was also pushing his military toward an offensive alignment, and they were caught in a "mid-mobilization limbo."'
|
||||
)
|
||||
})
|
||||
|
||||
it('skip unwanted elements', () => {
|
||||
const html = `<div class="page" id="readability-page-1" data-omnivore-anchor-idx="1">
|
||||
<p data-omnivore-anchor-idx="2">This is a test.</p>
|
||||
<script data-omnivore-anchor-idx="3">alert('hello');</script>
|
||||
<style data-omnivore-anchor-idx="4">body { color: red; }</style>
|
||||
<iframe data-omnivore-anchor-idx="6" src="https://example.com">test</iframe>
|
||||
<figcaption data-omnivore-anchor-idx="7">test</figcaption>
|
||||
</div>`
|
||||
|
||||
const speechFile = htmlToSpeechFile({
|
||||
content: html,
|
||||
options: TEST_OPTIONS,
|
||||
})
|
||||
expect(speechFile.utterances).to.have.lengthOf(1)
|
||||
expect(speechFile.utterances[0].text).to.eql('This is a test.')
|
||||
})
|
||||
|
||||
it('filters out utterances with only punctuation or whitespace', () => {
|
||||
const html = `<div class="page" id="readability-page-1" data-omnivore-anchor-idx="1">
|
||||
<p data-omnivore-anchor-idx="2">This is a test.</p>
|
||||
<p data-omnivore-anchor-idx="3">.</p>
|
||||
<p data-omnivore-anchor-idx="4"> </p>
|
||||
</div>`
|
||||
|
||||
const speechFile = htmlToSpeechFile({
|
||||
content: html,
|
||||
options: TEST_OPTIONS,
|
||||
})
|
||||
|
||||
expect(speechFile.utterances).to.have.lengthOf(1)
|
||||
expect(speechFile.utterances[0].text).to.eql('This is a test.')
|
||||
})
|
||||
})
|
||||
|
||||
@ -6,5 +6,5 @@
|
||||
// Generate d.ts files
|
||||
"declaration": true
|
||||
},
|
||||
"include": ["src"]
|
||||
"include": ["src", "test"]
|
||||
}
|
||||
|
||||
55
yarn.lock
55
yarn.lock
@ -50,6 +50,20 @@
|
||||
lodash "^4.17.21"
|
||||
resize-observer-polyfill "^1.5.1"
|
||||
|
||||
"@anthropic-ai/sdk@^0.20.1":
|
||||
version "0.20.7"
|
||||
resolved "https://registry.yarnpkg.com/@anthropic-ai/sdk/-/sdk-0.20.7.tgz#b19b0e66ba070f928bbf583c06d76e6efdd93d5e"
|
||||
integrity sha512-uyc+3WGLpe8ur6mSIKSab7P9JdBerTdmqb7popc/yROYLLCW/Ykyw4ZfjmN/cLmxjnAKnv5YUngzbPM0BJuGjg==
|
||||
dependencies:
|
||||
"@types/node" "^18.11.18"
|
||||
"@types/node-fetch" "^2.6.4"
|
||||
abort-controller "^3.0.0"
|
||||
agentkeepalive "^4.2.1"
|
||||
form-data-encoder "1.7.2"
|
||||
formdata-node "^4.3.2"
|
||||
node-fetch "^2.6.7"
|
||||
web-streams-polyfill "^3.2.1"
|
||||
|
||||
"@anthropic-ai/sdk@^0.9.1":
|
||||
version "0.9.1"
|
||||
resolved "https://registry.yarnpkg.com/@anthropic-ai/sdk/-/sdk-0.9.1.tgz#b2d2b7bf05c90dce502c9a2e869066870f69ba88"
|
||||
@ -3774,6 +3788,17 @@
|
||||
dependencies:
|
||||
lodash "^4.17.21"
|
||||
|
||||
"@langchain/anthropic@^0.1.16":
|
||||
version "0.1.16"
|
||||
resolved "https://registry.yarnpkg.com/@langchain/anthropic/-/anthropic-0.1.16.tgz#c2a9d3dd4e02df7118dd97cf2503c9bd1a4de5ad"
|
||||
integrity sha512-vCbwkZ3pkMSKf67fBgNlslvuW9f3EZGBbO8Ic2etgX3xFl6L0WuMtfS26P1FCDpRwaKuC1BrCj2aLAeMzMq/Fg==
|
||||
dependencies:
|
||||
"@anthropic-ai/sdk" "^0.20.1"
|
||||
"@langchain/core" "~0.1.56"
|
||||
fast-xml-parser "^4.3.5"
|
||||
zod "^3.22.4"
|
||||
zod-to-json-schema "^3.22.4"
|
||||
|
||||
"@langchain/community@~0.0.33":
|
||||
version "0.0.33"
|
||||
resolved "https://registry.yarnpkg.com/@langchain/community/-/community-0.0.33.tgz#5568fe36b1e2f8947d49414d47e14a27da5b65c9"
|
||||
@ -3803,6 +3828,24 @@
|
||||
zod "^3.22.4"
|
||||
zod-to-json-schema "^3.22.3"
|
||||
|
||||
"@langchain/core@~0.1.56":
|
||||
version "0.1.61"
|
||||
resolved "https://registry.yarnpkg.com/@langchain/core/-/core-0.1.61.tgz#9313363e04f1c6981a938b2909c44ce6fceb2736"
|
||||
integrity sha512-C8OkAly+ugvXsL8TACCmFv9WTTcT4gvQaG6NbrXCOzibBCywfxxcTqEMOyg3zIKpxHEmR0DHqh0OiJRHocnsCg==
|
||||
dependencies:
|
||||
ansi-styles "^5.0.0"
|
||||
camelcase "6"
|
||||
decamelize "1.2.0"
|
||||
js-tiktoken "^1.0.8"
|
||||
langsmith "~0.1.7"
|
||||
ml-distance "^4.0.0"
|
||||
mustache "^4.2.0"
|
||||
p-queue "^6.6.2"
|
||||
p-retry "4"
|
||||
uuid "^9.0.0"
|
||||
zod "^3.22.4"
|
||||
zod-to-json-schema "^3.22.3"
|
||||
|
||||
"@langchain/openai@^0.0.14":
|
||||
version "0.0.14"
|
||||
resolved "https://registry.yarnpkg.com/@langchain/openai/-/openai-0.0.14.tgz#27a6ba83f6b754391868b22f3b90cd440038acf0"
|
||||
@ -15694,7 +15737,7 @@ fast-xml-parser@^4.2.2, fast-xml-parser@^4.3.0:
|
||||
dependencies:
|
||||
strnum "^1.0.5"
|
||||
|
||||
fast-xml-parser@^4.3.2:
|
||||
fast-xml-parser@^4.3.2, fast-xml-parser@^4.3.5:
|
||||
version "4.3.6"
|
||||
resolved "https://registry.yarnpkg.com/fast-xml-parser/-/fast-xml-parser-4.3.6.tgz#190f9d99097f0c8f2d3a0e681a10404afca052ff"
|
||||
integrity sha512-M2SovcRxD4+vC493Uc2GZVcZaj66CCJhWurC4viynVSTvrpErCShNcDz1lAho6n9REQKvL/ll4A4/fw6Y9z8nw==
|
||||
@ -22822,6 +22865,11 @@ multimatch@5.0.0:
|
||||
arrify "^2.0.1"
|
||||
minimatch "^3.0.4"
|
||||
|
||||
mustache@^4.2.0:
|
||||
version "4.2.0"
|
||||
resolved "https://registry.yarnpkg.com/mustache/-/mustache-4.2.0.tgz#e5892324d60a12ec9c2a73359edca52972bf6f64"
|
||||
integrity sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ==
|
||||
|
||||
mute-stream@0.0.8, mute-stream@~0.0.4:
|
||||
version "0.0.8"
|
||||
resolved "https://registry.yarnpkg.com/mute-stream/-/mute-stream-0.0.8.tgz#1630c42b2251ff81e2a283de96a5497ea92e5e0d"
|
||||
@ -32217,6 +32265,11 @@ zod-to-json-schema@^3.22.3:
|
||||
resolved "https://registry.yarnpkg.com/zod-to-json-schema/-/zod-to-json-schema-3.22.4.tgz#f8cc691f6043e9084375e85fb1f76ebafe253d70"
|
||||
integrity sha512-2Ed5dJ+n/O3cU383xSY28cuVi0BCQhF8nYqWU5paEpl7fVdqdAmiLdqLyfblbNdfOFwFfi/mqU4O1pwc60iBhQ==
|
||||
|
||||
zod-to-json-schema@^3.22.4:
|
||||
version "3.23.0"
|
||||
resolved "https://registry.yarnpkg.com/zod-to-json-schema/-/zod-to-json-schema-3.23.0.tgz#4fc60e88d3c709eedbfaae3f92f8a7bf786469f2"
|
||||
integrity sha512-az0uJ243PxsRIa2x1WmNE/pnuA05gUq/JB8Lwe1EDCCL/Fz9MgjYQ0fPlyc2Tcv6aF2ZA7WM5TWaRZVEFaAIag==
|
||||
|
||||
zod@^3.22.3, zod@^3.22.4:
|
||||
version "3.22.4"
|
||||
resolved "https://registry.yarnpkg.com/zod/-/zod-3.22.4.tgz#f31c3a9386f61b1f228af56faa9255e845cf3fff"
|
||||
|
||||
Reference in New Issue
Block a user