From d0d088aae35b97117018e5cfbc8dc5ab5edef125 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 29 Apr 2024 12:49:13 +0800 Subject: [PATCH 01/16] fix: remove redundant user query --- packages/api/src/routers/digest_router.ts | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/packages/api/src/routers/digest_router.ts b/packages/api/src/routers/digest_router.ts index 86f0a4636..5f7a38fe8 100644 --- a/packages/api/src/routers/digest_router.ts +++ b/packages/api/src/routers/digest_router.ts @@ -66,12 +66,6 @@ export function digestRouter() { } try { - const user = await findActiveUser(userId) - if (!user) { - logger.info(`User not found: ${userId}`) - return res.sendStatus(401) - } - const feature = await findGrantedFeatureByName( FeatureName.AIDigest, userId @@ -131,12 +125,6 @@ export function digestRouter() { } try { - const user = await findActiveUser(userId) - if (!user) { - logger.info(`User not found: ${userId}`) - return res.sendStatus(401) - } - const feature = await findGrantedFeatureByName( FeatureName.AIDigest, userId From 14e91d338d2a84a190b516d1a1a37f30d9f72d39 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 29 Apr 2024 13:46:09 +0800 Subject: [PATCH 02/16] do not create user profile and topics --- packages/api/src/jobs/ai/create_digest.ts | 22 ++++++++++++++-------- packages/api/src/queue-processor.ts | 4 ++-- packages/api/src/utils/createTask.ts | 4 ++-- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/packages/api/src/jobs/ai/create_digest.ts b/packages/api/src/jobs/ai/create_digest.ts index a3be248a0..25f10dd94 100644 --- a/packages/api/src/jobs/ai/create_digest.ts +++ b/packages/api/src/jobs/ai/create_digest.ts @@ -24,7 +24,7 @@ import { sendMulticastPushNotifications } from '../../utils/sendNotification' export type CreateDigestJobSchedule = 'daily' | 'weekly' -export interface CreateDigestJobData { +export interface CreateDigestData { id: string userId: string voices?: string[] @@ -434,7 +434,7 @@ const generateByline = (summaries: RankedItem[]): string => .map((item) => item.libraryItem.author) .join(', ') -export const createDigestJob = async (jobData: CreateDigestJobData) => { +export const createDigest = async (jobData: CreateDigestData) => { try { digestDefinition = await fetchDigestDefinition() @@ -451,12 +451,17 @@ export const createDigestJob = async (jobData: CreateDigestJobData) => { }) } - const userProfile = await findOrCreateUserProfile(jobData.userId) - const rankedCandidates = await rankCandidates(candidates, userProfile) - const { finalSelections, rankedTopics } = - chooseRankedSelections(rankedCandidates) + // const userProfile = await findOrCreateUserProfile(jobData.userId) + // const rankedCandidates = await rankCandidates(candidates, userProfile) + // const { finalSelections, rankedTopics } = + // chooseRankedSelections(rankedCandidates) - const summaries = await summarizeItems(finalSelections) + const selections = candidates.map((item) => ({ + topic: '', + libraryItem: item, + summary: '', + })) + const summaries = await summarizeItems(selections) const filteredSummaries = filterSummaries(summaries) @@ -480,7 +485,8 @@ export const createDigestJob = async (jobData: CreateDigestJobData) => { wordCount: speechFiles[index].wordCount, })), createdAt: new Date(), - description: generateDescription(summaries, rankedTopics), + description: '', + // description: generateDescription(summaries, rankedTopics), byline: generateByline(summaries), urlsToAudio: [], } diff --git a/packages/api/src/queue-processor.ts b/packages/api/src/queue-processor.ts index 89448b96f..2d0bf5881 100644 --- a/packages/api/src/queue-processor.ts +++ b/packages/api/src/queue-processor.ts @@ -16,7 +16,7 @@ import { appDataSource } from './data_source' import { env } from './env' import { TaskState } from './generated/graphql' import { aiSummarize, AI_SUMMARIZE_JOB_NAME } from './jobs/ai-summarize' -import { createDigestJob, CREATE_DIGEST_JOB } from './jobs/ai/create_digest' +import { createDigest, CREATE_DIGEST_JOB } from './jobs/ai/create_digest' import { bulkAction, BULK_ACTION_JOB_NAME } from './jobs/bulk_action' import { callWebhook, CALL_WEBHOOK_JOB_NAME } from './jobs/call_webhook' import { @@ -181,7 +181,7 @@ export const createWorker = (connection: ConnectionOptions) => case FORWARD_EMAIL_JOB: return forwardEmailJob(job.data) case CREATE_DIGEST_JOB: - return createDigestJob(job.data) + return createDigest(job.data) default: logger.warning(`[queue-processor] unhandled job: ${job.name}`) } diff --git a/packages/api/src/utils/createTask.ts b/packages/api/src/utils/createTask.ts index a8c6e2fb1..2383df98c 100644 --- a/packages/api/src/utils/createTask.ts +++ b/packages/api/src/utils/createTask.ts @@ -18,7 +18,7 @@ import { } from '../generated/graphql' import { AISummarizeJobData, AI_SUMMARIZE_JOB_NAME } from '../jobs/ai-summarize' import { - CreateDigestJobData, + CreateDigestData, CreateDigestJobResponse, CreateDigestJobSchedule, CREATE_DIGEST_JOB, @@ -856,7 +856,7 @@ export const enqueueSendEmail = async (jobData: SendEmailJobData) => { } export const enqueueCreateDigest = async ( - data: CreateDigestJobData, + data: CreateDigestData, schedule?: CreateDigestJobSchedule ): Promise => { const queue = await getBackendQueue() From 32be8126dfa5d1a2295b56ca796589c36bc74e22 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 29 Apr 2024 14:07:19 +0800 Subject: [PATCH 03/16] reduce number of db calls for search library items --- packages/api/src/jobs/ai/create_digest.ts | 52 ++++++++------- packages/api/src/jobs/trigger_rule.ts | 3 +- packages/api/src/resolvers/article/index.ts | 6 +- packages/api/src/services/library_item.ts | 71 +++++++++++---------- packages/api/test/routers/auth.test.ts | 9 ++- 5 files changed, 74 insertions(+), 67 deletions(-) diff --git a/packages/api/src/jobs/ai/create_digest.ts b/packages/api/src/jobs/ai/create_digest.ts index 25f10dd94..2a6e155d4 100644 --- a/packages/api/src/jobs/ai/create_digest.ts +++ b/packages/api/src/jobs/ai/create_digest.ts @@ -113,18 +113,17 @@ const getPreferencesList = async (userId: string): Promise => { // reason: "some older items that were interacted with" const preferences = await Promise.all( - digestDefinition.preferenceSelectors.map(async (selector) => { - // use the selector to fetch items - const results = await searchLibraryItems( - { - query: selector.query, - size: selector.count, - }, - userId - ) - - return results.libraryItems - }) + digestDefinition.preferenceSelectors.map( + async (selector) => + // use the selector to fetch items + await searchLibraryItems( + { + query: selector.query, + size: selector.count, + }, + userId + ) + ) ) // deduplicate and flatten the items @@ -160,21 +159,20 @@ const getCandidatesList = async ( logger.info('existingCandidateIds: ', { existingCandidateIds }) const candidates = await Promise.all( - digestDefinition.candidateSelectors.map(async (selector) => { - // use the selector to fetch items - const results = await searchLibraryItems( - { - includeContent: true, - query: existingCandidateIds - ? `(${selector.query}) -includes:${existingCandidateIds}` // exclude the existing candidates - : selector.query, - size: selector.count, - }, - userId - ) - - return results.libraryItems - }) + digestDefinition.candidateSelectors.map( + async (selector) => + // use the selector to fetch items + await searchLibraryItems( + { + includeContent: true, + query: existingCandidateIds + ? `(${selector.query}) -includes:${existingCandidateIds}` // exclude the existing candidates + : selector.query, + size: selector.count, + }, + userId + ) + ) ) // deduplicate and flatten the items diff --git a/packages/api/src/jobs/trigger_rule.ts b/packages/api/src/jobs/trigger_rule.ts index dcc1c902f..27b3a6a64 100644 --- a/packages/api/src/jobs/trigger_rule.ts +++ b/packages/api/src/jobs/trigger_rule.ts @@ -243,14 +243,13 @@ const triggerActions = async ( } catch (error) { if (error instanceof RequiresSearchQueryError) { logger.info('Failed to filter items by metadata, running search query') - const searchResult = await searchLibraryItems( + results = await searchLibraryItems( { query: `includes:${data.id} AND (${rule.filter})`, size: 1, }, userId ) - results = searchResult.libraryItems } else { logger.error('Error filtering item events', error) await markRuleAsFailed(rule.id, userId) diff --git a/packages/api/src/resolvers/article/index.ts b/packages/api/src/resolvers/article/index.ts index 874d235da..660474c7b 100644 --- a/packages/api/src/resolvers/article/index.ts +++ b/packages/api/src/resolvers/article/index.ts @@ -74,7 +74,7 @@ import { countLibraryItems, createOrUpdateLibraryItem, findLibraryItemsByPrefix, - searchLibraryItems, + searchAndCountLibraryItems, softDeleteLibraryItem, sortParamsToSort, updateLibraryItem, @@ -675,7 +675,7 @@ export const searchResolver = authorized< return { errorCodes: [SearchErrorCode.QueryTooLong] } } - const { libraryItems, count } = await searchLibraryItems( + const { libraryItems, count } = await searchAndCountLibraryItems( { from: Number(startCursor), size: first + 1, // fetch one more item to get next cursor @@ -752,7 +752,7 @@ export const updatesSinceResolver = authorized< folder ? ' in:' + folder : '' } sort:${sort.by}-${sort.order}` - const { libraryItems, count } = await searchLibraryItems( + const { libraryItems, count } = await searchAndCountLibraryItems( { from: Number(startCursor), size: size + 1, // fetch one more item to get next cursor diff --git a/packages/api/src/services/library_item.ts b/packages/api/src/services/library_item.ts index 47ec7bba8..d91cfa10d 100644 --- a/packages/api/src/services/library_item.ts +++ b/packages/api/src/services/library_item.ts @@ -6,10 +6,10 @@ import { EntityManager, FindOptionsWhere, ObjectLiteral, - SelectQueryBuilder, } from 'typeorm' import { QueryDeepPartialEntity } from 'typeorm/query-builder/QueryPartialEntity' import { ReadingProgressDataSource } from '../datasources/reading_progress_data_source' +import { appDataSource } from '../data_source' import { EntityLabel } from '../entity/entity_label' import { Highlight } from '../entity/highlight' import { Label } from '../entity/label' @@ -18,12 +18,7 @@ import { env } from '../env' import { BulkActionType, InputMaybe, SortParams } from '../generated/graphql' import { createPubSubClient, EntityEvent, EntityType } from '../pubsub' import { redisDataSource } from '../redis_data_source' -import { - authTrx, - getColumns, - getRepository, - queryBuilderToRawSql, -} from '../repository' +import { authTrx, getColumns, queryBuilderToRawSql } from '../repository' import { libraryItemRepository } from '../repository/library_item' import { Merge, PickTuple } from '../util' import { deepDelete, setRecentlySavedItemInRedis } from '../utils/helpers' @@ -619,11 +614,13 @@ export const buildQueryString = ( return serialize(searchQuery) } -export const buildQuery = ( - queryBuilder: SelectQueryBuilder, +export const createSearchQueryBuilder = ( args: SearchArgs, - userId: string + userId: string, + em = appDataSource.manager ) => { + const queryBuilder = em.createQueryBuilder(LibraryItem, 'library_item') + // select all columns except content const selects: Select[] = getColumns(libraryItemRepository) .filter( @@ -688,44 +685,54 @@ export const buildQuery = ( orders.forEach((order) => { queryBuilder.addOrderBy(order.by, order.order, order.nulls) }) + + return queryBuilder } export const countLibraryItems = async (args: SearchArgs, userId: string) => { - const queryBuilder = - getRepository(LibraryItem).createQueryBuilder('library_item') - - buildQuery(queryBuilder, args, userId) - - return queryBuilder.getCount() + return authTrx( + async (tx) => createSearchQueryBuilder(args, userId, tx).getCount(), + undefined, + userId + ) } export const searchLibraryItems = async ( args: SearchArgs, userId: string -): Promise<{ libraryItems: LibraryItem[]; count: number }> => { +): Promise => { const { from = 0, size = 10 } = args + if (size === 0) { + // return only count if size is 0 because limit 0 is not allowed in typeorm + return [] + } + return authTrx( - async (tx) => { - const queryBuilder = tx.createQueryBuilder(LibraryItem, 'library_item') - buildQuery(queryBuilder, args, userId) - - const count = await queryBuilder.getCount() - if (size === 0) { - // return only count if size is 0 because limit 0 is not allowed in typeorm - return { libraryItems: [], count } - } - - // add pagination - const libraryItems = await queryBuilder.skip(from).take(size).getMany() - - return { libraryItems, count } - }, + async (tx) => + createSearchQueryBuilder(args, userId, tx) + .skip(from) + .take(size) + .getMany(), undefined, userId ) } +export const searchAndCountLibraryItems = async ( + args: SearchArgs, + userId: string +): Promise<{ libraryItems: LibraryItem[]; count: number }> => { + const count = await countLibraryItems(args, userId) + if (count === 0) { + return { libraryItems: [], count } + } + + const libraryItems = await searchLibraryItems(args, userId) + + return { libraryItems, count } +} + export const findRecentLibraryItems = async ( userId: string, limit = 1000, diff --git a/packages/api/test/routers/auth.test.ts b/packages/api/test/routers/auth.test.ts index 32b438047..415231149 100644 --- a/packages/api/test/routers/auth.test.ts +++ b/packages/api/test/routers/auth.test.ts @@ -6,7 +6,7 @@ import { userRepository } from '../../src/repository/user' import { isValidSignupRequest } from '../../src/routers/auth/auth_router' import { AuthProvider } from '../../src/routers/auth/auth_types' import { createPendingUserToken } from '../../src/routers/auth/jwt_helpers' -import { searchLibraryItems } from '../../src/services/library_item' +import { searchAndCountLibraryItems } from '../../src/services/library_item' import { deleteUser, updateUser } from '../../src/services/user' import { comparePassword, @@ -528,7 +528,7 @@ describe('auth router', () => { 'web' ).expect(200) const user = await userRepository.findOneByOrFail({ name }) - const { count } = await searchLibraryItems( + const { count } = await searchAndCountLibraryItems( { query: 'in:inbox sort:read-desc is:reading' }, user.id ) @@ -552,7 +552,10 @@ describe('auth router', () => { 'ios' ).expect(200) const user = await userRepository.findOneByOrFail({ name }) - const { count } = await searchLibraryItems({ query: 'in:all' }, user.id) + const { count } = await searchAndCountLibraryItems( + { query: 'in:all' }, + user.id + ) expect(count).to.eql(4) }) From 3954fa309b1d77b74df916474dc5d83565c170c7 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 29 Apr 2024 14:24:37 +0800 Subject: [PATCH 04/16] randomly select at most 25 candidates and time each step --- packages/api/src/jobs/ai/create_digest.ts | 24 +++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/packages/api/src/jobs/ai/create_digest.ts b/packages/api/src/jobs/ai/create_digest.ts index 2a6e155d4..8d7a4fd71 100644 --- a/packages/api/src/jobs/ai/create_digest.ts +++ b/packages/api/src/jobs/ai/create_digest.ts @@ -136,11 +136,17 @@ const getPreferencesList = async (userId: string): Promise => { return dedupedPreferences } +const randomSelectCandidates = (candidates: LibraryItem[]): LibraryItem[] => { + // randomly choose at most 25 candidates + return candidates.sort(() => 0.5 - Math.random()).slice(0, 25) +} + // Makes multiple DB queries and combines the results const getCandidatesList = async ( userId: string, selectedLibraryItemIds?: string[] ): Promise => { + console.time('getCandidatesList') // use the queries from the digest definitions to lookup preferences // There should be a list of multiple queries we use. For now we can // hardcode these queries: @@ -199,11 +205,15 @@ const getCandidatesList = async ( return [] } + const selectedCandidates = randomSelectCandidates(dedupedCandidates) + // store the ids in cache - const candidateIds = dedupedCandidates.map((item) => item.id).join(',') + const candidateIds = selectedCandidates.map((item) => item.id).join(',') await redisDataSource.redisClient?.set(key, candidateIds) - return dedupedCandidates + console.timeEnd('getCandidatesList') + + return selectedCandidates } // Takes a list of library items, and uses a prompt to generate @@ -345,6 +355,7 @@ const chooseRankedSelections = (rankedCandidates: RankedItem[]) => { const summarizeItems = async ( rankedCandidates: RankedItem[] ): Promise => { + console.time('summarizeItems') const llm = new OpenAI({ modelName: 'gpt-4-0125-preview', configuration: { @@ -370,6 +381,8 @@ const summarizeItems = async ( (summary, index) => (rankedCandidates[index].summary = summary) ) + console.timeEnd('summarizeItems') + return rankedCandidates } @@ -378,6 +391,7 @@ const generateSpeechFiles = ( rankedItems: RankedItem[], options: SSMLOptions ): SpeechFile[] => { + console.time('generateSpeechFiles') // convert the summaries from markdown to HTML const converter = new showdown.Converter({ backslashEscapesHTMLTags: true, @@ -396,6 +410,8 @@ const generateSpeechFiles = ( }) }) + console.timeEnd('generateSpeechFiles') + return speechFiles } @@ -434,6 +450,8 @@ const generateByline = (summaries: RankedItem[]): string => export const createDigest = async (jobData: CreateDigestData) => { try { + console.time('createDigestJob') + digestDefinition = await fetchDigestDefinition() const candidates = await getCandidatesList( @@ -511,5 +529,7 @@ export const createDigest = async (jobData: CreateDigestData) => { await sendMulticastPushNotifications(jobData.userId, message, 'reminder') } + + console.timeEnd('createDigestJob') } } From 9a6ec7a58b78e3c38072f60609d4ac95b5cd2267 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 29 Apr 2024 14:31:16 +0800 Subject: [PATCH 05/16] filter out summary if shorter than 200 --- packages/api/src/jobs/ai/create_digest.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/api/src/jobs/ai/create_digest.ts b/packages/api/src/jobs/ai/create_digest.ts index 8d7a4fd71..ffdbcc878 100644 --- a/packages/api/src/jobs/ai/create_digest.ts +++ b/packages/api/src/jobs/ai/create_digest.ts @@ -419,7 +419,9 @@ const generateSpeechFiles = ( // basic checks to make sure the summaries are good. const filterSummaries = (summaries: RankedItem[]): RankedItem[] => { return summaries.filter( - (item) => item.summary.length < item.libraryItem.readableContent.length + (item) => + item.summary.length > 200 || + item.summary.length < item.libraryItem.readableContent.length ) } From cbf1f976301128260a74907ac3298c07bc8b9094 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 29 Apr 2024 14:33:28 +0800 Subject: [PATCH 06/16] fix timer on get candidates function --- packages/api/src/jobs/ai/create_digest.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/api/src/jobs/ai/create_digest.ts b/packages/api/src/jobs/ai/create_digest.ts index ffdbcc878..edcf2b389 100644 --- a/packages/api/src/jobs/ai/create_digest.ts +++ b/packages/api/src/jobs/ai/create_digest.ts @@ -192,6 +192,8 @@ const getCandidatesList = async ( readableContent: htmlToMarkdown(item.readableContent), })) // convert the html content to markdown + console.timeEnd('getCandidatesList') + if (dedupedCandidates.length === 0) { logger.info('No new candidates found') @@ -211,8 +213,6 @@ const getCandidatesList = async ( const candidateIds = selectedCandidates.map((item) => item.id).join(',') await redisDataSource.redisClient?.set(key, candidateIds) - console.timeEnd('getCandidatesList') - return selectedCandidates } From 0a2f9dba25b670eabfefc8bc5471423a6251c673 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 29 Apr 2024 15:34:56 +0800 Subject: [PATCH 07/16] filter out summary if longer than 1000 words --- packages/api/src/jobs/ai/create_digest.ts | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/packages/api/src/jobs/ai/create_digest.ts b/packages/api/src/jobs/ai/create_digest.ts index edcf2b389..d21e5248c 100644 --- a/packages/api/src/jobs/ai/create_digest.ts +++ b/packages/api/src/jobs/ai/create_digest.ts @@ -18,6 +18,7 @@ import { searchLibraryItems, } from '../../services/library_item' import { findDeviceTokensByUserId } from '../../services/user_device_tokens' +import { wordsCount } from '../../utils/helpers' import { logger } from '../../utils/logger' import { htmlToMarkdown } from '../../utils/parser' import { sendMulticastPushNotifications } from '../../utils/sendNotification' @@ -171,9 +172,10 @@ const getCandidatesList = async ( await searchLibraryItems( { includeContent: true, - query: existingCandidateIds - ? `(${selector.query}) -includes:${existingCandidateIds}` // exclude the existing candidates - : selector.query, + // query: existingCandidateIds + // ? `(${selector.query}) -includes:${existingCandidateIds}` // exclude the existing candidates + // : selector.query, + query: selector.query, size: selector.count, }, userId @@ -192,6 +194,8 @@ const getCandidatesList = async ( readableContent: htmlToMarkdown(item.readableContent), })) // convert the html content to markdown + logger.info('dedupedCandidates: ', dedupedCandidates) + console.timeEnd('getCandidatesList') if (dedupedCandidates.length === 0) { @@ -209,6 +213,8 @@ const getCandidatesList = async ( const selectedCandidates = randomSelectCandidates(dedupedCandidates) + logger.info('selectedCandidates: ', selectedCandidates) + // store the ids in cache const candidateIds = selectedCandidates.map((item) => item.id).join(',') await redisDataSource.redisClient?.set(key, candidateIds) @@ -420,7 +426,8 @@ const generateSpeechFiles = ( const filterSummaries = (summaries: RankedItem[]): RankedItem[] => { return summaries.filter( (item) => - item.summary.length > 200 || + wordsCount(item.summary) > 100 && + wordsCount(item.summary) < 1000 && item.summary.length < item.libraryItem.readableContent.length ) } @@ -480,6 +487,7 @@ export const createDigest = async (jobData: CreateDigestData) => { summary: '', })) const summaries = await summarizeItems(selections) + logger.info('summaries: ', summaries) const filteredSummaries = filterSummaries(summaries) From ea227b6462b04d5ef2ce83f5d67154674463a898 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 29 Apr 2024 17:11:20 +0800 Subject: [PATCH 08/16] use claude --- packages/api/package.json | 1 + packages/api/src/jobs/ai/create_digest.ts | 53 +++++++++++++++------- yarn.lock | 55 ++++++++++++++++++++++- 3 files changed, 91 insertions(+), 18 deletions(-) diff --git a/packages/api/package.json b/packages/api/package.json index 554e99f45..8d4bad8d2 100644 --- a/packages/api/package.json +++ b/packages/api/package.json @@ -22,6 +22,7 @@ "@google-cloud/storage": "^7.0.1", "@google-cloud/tasks": "^4.0.0", "@graphql-tools/utils": "^9.1.1", + "@langchain/anthropic": "^0.1.16", "@langchain/openai": "^0.0.14", "@notionhq/client": "^2.2.14", "@omnivore/content-handler": "1.0.0", diff --git a/packages/api/src/jobs/ai/create_digest.ts b/packages/api/src/jobs/ai/create_digest.ts index d21e5248c..ff4ccf3a9 100644 --- a/packages/api/src/jobs/ai/create_digest.ts +++ b/packages/api/src/jobs/ai/create_digest.ts @@ -1,5 +1,6 @@ +import { ChatAnthropic } from '@langchain/anthropic' import { JsonOutputParser } from '@langchain/core/output_parsers' -import { PromptTemplate } from '@langchain/core/prompts' +import { ChatPromptTemplate, PromptTemplate } from '@langchain/core/prompts' import { OpenAI } from '@langchain/openai' import { htmlToSpeechFile, @@ -234,7 +235,7 @@ const createUserProfile = async ( }, }) - const contextualTemplate = PromptTemplate.fromTemplate( + const contextualTemplate = ChatPromptTemplate.fromTemplate( digestDefinition.zeroShot.userPreferencesProfilePrompt ) @@ -362,29 +363,47 @@ const summarizeItems = async ( rankedCandidates: RankedItem[] ): Promise => { console.time('summarizeItems') - const llm = new OpenAI({ - modelName: 'gpt-4-0125-preview', - configuration: { - apiKey: process.env.OPENAI_API_KEY, - }, + // const llm = new OpenAI({ + // modelName: 'gpt-4-0125-preview', + // configuration: { + // apiKey: process.env.OPENAI_API_KEY, + // }, + // }) + + const llm = new ChatAnthropic({ + apiKey: process.env.CLAUDE_API_KEY, + model: 'claude-3-sonnet-20240229', }) - const contextualTemplate = PromptTemplate.fromTemplate( + const contextualTemplate = ChatPromptTemplate.fromTemplate( digestDefinition.summaryPrompt ) - const chain = contextualTemplate.pipe(llm) - // send all the ranked candidates to openAI at once in a batch - const summaries = await chain.batch( - rankedCandidates.map((item) => ({ - title: item.libraryItem.title, - author: item.libraryItem.author ?? '', - content: item.libraryItem.readableContent, // markdown content - })) + const prompts = await Promise.all( + rankedCandidates.map( + async (item) => + await contextualTemplate.format({ + title: item.libraryItem.title, + author: item.libraryItem.author ?? '', + content: item.libraryItem.readableContent, // markdown content + }) + ) ) + // // send all the ranked candidates to openAI at once in a batch + // const summaries = await chain.batch( + // rankedCandidates.map((item) => ({ + // title: item.libraryItem.title, + // author: item.libraryItem.author ?? '', + // content: item.libraryItem.readableContent, // markdown content + // })) + // ) + + const summaries = await llm.batch(prompts) + summaries.forEach( - (summary, index) => (rankedCandidates[index].summary = summary) + (summary, index) => + (rankedCandidates[index].summary = summary.content.toString()) ) console.timeEnd('summarizeItems') diff --git a/yarn.lock b/yarn.lock index 08b76b456..960c4280a 100644 --- a/yarn.lock +++ b/yarn.lock @@ -50,6 +50,20 @@ lodash "^4.17.21" resize-observer-polyfill "^1.5.1" +"@anthropic-ai/sdk@^0.20.1": + version "0.20.7" + resolved "https://registry.yarnpkg.com/@anthropic-ai/sdk/-/sdk-0.20.7.tgz#b19b0e66ba070f928bbf583c06d76e6efdd93d5e" + integrity sha512-uyc+3WGLpe8ur6mSIKSab7P9JdBerTdmqb7popc/yROYLLCW/Ykyw4ZfjmN/cLmxjnAKnv5YUngzbPM0BJuGjg== + dependencies: + "@types/node" "^18.11.18" + "@types/node-fetch" "^2.6.4" + abort-controller "^3.0.0" + agentkeepalive "^4.2.1" + form-data-encoder "1.7.2" + formdata-node "^4.3.2" + node-fetch "^2.6.7" + web-streams-polyfill "^3.2.1" + "@anthropic-ai/sdk@^0.9.1": version "0.9.1" resolved "https://registry.yarnpkg.com/@anthropic-ai/sdk/-/sdk-0.9.1.tgz#b2d2b7bf05c90dce502c9a2e869066870f69ba88" @@ -3774,6 +3788,17 @@ dependencies: lodash "^4.17.21" +"@langchain/anthropic@^0.1.16": + version "0.1.16" + resolved "https://registry.yarnpkg.com/@langchain/anthropic/-/anthropic-0.1.16.tgz#c2a9d3dd4e02df7118dd97cf2503c9bd1a4de5ad" + integrity sha512-vCbwkZ3pkMSKf67fBgNlslvuW9f3EZGBbO8Ic2etgX3xFl6L0WuMtfS26P1FCDpRwaKuC1BrCj2aLAeMzMq/Fg== + dependencies: + "@anthropic-ai/sdk" "^0.20.1" + "@langchain/core" "~0.1.56" + fast-xml-parser "^4.3.5" + zod "^3.22.4" + zod-to-json-schema "^3.22.4" + "@langchain/community@~0.0.33": version "0.0.33" resolved "https://registry.yarnpkg.com/@langchain/community/-/community-0.0.33.tgz#5568fe36b1e2f8947d49414d47e14a27da5b65c9" @@ -3803,6 +3828,24 @@ zod "^3.22.4" zod-to-json-schema "^3.22.3" +"@langchain/core@~0.1.56": + version "0.1.61" + resolved "https://registry.yarnpkg.com/@langchain/core/-/core-0.1.61.tgz#9313363e04f1c6981a938b2909c44ce6fceb2736" + integrity sha512-C8OkAly+ugvXsL8TACCmFv9WTTcT4gvQaG6NbrXCOzibBCywfxxcTqEMOyg3zIKpxHEmR0DHqh0OiJRHocnsCg== + dependencies: + ansi-styles "^5.0.0" + camelcase "6" + decamelize "1.2.0" + js-tiktoken "^1.0.8" + langsmith "~0.1.7" + ml-distance "^4.0.0" + mustache "^4.2.0" + p-queue "^6.6.2" + p-retry "4" + uuid "^9.0.0" + zod "^3.22.4" + zod-to-json-schema "^3.22.3" + "@langchain/openai@^0.0.14": version "0.0.14" resolved "https://registry.yarnpkg.com/@langchain/openai/-/openai-0.0.14.tgz#27a6ba83f6b754391868b22f3b90cd440038acf0" @@ -15694,7 +15737,7 @@ fast-xml-parser@^4.2.2, fast-xml-parser@^4.3.0: dependencies: strnum "^1.0.5" -fast-xml-parser@^4.3.2: +fast-xml-parser@^4.3.2, fast-xml-parser@^4.3.5: version "4.3.6" resolved "https://registry.yarnpkg.com/fast-xml-parser/-/fast-xml-parser-4.3.6.tgz#190f9d99097f0c8f2d3a0e681a10404afca052ff" integrity sha512-M2SovcRxD4+vC493Uc2GZVcZaj66CCJhWurC4viynVSTvrpErCShNcDz1lAho6n9REQKvL/ll4A4/fw6Y9z8nw== @@ -22822,6 +22865,11 @@ multimatch@5.0.0: arrify "^2.0.1" minimatch "^3.0.4" +mustache@^4.2.0: + version "4.2.0" + resolved "https://registry.yarnpkg.com/mustache/-/mustache-4.2.0.tgz#e5892324d60a12ec9c2a73359edca52972bf6f64" + integrity sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ== + mute-stream@0.0.8, mute-stream@~0.0.4: version "0.0.8" resolved "https://registry.yarnpkg.com/mute-stream/-/mute-stream-0.0.8.tgz#1630c42b2251ff81e2a283de96a5497ea92e5e0d" @@ -32217,6 +32265,11 @@ zod-to-json-schema@^3.22.3: resolved "https://registry.yarnpkg.com/zod-to-json-schema/-/zod-to-json-schema-3.22.4.tgz#f8cc691f6043e9084375e85fb1f76ebafe253d70" integrity sha512-2Ed5dJ+n/O3cU383xSY28cuVi0BCQhF8nYqWU5paEpl7fVdqdAmiLdqLyfblbNdfOFwFfi/mqU4O1pwc60iBhQ== +zod-to-json-schema@^3.22.4: + version "3.23.0" + resolved "https://registry.yarnpkg.com/zod-to-json-schema/-/zod-to-json-schema-3.23.0.tgz#4fc60e88d3c709eedbfaae3f92f8a7bf786469f2" + integrity sha512-az0uJ243PxsRIa2x1WmNE/pnuA05gUq/JB8Lwe1EDCCL/Fz9MgjYQ0fPlyc2Tcv6aF2ZA7WM5TWaRZVEFaAIag== + zod@^3.22.3, zod@^3.22.4: version "3.22.4" resolved "https://registry.yarnpkg.com/zod/-/zod-3.22.4.tgz#f31c3a9386f61b1f228af56faa9255e845cf3fff" From debb396b445e21d75d5160214ed768d95b313a8c Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 29 Apr 2024 17:34:10 +0800 Subject: [PATCH 09/16] better error handling --- packages/api/src/jobs/ai/create_digest.ts | 28 +++++++++++++---------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/packages/api/src/jobs/ai/create_digest.ts b/packages/api/src/jobs/ai/create_digest.ts index ff4ccf3a9..487401926 100644 --- a/packages/api/src/jobs/ai/create_digest.ts +++ b/packages/api/src/jobs/ai/create_digest.ts @@ -379,17 +379,6 @@ const summarizeItems = async ( digestDefinition.summaryPrompt ) - const prompts = await Promise.all( - rankedCandidates.map( - async (item) => - await contextualTemplate.format({ - title: item.libraryItem.title, - author: item.libraryItem.author ?? '', - content: item.libraryItem.readableContent, // markdown content - }) - ) - ) - // // send all the ranked candidates to openAI at once in a batch // const summaries = await chain.batch( // rankedCandidates.map((item) => ({ @@ -399,7 +388,22 @@ const summarizeItems = async ( // })) // ) - const summaries = await llm.batch(prompts) + const summaries = await Promise.all( + rankedCandidates.map(async (item) => { + try { + const prompt = await contextualTemplate.format({ + title: item.libraryItem.title, + author: item.libraryItem.author ?? '', + content: item.libraryItem.readableContent, // markdown content + }) + + return llm.invoke(prompt) + } catch (error) { + logger.error('summarizeItems error', error) + return { content: '' } + } + }) + ) summaries.forEach( (summary, index) => From aa30b592b5964c8aceda1c0231af6a81919cd66d Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 29 Apr 2024 17:47:07 +0800 Subject: [PATCH 10/16] reduce debug logs --- packages/api/src/jobs/ai/create_digest.ts | 46 +++++++++++++++-------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/packages/api/src/jobs/ai/create_digest.ts b/packages/api/src/jobs/ai/create_digest.ts index 487401926..a82011e9f 100644 --- a/packages/api/src/jobs/ai/create_digest.ts +++ b/packages/api/src/jobs/ai/create_digest.ts @@ -160,11 +160,11 @@ const getCandidatesList = async ( return findLibraryItemsByIds(selectedLibraryItemIds, userId) } - // get the existing candidate ids from cache - const key = `digest:${userId}:existingCandidateIds` - const existingCandidateIds = await redisDataSource.redisClient?.get(key) + // // get the existing candidate ids from cache + // const key = `digest:${userId}:existingCandidateIds` + // const existingCandidateIds = await redisDataSource.redisClient?.get(key) - logger.info('existingCandidateIds: ', { existingCandidateIds }) + // logger.info('existingCandidateIds: ', { existingCandidateIds }) const candidates = await Promise.all( digestDefinition.candidateSelectors.map( @@ -195,18 +195,21 @@ const getCandidatesList = async ( readableContent: htmlToMarkdown(item.readableContent), })) // convert the html content to markdown - logger.info('dedupedCandidates: ', dedupedCandidates) + logger.info( + 'dedupedCandidates: ', + dedupedCandidates.map((item) => item.title) + ) console.timeEnd('getCandidatesList') if (dedupedCandidates.length === 0) { logger.info('No new candidates found') - if (existingCandidateIds) { - // reuse the existing candidates - const existingIds = existingCandidateIds.split(',') - return findLibraryItemsByIds(existingIds, userId) - } + // if (existingCandidateIds) { + // // reuse the existing candidates + // const existingIds = existingCandidateIds.split(',') + // return findLibraryItemsByIds(existingIds, userId) + // } // return empty array if no existing candidates return [] @@ -214,11 +217,14 @@ const getCandidatesList = async ( const selectedCandidates = randomSelectCandidates(dedupedCandidates) - logger.info('selectedCandidates: ', selectedCandidates) + logger.info( + 'selectedCandidates: ', + selectedCandidates.map((item) => item.title) + ) - // store the ids in cache - const candidateIds = selectedCandidates.map((item) => item.id).join(',') - await redisDataSource.redisClient?.set(key, candidateIds) + // // store the ids in cache + // const candidateIds = selectedCandidates.map((item) => item.id).join(',') + // await redisDataSource.redisClient?.set(key, candidateIds) return selectedCandidates } @@ -397,7 +403,12 @@ const summarizeItems = async ( content: item.libraryItem.readableContent, // markdown content }) - return llm.invoke(prompt) + logger.info('summarizeItems prompt: ', prompt) + + const summary = await llm.invoke(prompt) + logger.info('summarizeItems summary: ', summary) + + return summary } catch (error) { logger.error('summarizeItems error', error) return { content: '' } @@ -510,7 +521,10 @@ export const createDigest = async (jobData: CreateDigestData) => { summary: '', })) const summaries = await summarizeItems(selections) - logger.info('summaries: ', summaries) + logger.info( + 'summaries: ', + summaries.map((item) => item.summary) + ) const filteredSummaries = filterSummaries(summaries) From 83ed971b281aa8b6cb516b5e0469f021eb5dc67b Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 29 Apr 2024 17:59:34 +0800 Subject: [PATCH 11/16] improve logs --- packages/api/src/jobs/ai/create_digest.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/api/src/jobs/ai/create_digest.ts b/packages/api/src/jobs/ai/create_digest.ts index a82011e9f..ef8e38ce2 100644 --- a/packages/api/src/jobs/ai/create_digest.ts +++ b/packages/api/src/jobs/ai/create_digest.ts @@ -403,7 +403,7 @@ const summarizeItems = async ( content: item.libraryItem.readableContent, // markdown content }) - logger.info('summarizeItems prompt: ', prompt) + logger.info(`summarizeItems prompt: ${prompt}`) const summary = await llm.invoke(prompt) logger.info('summarizeItems summary: ', summary) From 8edfce596b1b44e1ce59acf44059071a94ecbd69 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Mon, 29 Apr 2024 18:19:41 +0800 Subject: [PATCH 12/16] revert to batch api call --- packages/api/src/jobs/ai/create_digest.ts | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/packages/api/src/jobs/ai/create_digest.ts b/packages/api/src/jobs/ai/create_digest.ts index ef8e38ce2..4ab46c679 100644 --- a/packages/api/src/jobs/ai/create_digest.ts +++ b/packages/api/src/jobs/ai/create_digest.ts @@ -394,27 +394,24 @@ const summarizeItems = async ( // })) // ) - const summaries = await Promise.all( + const prompts = await Promise.all( rankedCandidates.map(async (item) => { try { - const prompt = await contextualTemplate.format({ + return await contextualTemplate.format({ title: item.libraryItem.title, author: item.libraryItem.author ?? '', content: item.libraryItem.readableContent, // markdown content }) - - logger.info(`summarizeItems prompt: ${prompt}`) - - const summary = await llm.invoke(prompt) - logger.info('summarizeItems summary: ', summary) - - return summary } catch (error) { logger.error('summarizeItems error', error) - return { content: '' } + return '' } }) ) + logger.info('prompts: ', prompts) + + const summaries = await llm.batch(prompts) + logger.info('summaries: ', summaries) summaries.forEach( (summary, index) => @@ -521,10 +518,6 @@ export const createDigest = async (jobData: CreateDigestData) => { summary: '', })) const summaries = await summarizeItems(selections) - logger.info( - 'summaries: ', - summaries.map((item) => item.summary) - ) const filteredSummaries = filterSummaries(summaries) From d589c18b68d5ded8eb8dd29bdac80f56977b5430 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Tue, 30 Apr 2024 12:43:24 +0800 Subject: [PATCH 13/16] generate a unique id for each scheduled digest to avoid duplication --- packages/api/src/jobs/ai/create_digest.ts | 15 +++++++++------ packages/api/src/routers/digest_router.ts | 7 ++++--- packages/api/src/utils/createTask.ts | 7 ++++++- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/packages/api/src/jobs/ai/create_digest.ts b/packages/api/src/jobs/ai/create_digest.ts index 4ab46c679..bd2c29497 100644 --- a/packages/api/src/jobs/ai/create_digest.ts +++ b/packages/api/src/jobs/ai/create_digest.ts @@ -9,6 +9,7 @@ import { } from '@omnivore/text-to-speech-handler' import axios from 'axios' import showdown from 'showdown' +import { v4 as uuid } from 'uuid' import yaml from 'yaml' import { LibraryItem } from '../../entity/library_item' import { TaskState } from '../../generated/graphql' @@ -27,7 +28,7 @@ import { sendMulticastPushNotifications } from '../../utils/sendNotification' export type CreateDigestJobSchedule = 'daily' | 'weekly' export interface CreateDigestData { - id: string + id?: string userId: string voices?: string[] language?: string @@ -489,9 +490,11 @@ const generateByline = (summaries: RankedItem[]): string => .join(', ') export const createDigest = async (jobData: CreateDigestData) => { - try { - console.time('createDigestJob') + console.time('createDigestJob') + // generate a unique id for the digest if not provided for scheduled jobs + const digestId = jobData.id ?? uuid() + try { digestDefinition = await fetchDigestDefinition() const candidates = await getCandidatesList( @@ -501,7 +504,7 @@ export const createDigest = async (jobData: CreateDigestData) => { if (candidates.length === 0) { logger.info('No candidates found') return writeDigest(jobData.userId, { - id: jobData.id, + id: digestId, jobState: TaskState.Succeeded, title: 'No articles found', }) @@ -528,7 +531,7 @@ export const createDigest = async (jobData: CreateDigestData) => { }) const title = generateTitle(summaries) const digest: Digest = { - id: jobData.id, + id: digestId, title, content: generateContent(summaries), jobState: TaskState.Succeeded, @@ -552,7 +555,7 @@ export const createDigest = async (jobData: CreateDigestData) => { logger.error('createDigestJob error', error) await writeDigest(jobData.userId, { - id: jobData.id, + id: digestId, jobState: TaskState.Failed, }) } finally { diff --git a/packages/api/src/routers/digest_router.ts b/packages/api/src/routers/digest_router.ts index 5f7a38fe8..6cbdd1e1d 100644 --- a/packages/api/src/routers/digest_router.ts +++ b/packages/api/src/routers/digest_router.ts @@ -1,6 +1,5 @@ import cors from 'cors' import express from 'express' -import { v4 as uuid } from 'uuid' import { env } from '../env' import { TaskState } from '../generated/graphql' import { CreateDigestJobSchedule } from '../jobs/ai/create_digest' @@ -89,9 +88,11 @@ export function digestRouter() { // enqueue job and return job id const result = await enqueueCreateDigest( { - id: uuid(), // generate job id userId, - ...data, + voices: data.voices, + language: data.language, + rate: data.rate, + libraryItemIds: data.libraryItemIds, }, data.schedule ) diff --git a/packages/api/src/utils/createTask.ts b/packages/api/src/utils/createTask.ts index 2383df98c..d74f2ae10 100644 --- a/packages/api/src/utils/createTask.ts +++ b/packages/api/src/utils/createTask.ts @@ -864,6 +864,8 @@ export const enqueueCreateDigest = async ( throw new Error('No queue found') } + // generate unique id for the digest + data.id = uuid() // enqueue create digest job immediately const jobId = `${CREATE_DIGEST_JOB}_${data.userId}` const job = await queue.add(CREATE_DIGEST_JOB, data, { @@ -890,9 +892,9 @@ export const enqueueCreateDigest = async ( await writeDigest(data.userId, digest) if (schedule) { + // remove existing repeated job if any await Promise.all( Object.keys(CRON_PATTERNS).map(async (key) => { - // remove existing repeated job if any const isDeleted = await queue.removeRepeatable( CREATE_DIGEST_JOB, { @@ -909,6 +911,9 @@ export const enqueueCreateDigest = async ( ) // schedule repeated job + // delete the digest id to avoid duplication + delete data.id + const job = await queue.add(CREATE_DIGEST_JOB, data, { attempts: 1, priority: getJobPriority(CREATE_DIGEST_JOB), From 1a3697e24b912df85185b286b42b316056e032c2 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Tue, 30 Apr 2024 13:58:37 +0800 Subject: [PATCH 14/16] fix: skip unwanted nodes and update the index when converting html to utterances --- packages/text-to-speech/src/htmlToSsml.ts | 40 +++++++++++++++++------ 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/packages/text-to-speech/src/htmlToSsml.ts b/packages/text-to-speech/src/htmlToSsml.ts index beebd4a71..c74a463ea 100644 --- a/packages/text-to-speech/src/htmlToSsml.ts +++ b/packages/text-to-speech/src/htmlToSsml.ts @@ -74,6 +74,16 @@ const TOP_LEVEL_TAGS = [ 'LI', ] +const SKIP_TAGS = [ + 'SCRIPT', + 'STYLE', + 'IMG', + 'FIGURE', + 'FIGCAPTION', + 'IFRAME', + 'CODE', +] + function parseDomTree(pageNode: Element) { if (!pageNode || pageNode.childNodes.length == 0) { console.log('no child nodes found') @@ -146,16 +156,6 @@ function emitElement( element: Element, isTopLevel: boolean ) { - const SKIP_TAGS = [ - 'SCRIPT', - 'STYLE', - 'IMG', - 'FIGURE', - 'FIGCAPTION', - 'IFRAME', - 'CODE', - ] - const topLevelTags = ssmlTagsForTopLevelElement() const idx = element.getAttribute('data-omnivore-anchor-idx') let maxVisitedIdx = Number(idx) @@ -369,6 +369,20 @@ const replaceSmartQuotes = (text: string): string => { return text.replace(/[\u2018\u2019]/g, "'").replace(/[\u201C\u201D]/g, '"') } +// get the max idx of the element and its children +const getMaxVisitedIdx = (element: Element): number => { + let maxVisitedIdx = Number(element.getAttribute('data-omnivore-anchor-idx')) + for (const child of Array.from(element.childNodes)) { + if (child.nodeType === 1) { + maxVisitedIdx = Math.max( + maxVisitedIdx, + getMaxVisitedIdx(child as Element) + ) + } + } + return maxVisitedIdx +} + export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => { const { title, content, options } = htmlInput console.log('creating speech file with options:', options) @@ -421,6 +435,12 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => { const textItems: string[] = [] const node = parsedNodes[i - 3] + // skip unwanted tags and update the index + if (SKIP_TAGS.includes(node.nodeName)) { + i = getMaxVisitedIdx(node) + continue + } + if (TOP_LEVEL_TAGS.includes(node.nodeName) || hasSignificantText(node)) { // use paragraph as anchor const idx = i.toString() From 76c1cd53305e68f62203e74cd486dcd4070453f1 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Tue, 30 Apr 2024 14:33:10 +0800 Subject: [PATCH 15/16] fix: filter punctuation and whitespace only utterance out --- packages/text-to-speech/src/htmlToSsml.ts | 42 ++++++++++++++++------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/packages/text-to-speech/src/htmlToSsml.ts b/packages/text-to-speech/src/htmlToSsml.ts index c74a463ea..e8e379a8b 100644 --- a/packages/text-to-speech/src/htmlToSsml.ts +++ b/packages/text-to-speech/src/htmlToSsml.ts @@ -5,6 +5,7 @@ import { SentenceTokenizerNew, WordPunctTokenizer, } from 'natural' +import { isElement } from 'underscore' // this code needs to be kept in sync with the // frontend code in: useReadingProgressAnchor @@ -49,6 +50,7 @@ const DEFAULT_LANGUAGE = 'en-US' const DEFAULT_VOICE = 'en-US-JennyNeural' const DEFAULT_SECONDARY_VOICE = 'en-US-GuyNeural' const DEFAULT_RATE = '1.1' +const ELEMENT_INDEX_ATTRIBUTE = 'data-omnivore-anchor-idx' const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [ 'omnivore-highlight-id', @@ -84,6 +86,9 @@ const SKIP_TAGS = [ 'CODE', ] +const getAnchorIndex = (element: Element): number => + Number(element.getAttribute(ELEMENT_INDEX_ATTRIBUTE)) + function parseDomTree(pageNode: Element) { if (!pageNode || pageNode.childNodes.length == 0) { console.log('no child nodes found') @@ -118,7 +123,7 @@ function parseDomTree(pageNode: Element) { visitedNodeList.forEach((node, index) => { // We start at index 3, because the frontend starts two nodes above us // on the #readability-page-1 element that wraps the entire content. - node.setAttribute('data-omnivore-anchor-idx', (index + 3).toString()) + node.setAttribute(ELEMENT_INDEX_ATTRIBUTE, (index + 3).toString()) }) return visitedNodeList } @@ -157,7 +162,7 @@ function emitElement( isTopLevel: boolean ) { const topLevelTags = ssmlTagsForTopLevelElement() - const idx = element.getAttribute('data-omnivore-anchor-idx') + const idx = getAnchorIndex(element) let maxVisitedIdx = Number(idx) if (isTopLevel) { @@ -166,6 +171,11 @@ function emitElement( for (const child of Array.from(element.childNodes)) { if (SKIP_TAGS.indexOf(child.nodeName) >= 0) { + // Skip unwanted tags and update the index + if (isElement(child)) { + const childIdx = getAnchorIndex(child) + maxVisitedIdx = Math.max(maxVisitedIdx, childIdx) + } continue } @@ -180,8 +190,8 @@ function emitElement( } emitTextNode(textItems, cleanedText, child) } - if (child.nodeType == 1 /* Node.ELEMENT_NODE */) { - maxVisitedIdx = emitElement(textItems, child as HTMLElement, false) + if (isElement(child)) { + maxVisitedIdx = emitElement(textItems, child, false) if (child.nodeName === 'LI') { // add a new line after each list item emit(textItems, '\n') @@ -265,6 +275,11 @@ export const stripEmojis = (text: string): string => { return text.replace(emojiRegex, '').replace(/\s+/g, ' ') } +const filterUtterances = (utterances: Utterance[]): Utterance[] => { + const punctuationOrSpaceOnly = /^[\s.,;:!?]+$/ + return utterances.filter((u) => !punctuationOrSpaceOnly.test(u.text)) +} + const textToUtterances = ({ wordTokenizer, idx, @@ -309,7 +324,10 @@ const textToUtterances = ({ const sentenceTokenizer = new SentenceTokenizerNew() sentences = sentenceTokenizer.tokenize(text) } catch (err) { - console.debug('Unable to tokenize sentences') + console.log( + 'Unable to tokenize sentences, falling back to old tokenizer', + text + ) // fallback to old sentence tokenizer const sentenceTokenizer = new SentenceTokenizer() sentences = sentenceTokenizer.tokenize(text) @@ -371,15 +389,13 @@ const replaceSmartQuotes = (text: string): string => { // get the max idx of the element and its children const getMaxVisitedIdx = (element: Element): number => { - let maxVisitedIdx = Number(element.getAttribute('data-omnivore-anchor-idx')) + let maxVisitedIdx = getAnchorIndex(element) for (const child of Array.from(element.childNodes)) { - if (child.nodeType === 1) { - maxVisitedIdx = Math.max( - maxVisitedIdx, - getMaxVisitedIdx(child as Element) - ) + if (isElement(child)) { + maxVisitedIdx = Math.max(maxVisitedIdx, getMaxVisitedIdx(child)) } } + return maxVisitedIdx } @@ -461,10 +477,12 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => { } } + const filteredUtterances = filterUtterances(utterances) + return { wordCount: wordOffset, language, defaultVoice, - utterances, + utterances: filteredUtterances, } } From 7a167c7c5531cd14fa704310b6cc6ca93e19d8a1 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Tue, 30 Apr 2024 14:42:46 +0800 Subject: [PATCH 16/16] add tests --- packages/text-to-speech/mocha-config.json | 3 +- .../text-to-speech/test/htmlToSsml.test.ts | 37 ++++++++++++++++++- packages/text-to-speech/tsconfig.json | 2 +- 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/packages/text-to-speech/mocha-config.json b/packages/text-to-speech/mocha-config.json index 5a2631f42..8e24eb08b 100644 --- a/packages/text-to-speech/mocha-config.json +++ b/packages/text-to-speech/mocha-config.json @@ -1,4 +1,5 @@ { "extension": ["ts"], - "spec": "test/**/*.test.ts" + "spec": "test/**/*.test.ts", + "timeout": 10000 } diff --git a/packages/text-to-speech/test/htmlToSsml.test.ts b/packages/text-to-speech/test/htmlToSsml.test.ts index 88c23d386..62063c3d2 100644 --- a/packages/text-to-speech/test/htmlToSsml.test.ts +++ b/packages/text-to-speech/test/htmlToSsml.test.ts @@ -350,6 +350,41 @@ describe('convert HTML to Speech file', () => { options: TEST_OPTIONS, }) expect(speechFile.utterances).to.have.lengthOf(2) - expect(speechFile.utterances[1].text).to.eql('McMeekin makes an extended case that Stalin was preparing to attack Nazi Germany when Hitler attacked him, that the two dictators were basically in a race to see who could mobilize to betray the other first — and that the initial Soviet debacle in 1941 happened in part because Stalin was also pushing his military toward an offensive alignment, and they were caught in a "mid-mobilization limbo."') + expect(speechFile.utterances[1].text).to.eql( + 'McMeekin makes an extended case that Stalin was preparing to attack Nazi Germany when Hitler attacked him, that the two dictators were basically in a race to see who could mobilize to betray the other first — and that the initial Soviet debacle in 1941 happened in part because Stalin was also pushing his military toward an offensive alignment, and they were caught in a "mid-mobilization limbo."' + ) + }) + + it('skip unwanted elements', () => { + const html = `
+

This is a test.

+ + + +
test
+
` + + const speechFile = htmlToSpeechFile({ + content: html, + options: TEST_OPTIONS, + }) + expect(speechFile.utterances).to.have.lengthOf(1) + expect(speechFile.utterances[0].text).to.eql('This is a test.') + }) + + it('filters out utterances with only punctuation or whitespace', () => { + const html = `
+

This is a test.

+

.

+

+
` + + const speechFile = htmlToSpeechFile({ + content: html, + options: TEST_OPTIONS, + }) + + expect(speechFile.utterances).to.have.lengthOf(1) + expect(speechFile.utterances[0].text).to.eql('This is a test.') }) }) diff --git a/packages/text-to-speech/tsconfig.json b/packages/text-to-speech/tsconfig.json index 5b6ea3e00..7634a8d53 100644 --- a/packages/text-to-speech/tsconfig.json +++ b/packages/text-to-speech/tsconfig.json @@ -6,5 +6,5 @@ // Generate d.ts files "declaration": true }, - "include": ["src"] + "include": ["src", "test"] }