diff --git a/packages/api/package.json b/packages/api/package.json index 554e99f45..8d4bad8d2 100644 --- a/packages/api/package.json +++ b/packages/api/package.json @@ -22,6 +22,7 @@ "@google-cloud/storage": "^7.0.1", "@google-cloud/tasks": "^4.0.0", "@graphql-tools/utils": "^9.1.1", + "@langchain/anthropic": "^0.1.16", "@langchain/openai": "^0.0.14", "@notionhq/client": "^2.2.14", "@omnivore/content-handler": "1.0.0", diff --git a/packages/api/src/jobs/ai/create_digest.ts b/packages/api/src/jobs/ai/create_digest.ts index a3be248a0..bd2c29497 100644 --- a/packages/api/src/jobs/ai/create_digest.ts +++ b/packages/api/src/jobs/ai/create_digest.ts @@ -1,5 +1,6 @@ +import { ChatAnthropic } from '@langchain/anthropic' import { JsonOutputParser } from '@langchain/core/output_parsers' -import { PromptTemplate } from '@langchain/core/prompts' +import { ChatPromptTemplate, PromptTemplate } from '@langchain/core/prompts' import { OpenAI } from '@langchain/openai' import { htmlToSpeechFile, @@ -8,6 +9,7 @@ import { } from '@omnivore/text-to-speech-handler' import axios from 'axios' import showdown from 'showdown' +import { v4 as uuid } from 'uuid' import yaml from 'yaml' import { LibraryItem } from '../../entity/library_item' import { TaskState } from '../../generated/graphql' @@ -18,14 +20,15 @@ import { searchLibraryItems, } from '../../services/library_item' import { findDeviceTokensByUserId } from '../../services/user_device_tokens' +import { wordsCount } from '../../utils/helpers' import { logger } from '../../utils/logger' import { htmlToMarkdown } from '../../utils/parser' import { sendMulticastPushNotifications } from '../../utils/sendNotification' export type CreateDigestJobSchedule = 'daily' | 'weekly' -export interface CreateDigestJobData { - id: string +export interface CreateDigestData { + id?: string userId: string voices?: string[] language?: string @@ -113,18 +116,17 @@ const getPreferencesList = async (userId: string): Promise => { // reason: "some older items that were interacted with" const preferences = await Promise.all( - digestDefinition.preferenceSelectors.map(async (selector) => { - // use the selector to fetch items - const results = await searchLibraryItems( - { - query: selector.query, - size: selector.count, - }, - userId - ) - - return results.libraryItems - }) + digestDefinition.preferenceSelectors.map( + async (selector) => + // use the selector to fetch items + await searchLibraryItems( + { + query: selector.query, + size: selector.count, + }, + userId + ) + ) ) // deduplicate and flatten the items @@ -137,11 +139,17 @@ const getPreferencesList = async (userId: string): Promise => { return dedupedPreferences } +const randomSelectCandidates = (candidates: LibraryItem[]): LibraryItem[] => { + // randomly choose at most 25 candidates + return candidates.sort(() => 0.5 - Math.random()).slice(0, 25) +} + // Makes multiple DB queries and combines the results const getCandidatesList = async ( userId: string, selectedLibraryItemIds?: string[] ): Promise => { + console.time('getCandidatesList') // use the queries from the digest definitions to lookup preferences // There should be a list of multiple queries we use. For now we can // hardcode these queries: @@ -153,28 +161,28 @@ const getCandidatesList = async ( return findLibraryItemsByIds(selectedLibraryItemIds, userId) } - // get the existing candidate ids from cache - const key = `digest:${userId}:existingCandidateIds` - const existingCandidateIds = await redisDataSource.redisClient?.get(key) + // // get the existing candidate ids from cache + // const key = `digest:${userId}:existingCandidateIds` + // const existingCandidateIds = await redisDataSource.redisClient?.get(key) - logger.info('existingCandidateIds: ', { existingCandidateIds }) + // logger.info('existingCandidateIds: ', { existingCandidateIds }) const candidates = await Promise.all( - digestDefinition.candidateSelectors.map(async (selector) => { - // use the selector to fetch items - const results = await searchLibraryItems( - { - includeContent: true, - query: existingCandidateIds - ? `(${selector.query}) -includes:${existingCandidateIds}` // exclude the existing candidates - : selector.query, - size: selector.count, - }, - userId - ) - - return results.libraryItems - }) + digestDefinition.candidateSelectors.map( + async (selector) => + // use the selector to fetch items + await searchLibraryItems( + { + includeContent: true, + // query: existingCandidateIds + // ? `(${selector.query}) -includes:${existingCandidateIds}` // exclude the existing candidates + // : selector.query, + query: selector.query, + size: selector.count, + }, + userId + ) + ) ) // deduplicate and flatten the items @@ -188,24 +196,38 @@ const getCandidatesList = async ( readableContent: htmlToMarkdown(item.readableContent), })) // convert the html content to markdown + logger.info( + 'dedupedCandidates: ', + dedupedCandidates.map((item) => item.title) + ) + + console.timeEnd('getCandidatesList') + if (dedupedCandidates.length === 0) { logger.info('No new candidates found') - if (existingCandidateIds) { - // reuse the existing candidates - const existingIds = existingCandidateIds.split(',') - return findLibraryItemsByIds(existingIds, userId) - } + // if (existingCandidateIds) { + // // reuse the existing candidates + // const existingIds = existingCandidateIds.split(',') + // return findLibraryItemsByIds(existingIds, userId) + // } // return empty array if no existing candidates return [] } - // store the ids in cache - const candidateIds = dedupedCandidates.map((item) => item.id).join(',') - await redisDataSource.redisClient?.set(key, candidateIds) + const selectedCandidates = randomSelectCandidates(dedupedCandidates) - return dedupedCandidates + logger.info( + 'selectedCandidates: ', + selectedCandidates.map((item) => item.title) + ) + + // // store the ids in cache + // const candidateIds = selectedCandidates.map((item) => item.id).join(',') + // await redisDataSource.redisClient?.set(key, candidateIds) + + return selectedCandidates } // Takes a list of library items, and uses a prompt to generate @@ -220,7 +242,7 @@ const createUserProfile = async ( }, }) - const contextualTemplate = PromptTemplate.fromTemplate( + const contextualTemplate = ChatPromptTemplate.fromTemplate( digestDefinition.zeroShot.userPreferencesProfilePrompt ) @@ -347,31 +369,58 @@ const chooseRankedSelections = (rankedCandidates: RankedItem[]) => { const summarizeItems = async ( rankedCandidates: RankedItem[] ): Promise => { - const llm = new OpenAI({ - modelName: 'gpt-4-0125-preview', - configuration: { - apiKey: process.env.OPENAI_API_KEY, - }, + console.time('summarizeItems') + // const llm = new OpenAI({ + // modelName: 'gpt-4-0125-preview', + // configuration: { + // apiKey: process.env.OPENAI_API_KEY, + // }, + // }) + + const llm = new ChatAnthropic({ + apiKey: process.env.CLAUDE_API_KEY, + model: 'claude-3-sonnet-20240229', }) - const contextualTemplate = PromptTemplate.fromTemplate( + const contextualTemplate = ChatPromptTemplate.fromTemplate( digestDefinition.summaryPrompt ) - const chain = contextualTemplate.pipe(llm) - // send all the ranked candidates to openAI at once in a batch - const summaries = await chain.batch( - rankedCandidates.map((item) => ({ - title: item.libraryItem.title, - author: item.libraryItem.author ?? '', - content: item.libraryItem.readableContent, // markdown content - })) + // // send all the ranked candidates to openAI at once in a batch + // const summaries = await chain.batch( + // rankedCandidates.map((item) => ({ + // title: item.libraryItem.title, + // author: item.libraryItem.author ?? '', + // content: item.libraryItem.readableContent, // markdown content + // })) + // ) + + const prompts = await Promise.all( + rankedCandidates.map(async (item) => { + try { + return await contextualTemplate.format({ + title: item.libraryItem.title, + author: item.libraryItem.author ?? '', + content: item.libraryItem.readableContent, // markdown content + }) + } catch (error) { + logger.error('summarizeItems error', error) + return '' + } + }) ) + logger.info('prompts: ', prompts) + + const summaries = await llm.batch(prompts) + logger.info('summaries: ', summaries) summaries.forEach( - (summary, index) => (rankedCandidates[index].summary = summary) + (summary, index) => + (rankedCandidates[index].summary = summary.content.toString()) ) + console.timeEnd('summarizeItems') + return rankedCandidates } @@ -380,6 +429,7 @@ const generateSpeechFiles = ( rankedItems: RankedItem[], options: SSMLOptions ): SpeechFile[] => { + console.time('generateSpeechFiles') // convert the summaries from markdown to HTML const converter = new showdown.Converter({ backslashEscapesHTMLTags: true, @@ -398,6 +448,8 @@ const generateSpeechFiles = ( }) }) + console.timeEnd('generateSpeechFiles') + return speechFiles } @@ -405,7 +457,10 @@ const generateSpeechFiles = ( // basic checks to make sure the summaries are good. const filterSummaries = (summaries: RankedItem[]): RankedItem[] => { return summaries.filter( - (item) => item.summary.length < item.libraryItem.readableContent.length + (item) => + wordsCount(item.summary) > 100 && + wordsCount(item.summary) < 1000 && + item.summary.length < item.libraryItem.readableContent.length ) } @@ -434,7 +489,11 @@ const generateByline = (summaries: RankedItem[]): string => .map((item) => item.libraryItem.author) .join(', ') -export const createDigestJob = async (jobData: CreateDigestJobData) => { +export const createDigest = async (jobData: CreateDigestData) => { + console.time('createDigestJob') + + // generate a unique id for the digest if not provided for scheduled jobs + const digestId = jobData.id ?? uuid() try { digestDefinition = await fetchDigestDefinition() @@ -445,18 +504,23 @@ export const createDigestJob = async (jobData: CreateDigestJobData) => { if (candidates.length === 0) { logger.info('No candidates found') return writeDigest(jobData.userId, { - id: jobData.id, + id: digestId, jobState: TaskState.Succeeded, title: 'No articles found', }) } - const userProfile = await findOrCreateUserProfile(jobData.userId) - const rankedCandidates = await rankCandidates(candidates, userProfile) - const { finalSelections, rankedTopics } = - chooseRankedSelections(rankedCandidates) + // const userProfile = await findOrCreateUserProfile(jobData.userId) + // const rankedCandidates = await rankCandidates(candidates, userProfile) + // const { finalSelections, rankedTopics } = + // chooseRankedSelections(rankedCandidates) - const summaries = await summarizeItems(finalSelections) + const selections = candidates.map((item) => ({ + topic: '', + libraryItem: item, + summary: '', + })) + const summaries = await summarizeItems(selections) const filteredSummaries = filterSummaries(summaries) @@ -467,7 +531,7 @@ export const createDigestJob = async (jobData: CreateDigestJobData) => { }) const title = generateTitle(summaries) const digest: Digest = { - id: jobData.id, + id: digestId, title, content: generateContent(summaries), jobState: TaskState.Succeeded, @@ -480,7 +544,8 @@ export const createDigestJob = async (jobData: CreateDigestJobData) => { wordCount: speechFiles[index].wordCount, })), createdAt: new Date(), - description: generateDescription(summaries, rankedTopics), + description: '', + // description: generateDescription(summaries, rankedTopics), byline: generateByline(summaries), urlsToAudio: [], } @@ -490,7 +555,7 @@ export const createDigestJob = async (jobData: CreateDigestJobData) => { logger.error('createDigestJob error', error) await writeDigest(jobData.userId, { - id: jobData.id, + id: digestId, jobState: TaskState.Failed, }) } finally { @@ -507,5 +572,7 @@ export const createDigestJob = async (jobData: CreateDigestJobData) => { await sendMulticastPushNotifications(jobData.userId, message, 'reminder') } + + console.timeEnd('createDigestJob') } } diff --git a/packages/api/src/jobs/trigger_rule.ts b/packages/api/src/jobs/trigger_rule.ts index dcc1c902f..27b3a6a64 100644 --- a/packages/api/src/jobs/trigger_rule.ts +++ b/packages/api/src/jobs/trigger_rule.ts @@ -243,14 +243,13 @@ const triggerActions = async ( } catch (error) { if (error instanceof RequiresSearchQueryError) { logger.info('Failed to filter items by metadata, running search query') - const searchResult = await searchLibraryItems( + results = await searchLibraryItems( { query: `includes:${data.id} AND (${rule.filter})`, size: 1, }, userId ) - results = searchResult.libraryItems } else { logger.error('Error filtering item events', error) await markRuleAsFailed(rule.id, userId) diff --git a/packages/api/src/queue-processor.ts b/packages/api/src/queue-processor.ts index 89448b96f..2d0bf5881 100644 --- a/packages/api/src/queue-processor.ts +++ b/packages/api/src/queue-processor.ts @@ -16,7 +16,7 @@ import { appDataSource } from './data_source' import { env } from './env' import { TaskState } from './generated/graphql' import { aiSummarize, AI_SUMMARIZE_JOB_NAME } from './jobs/ai-summarize' -import { createDigestJob, CREATE_DIGEST_JOB } from './jobs/ai/create_digest' +import { createDigest, CREATE_DIGEST_JOB } from './jobs/ai/create_digest' import { bulkAction, BULK_ACTION_JOB_NAME } from './jobs/bulk_action' import { callWebhook, CALL_WEBHOOK_JOB_NAME } from './jobs/call_webhook' import { @@ -181,7 +181,7 @@ export const createWorker = (connection: ConnectionOptions) => case FORWARD_EMAIL_JOB: return forwardEmailJob(job.data) case CREATE_DIGEST_JOB: - return createDigestJob(job.data) + return createDigest(job.data) default: logger.warning(`[queue-processor] unhandled job: ${job.name}`) } diff --git a/packages/api/src/resolvers/article/index.ts b/packages/api/src/resolvers/article/index.ts index 874d235da..660474c7b 100644 --- a/packages/api/src/resolvers/article/index.ts +++ b/packages/api/src/resolvers/article/index.ts @@ -74,7 +74,7 @@ import { countLibraryItems, createOrUpdateLibraryItem, findLibraryItemsByPrefix, - searchLibraryItems, + searchAndCountLibraryItems, softDeleteLibraryItem, sortParamsToSort, updateLibraryItem, @@ -675,7 +675,7 @@ export const searchResolver = authorized< return { errorCodes: [SearchErrorCode.QueryTooLong] } } - const { libraryItems, count } = await searchLibraryItems( + const { libraryItems, count } = await searchAndCountLibraryItems( { from: Number(startCursor), size: first + 1, // fetch one more item to get next cursor @@ -752,7 +752,7 @@ export const updatesSinceResolver = authorized< folder ? ' in:' + folder : '' } sort:${sort.by}-${sort.order}` - const { libraryItems, count } = await searchLibraryItems( + const { libraryItems, count } = await searchAndCountLibraryItems( { from: Number(startCursor), size: size + 1, // fetch one more item to get next cursor diff --git a/packages/api/src/routers/digest_router.ts b/packages/api/src/routers/digest_router.ts index 86f0a4636..6cbdd1e1d 100644 --- a/packages/api/src/routers/digest_router.ts +++ b/packages/api/src/routers/digest_router.ts @@ -1,6 +1,5 @@ import cors from 'cors' import express from 'express' -import { v4 as uuid } from 'uuid' import { env } from '../env' import { TaskState } from '../generated/graphql' import { CreateDigestJobSchedule } from '../jobs/ai/create_digest' @@ -66,12 +65,6 @@ export function digestRouter() { } try { - const user = await findActiveUser(userId) - if (!user) { - logger.info(`User not found: ${userId}`) - return res.sendStatus(401) - } - const feature = await findGrantedFeatureByName( FeatureName.AIDigest, userId @@ -95,9 +88,11 @@ export function digestRouter() { // enqueue job and return job id const result = await enqueueCreateDigest( { - id: uuid(), // generate job id userId, - ...data, + voices: data.voices, + language: data.language, + rate: data.rate, + libraryItemIds: data.libraryItemIds, }, data.schedule ) @@ -131,12 +126,6 @@ export function digestRouter() { } try { - const user = await findActiveUser(userId) - if (!user) { - logger.info(`User not found: ${userId}`) - return res.sendStatus(401) - } - const feature = await findGrantedFeatureByName( FeatureName.AIDigest, userId diff --git a/packages/api/src/services/library_item.ts b/packages/api/src/services/library_item.ts index 47ec7bba8..d91cfa10d 100644 --- a/packages/api/src/services/library_item.ts +++ b/packages/api/src/services/library_item.ts @@ -6,10 +6,10 @@ import { EntityManager, FindOptionsWhere, ObjectLiteral, - SelectQueryBuilder, } from 'typeorm' import { QueryDeepPartialEntity } from 'typeorm/query-builder/QueryPartialEntity' import { ReadingProgressDataSource } from '../datasources/reading_progress_data_source' +import { appDataSource } from '../data_source' import { EntityLabel } from '../entity/entity_label' import { Highlight } from '../entity/highlight' import { Label } from '../entity/label' @@ -18,12 +18,7 @@ import { env } from '../env' import { BulkActionType, InputMaybe, SortParams } from '../generated/graphql' import { createPubSubClient, EntityEvent, EntityType } from '../pubsub' import { redisDataSource } from '../redis_data_source' -import { - authTrx, - getColumns, - getRepository, - queryBuilderToRawSql, -} from '../repository' +import { authTrx, getColumns, queryBuilderToRawSql } from '../repository' import { libraryItemRepository } from '../repository/library_item' import { Merge, PickTuple } from '../util' import { deepDelete, setRecentlySavedItemInRedis } from '../utils/helpers' @@ -619,11 +614,13 @@ export const buildQueryString = ( return serialize(searchQuery) } -export const buildQuery = ( - queryBuilder: SelectQueryBuilder, +export const createSearchQueryBuilder = ( args: SearchArgs, - userId: string + userId: string, + em = appDataSource.manager ) => { + const queryBuilder = em.createQueryBuilder(LibraryItem, 'library_item') + // select all columns except content const selects: Select[] = getColumns(libraryItemRepository) .filter( @@ -688,44 +685,54 @@ export const buildQuery = ( orders.forEach((order) => { queryBuilder.addOrderBy(order.by, order.order, order.nulls) }) + + return queryBuilder } export const countLibraryItems = async (args: SearchArgs, userId: string) => { - const queryBuilder = - getRepository(LibraryItem).createQueryBuilder('library_item') - - buildQuery(queryBuilder, args, userId) - - return queryBuilder.getCount() + return authTrx( + async (tx) => createSearchQueryBuilder(args, userId, tx).getCount(), + undefined, + userId + ) } export const searchLibraryItems = async ( args: SearchArgs, userId: string -): Promise<{ libraryItems: LibraryItem[]; count: number }> => { +): Promise => { const { from = 0, size = 10 } = args + if (size === 0) { + // return only count if size is 0 because limit 0 is not allowed in typeorm + return [] + } + return authTrx( - async (tx) => { - const queryBuilder = tx.createQueryBuilder(LibraryItem, 'library_item') - buildQuery(queryBuilder, args, userId) - - const count = await queryBuilder.getCount() - if (size === 0) { - // return only count if size is 0 because limit 0 is not allowed in typeorm - return { libraryItems: [], count } - } - - // add pagination - const libraryItems = await queryBuilder.skip(from).take(size).getMany() - - return { libraryItems, count } - }, + async (tx) => + createSearchQueryBuilder(args, userId, tx) + .skip(from) + .take(size) + .getMany(), undefined, userId ) } +export const searchAndCountLibraryItems = async ( + args: SearchArgs, + userId: string +): Promise<{ libraryItems: LibraryItem[]; count: number }> => { + const count = await countLibraryItems(args, userId) + if (count === 0) { + return { libraryItems: [], count } + } + + const libraryItems = await searchLibraryItems(args, userId) + + return { libraryItems, count } +} + export const findRecentLibraryItems = async ( userId: string, limit = 1000, diff --git a/packages/api/src/utils/createTask.ts b/packages/api/src/utils/createTask.ts index a8c6e2fb1..d74f2ae10 100644 --- a/packages/api/src/utils/createTask.ts +++ b/packages/api/src/utils/createTask.ts @@ -18,7 +18,7 @@ import { } from '../generated/graphql' import { AISummarizeJobData, AI_SUMMARIZE_JOB_NAME } from '../jobs/ai-summarize' import { - CreateDigestJobData, + CreateDigestData, CreateDigestJobResponse, CreateDigestJobSchedule, CREATE_DIGEST_JOB, @@ -856,7 +856,7 @@ export const enqueueSendEmail = async (jobData: SendEmailJobData) => { } export const enqueueCreateDigest = async ( - data: CreateDigestJobData, + data: CreateDigestData, schedule?: CreateDigestJobSchedule ): Promise => { const queue = await getBackendQueue() @@ -864,6 +864,8 @@ export const enqueueCreateDigest = async ( throw new Error('No queue found') } + // generate unique id for the digest + data.id = uuid() // enqueue create digest job immediately const jobId = `${CREATE_DIGEST_JOB}_${data.userId}` const job = await queue.add(CREATE_DIGEST_JOB, data, { @@ -890,9 +892,9 @@ export const enqueueCreateDigest = async ( await writeDigest(data.userId, digest) if (schedule) { + // remove existing repeated job if any await Promise.all( Object.keys(CRON_PATTERNS).map(async (key) => { - // remove existing repeated job if any const isDeleted = await queue.removeRepeatable( CREATE_DIGEST_JOB, { @@ -909,6 +911,9 @@ export const enqueueCreateDigest = async ( ) // schedule repeated job + // delete the digest id to avoid duplication + delete data.id + const job = await queue.add(CREATE_DIGEST_JOB, data, { attempts: 1, priority: getJobPriority(CREATE_DIGEST_JOB), diff --git a/packages/api/test/routers/auth.test.ts b/packages/api/test/routers/auth.test.ts index 32b438047..415231149 100644 --- a/packages/api/test/routers/auth.test.ts +++ b/packages/api/test/routers/auth.test.ts @@ -6,7 +6,7 @@ import { userRepository } from '../../src/repository/user' import { isValidSignupRequest } from '../../src/routers/auth/auth_router' import { AuthProvider } from '../../src/routers/auth/auth_types' import { createPendingUserToken } from '../../src/routers/auth/jwt_helpers' -import { searchLibraryItems } from '../../src/services/library_item' +import { searchAndCountLibraryItems } from '../../src/services/library_item' import { deleteUser, updateUser } from '../../src/services/user' import { comparePassword, @@ -528,7 +528,7 @@ describe('auth router', () => { 'web' ).expect(200) const user = await userRepository.findOneByOrFail({ name }) - const { count } = await searchLibraryItems( + const { count } = await searchAndCountLibraryItems( { query: 'in:inbox sort:read-desc is:reading' }, user.id ) @@ -552,7 +552,10 @@ describe('auth router', () => { 'ios' ).expect(200) const user = await userRepository.findOneByOrFail({ name }) - const { count } = await searchLibraryItems({ query: 'in:all' }, user.id) + const { count } = await searchAndCountLibraryItems( + { query: 'in:all' }, + user.id + ) expect(count).to.eql(4) }) diff --git a/packages/text-to-speech/mocha-config.json b/packages/text-to-speech/mocha-config.json index 5a2631f42..8e24eb08b 100644 --- a/packages/text-to-speech/mocha-config.json +++ b/packages/text-to-speech/mocha-config.json @@ -1,4 +1,5 @@ { "extension": ["ts"], - "spec": "test/**/*.test.ts" + "spec": "test/**/*.test.ts", + "timeout": 10000 } diff --git a/packages/text-to-speech/src/htmlToSsml.ts b/packages/text-to-speech/src/htmlToSsml.ts index beebd4a71..e8e379a8b 100644 --- a/packages/text-to-speech/src/htmlToSsml.ts +++ b/packages/text-to-speech/src/htmlToSsml.ts @@ -5,6 +5,7 @@ import { SentenceTokenizerNew, WordPunctTokenizer, } from 'natural' +import { isElement } from 'underscore' // this code needs to be kept in sync with the // frontend code in: useReadingProgressAnchor @@ -49,6 +50,7 @@ const DEFAULT_LANGUAGE = 'en-US' const DEFAULT_VOICE = 'en-US-JennyNeural' const DEFAULT_SECONDARY_VOICE = 'en-US-GuyNeural' const DEFAULT_RATE = '1.1' +const ELEMENT_INDEX_ATTRIBUTE = 'data-omnivore-anchor-idx' const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [ 'omnivore-highlight-id', @@ -74,6 +76,19 @@ const TOP_LEVEL_TAGS = [ 'LI', ] +const SKIP_TAGS = [ + 'SCRIPT', + 'STYLE', + 'IMG', + 'FIGURE', + 'FIGCAPTION', + 'IFRAME', + 'CODE', +] + +const getAnchorIndex = (element: Element): number => + Number(element.getAttribute(ELEMENT_INDEX_ATTRIBUTE)) + function parseDomTree(pageNode: Element) { if (!pageNode || pageNode.childNodes.length == 0) { console.log('no child nodes found') @@ -108,7 +123,7 @@ function parseDomTree(pageNode: Element) { visitedNodeList.forEach((node, index) => { // We start at index 3, because the frontend starts two nodes above us // on the #readability-page-1 element that wraps the entire content. - node.setAttribute('data-omnivore-anchor-idx', (index + 3).toString()) + node.setAttribute(ELEMENT_INDEX_ATTRIBUTE, (index + 3).toString()) }) return visitedNodeList } @@ -146,18 +161,8 @@ function emitElement( element: Element, isTopLevel: boolean ) { - const SKIP_TAGS = [ - 'SCRIPT', - 'STYLE', - 'IMG', - 'FIGURE', - 'FIGCAPTION', - 'IFRAME', - 'CODE', - ] - const topLevelTags = ssmlTagsForTopLevelElement() - const idx = element.getAttribute('data-omnivore-anchor-idx') + const idx = getAnchorIndex(element) let maxVisitedIdx = Number(idx) if (isTopLevel) { @@ -166,6 +171,11 @@ function emitElement( for (const child of Array.from(element.childNodes)) { if (SKIP_TAGS.indexOf(child.nodeName) >= 0) { + // Skip unwanted tags and update the index + if (isElement(child)) { + const childIdx = getAnchorIndex(child) + maxVisitedIdx = Math.max(maxVisitedIdx, childIdx) + } continue } @@ -180,8 +190,8 @@ function emitElement( } emitTextNode(textItems, cleanedText, child) } - if (child.nodeType == 1 /* Node.ELEMENT_NODE */) { - maxVisitedIdx = emitElement(textItems, child as HTMLElement, false) + if (isElement(child)) { + maxVisitedIdx = emitElement(textItems, child, false) if (child.nodeName === 'LI') { // add a new line after each list item emit(textItems, '\n') @@ -265,6 +275,11 @@ export const stripEmojis = (text: string): string => { return text.replace(emojiRegex, '').replace(/\s+/g, ' ') } +const filterUtterances = (utterances: Utterance[]): Utterance[] => { + const punctuationOrSpaceOnly = /^[\s.,;:!?]+$/ + return utterances.filter((u) => !punctuationOrSpaceOnly.test(u.text)) +} + const textToUtterances = ({ wordTokenizer, idx, @@ -309,7 +324,10 @@ const textToUtterances = ({ const sentenceTokenizer = new SentenceTokenizerNew() sentences = sentenceTokenizer.tokenize(text) } catch (err) { - console.debug('Unable to tokenize sentences') + console.log( + 'Unable to tokenize sentences, falling back to old tokenizer', + text + ) // fallback to old sentence tokenizer const sentenceTokenizer = new SentenceTokenizer() sentences = sentenceTokenizer.tokenize(text) @@ -369,6 +387,18 @@ const replaceSmartQuotes = (text: string): string => { return text.replace(/[\u2018\u2019]/g, "'").replace(/[\u201C\u201D]/g, '"') } +// get the max idx of the element and its children +const getMaxVisitedIdx = (element: Element): number => { + let maxVisitedIdx = getAnchorIndex(element) + for (const child of Array.from(element.childNodes)) { + if (isElement(child)) { + maxVisitedIdx = Math.max(maxVisitedIdx, getMaxVisitedIdx(child)) + } + } + + return maxVisitedIdx +} + export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => { const { title, content, options } = htmlInput console.log('creating speech file with options:', options) @@ -421,6 +451,12 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => { const textItems: string[] = [] const node = parsedNodes[i - 3] + // skip unwanted tags and update the index + if (SKIP_TAGS.includes(node.nodeName)) { + i = getMaxVisitedIdx(node) + continue + } + if (TOP_LEVEL_TAGS.includes(node.nodeName) || hasSignificantText(node)) { // use paragraph as anchor const idx = i.toString() @@ -441,10 +477,12 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => { } } + const filteredUtterances = filterUtterances(utterances) + return { wordCount: wordOffset, language, defaultVoice, - utterances, + utterances: filteredUtterances, } } diff --git a/packages/text-to-speech/test/htmlToSsml.test.ts b/packages/text-to-speech/test/htmlToSsml.test.ts index 88c23d386..62063c3d2 100644 --- a/packages/text-to-speech/test/htmlToSsml.test.ts +++ b/packages/text-to-speech/test/htmlToSsml.test.ts @@ -350,6 +350,41 @@ describe('convert HTML to Speech file', () => { options: TEST_OPTIONS, }) expect(speechFile.utterances).to.have.lengthOf(2) - expect(speechFile.utterances[1].text).to.eql('McMeekin makes an extended case that Stalin was preparing to attack Nazi Germany when Hitler attacked him, that the two dictators were basically in a race to see who could mobilize to betray the other first — and that the initial Soviet debacle in 1941 happened in part because Stalin was also pushing his military toward an offensive alignment, and they were caught in a "mid-mobilization limbo."') + expect(speechFile.utterances[1].text).to.eql( + 'McMeekin makes an extended case that Stalin was preparing to attack Nazi Germany when Hitler attacked him, that the two dictators were basically in a race to see who could mobilize to betray the other first — and that the initial Soviet debacle in 1941 happened in part because Stalin was also pushing his military toward an offensive alignment, and they were caught in a "mid-mobilization limbo."' + ) + }) + + it('skip unwanted elements', () => { + const html = `
+

This is a test.

+ + + +
test
+
` + + const speechFile = htmlToSpeechFile({ + content: html, + options: TEST_OPTIONS, + }) + expect(speechFile.utterances).to.have.lengthOf(1) + expect(speechFile.utterances[0].text).to.eql('This is a test.') + }) + + it('filters out utterances with only punctuation or whitespace', () => { + const html = `
+

This is a test.

+

.

+

+
` + + const speechFile = htmlToSpeechFile({ + content: html, + options: TEST_OPTIONS, + }) + + expect(speechFile.utterances).to.have.lengthOf(1) + expect(speechFile.utterances[0].text).to.eql('This is a test.') }) }) diff --git a/packages/text-to-speech/tsconfig.json b/packages/text-to-speech/tsconfig.json index 5b6ea3e00..7634a8d53 100644 --- a/packages/text-to-speech/tsconfig.json +++ b/packages/text-to-speech/tsconfig.json @@ -6,5 +6,5 @@ // Generate d.ts files "declaration": true }, - "include": ["src"] + "include": ["src", "test"] } diff --git a/yarn.lock b/yarn.lock index 08b76b456..960c4280a 100644 --- a/yarn.lock +++ b/yarn.lock @@ -50,6 +50,20 @@ lodash "^4.17.21" resize-observer-polyfill "^1.5.1" +"@anthropic-ai/sdk@^0.20.1": + version "0.20.7" + resolved "https://registry.yarnpkg.com/@anthropic-ai/sdk/-/sdk-0.20.7.tgz#b19b0e66ba070f928bbf583c06d76e6efdd93d5e" + integrity sha512-uyc+3WGLpe8ur6mSIKSab7P9JdBerTdmqb7popc/yROYLLCW/Ykyw4ZfjmN/cLmxjnAKnv5YUngzbPM0BJuGjg== + dependencies: + "@types/node" "^18.11.18" + "@types/node-fetch" "^2.6.4" + abort-controller "^3.0.0" + agentkeepalive "^4.2.1" + form-data-encoder "1.7.2" + formdata-node "^4.3.2" + node-fetch "^2.6.7" + web-streams-polyfill "^3.2.1" + "@anthropic-ai/sdk@^0.9.1": version "0.9.1" resolved "https://registry.yarnpkg.com/@anthropic-ai/sdk/-/sdk-0.9.1.tgz#b2d2b7bf05c90dce502c9a2e869066870f69ba88" @@ -3774,6 +3788,17 @@ dependencies: lodash "^4.17.21" +"@langchain/anthropic@^0.1.16": + version "0.1.16" + resolved "https://registry.yarnpkg.com/@langchain/anthropic/-/anthropic-0.1.16.tgz#c2a9d3dd4e02df7118dd97cf2503c9bd1a4de5ad" + integrity sha512-vCbwkZ3pkMSKf67fBgNlslvuW9f3EZGBbO8Ic2etgX3xFl6L0WuMtfS26P1FCDpRwaKuC1BrCj2aLAeMzMq/Fg== + dependencies: + "@anthropic-ai/sdk" "^0.20.1" + "@langchain/core" "~0.1.56" + fast-xml-parser "^4.3.5" + zod "^3.22.4" + zod-to-json-schema "^3.22.4" + "@langchain/community@~0.0.33": version "0.0.33" resolved "https://registry.yarnpkg.com/@langchain/community/-/community-0.0.33.tgz#5568fe36b1e2f8947d49414d47e14a27da5b65c9" @@ -3803,6 +3828,24 @@ zod "^3.22.4" zod-to-json-schema "^3.22.3" +"@langchain/core@~0.1.56": + version "0.1.61" + resolved "https://registry.yarnpkg.com/@langchain/core/-/core-0.1.61.tgz#9313363e04f1c6981a938b2909c44ce6fceb2736" + integrity sha512-C8OkAly+ugvXsL8TACCmFv9WTTcT4gvQaG6NbrXCOzibBCywfxxcTqEMOyg3zIKpxHEmR0DHqh0OiJRHocnsCg== + dependencies: + ansi-styles "^5.0.0" + camelcase "6" + decamelize "1.2.0" + js-tiktoken "^1.0.8" + langsmith "~0.1.7" + ml-distance "^4.0.0" + mustache "^4.2.0" + p-queue "^6.6.2" + p-retry "4" + uuid "^9.0.0" + zod "^3.22.4" + zod-to-json-schema "^3.22.3" + "@langchain/openai@^0.0.14": version "0.0.14" resolved "https://registry.yarnpkg.com/@langchain/openai/-/openai-0.0.14.tgz#27a6ba83f6b754391868b22f3b90cd440038acf0" @@ -15694,7 +15737,7 @@ fast-xml-parser@^4.2.2, fast-xml-parser@^4.3.0: dependencies: strnum "^1.0.5" -fast-xml-parser@^4.3.2: +fast-xml-parser@^4.3.2, fast-xml-parser@^4.3.5: version "4.3.6" resolved "https://registry.yarnpkg.com/fast-xml-parser/-/fast-xml-parser-4.3.6.tgz#190f9d99097f0c8f2d3a0e681a10404afca052ff" integrity sha512-M2SovcRxD4+vC493Uc2GZVcZaj66CCJhWurC4viynVSTvrpErCShNcDz1lAho6n9REQKvL/ll4A4/fw6Y9z8nw== @@ -22822,6 +22865,11 @@ multimatch@5.0.0: arrify "^2.0.1" minimatch "^3.0.4" +mustache@^4.2.0: + version "4.2.0" + resolved "https://registry.yarnpkg.com/mustache/-/mustache-4.2.0.tgz#e5892324d60a12ec9c2a73359edca52972bf6f64" + integrity sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ== + mute-stream@0.0.8, mute-stream@~0.0.4: version "0.0.8" resolved "https://registry.yarnpkg.com/mute-stream/-/mute-stream-0.0.8.tgz#1630c42b2251ff81e2a283de96a5497ea92e5e0d" @@ -32217,6 +32265,11 @@ zod-to-json-schema@^3.22.3: resolved "https://registry.yarnpkg.com/zod-to-json-schema/-/zod-to-json-schema-3.22.4.tgz#f8cc691f6043e9084375e85fb1f76ebafe253d70" integrity sha512-2Ed5dJ+n/O3cU383xSY28cuVi0BCQhF8nYqWU5paEpl7fVdqdAmiLdqLyfblbNdfOFwFfi/mqU4O1pwc60iBhQ== +zod-to-json-schema@^3.22.4: + version "3.23.0" + resolved "https://registry.yarnpkg.com/zod-to-json-schema/-/zod-to-json-schema-3.23.0.tgz#4fc60e88d3c709eedbfaae3f92f8a7bf786469f2" + integrity sha512-az0uJ243PxsRIa2x1WmNE/pnuA05gUq/JB8Lwe1EDCCL/Fz9MgjYQ0fPlyc2Tcv6aF2ZA7WM5TWaRZVEFaAIag== + zod@^3.22.3, zod@^3.22.4: version "3.22.4" resolved "https://registry.yarnpkg.com/zod/-/zod-3.22.4.tgz#f31c3a9386f61b1f228af56faa9255e845cf3fff"