Merge pull request #3876 from omnivore-app/fix/digest

fix/digest
This commit is contained in:
Hongbo Wu
2024-05-01 14:49:37 +08:00
committed by GitHub
14 changed files with 348 additions and 150 deletions

View File

@ -22,6 +22,7 @@
"@google-cloud/storage": "^7.0.1",
"@google-cloud/tasks": "^4.0.0",
"@graphql-tools/utils": "^9.1.1",
"@langchain/anthropic": "^0.1.16",
"@langchain/openai": "^0.0.14",
"@notionhq/client": "^2.2.14",
"@omnivore/content-handler": "1.0.0",

View File

@ -1,5 +1,6 @@
import { ChatAnthropic } from '@langchain/anthropic'
import { JsonOutputParser } from '@langchain/core/output_parsers'
import { PromptTemplate } from '@langchain/core/prompts'
import { ChatPromptTemplate, PromptTemplate } from '@langchain/core/prompts'
import { OpenAI } from '@langchain/openai'
import {
htmlToSpeechFile,
@ -8,6 +9,7 @@ import {
} from '@omnivore/text-to-speech-handler'
import axios from 'axios'
import showdown from 'showdown'
import { v4 as uuid } from 'uuid'
import yaml from 'yaml'
import { LibraryItem } from '../../entity/library_item'
import { TaskState } from '../../generated/graphql'
@ -18,14 +20,15 @@ import {
searchLibraryItems,
} from '../../services/library_item'
import { findDeviceTokensByUserId } from '../../services/user_device_tokens'
import { wordsCount } from '../../utils/helpers'
import { logger } from '../../utils/logger'
import { htmlToMarkdown } from '../../utils/parser'
import { sendMulticastPushNotifications } from '../../utils/sendNotification'
export type CreateDigestJobSchedule = 'daily' | 'weekly'
export interface CreateDigestJobData {
id: string
export interface CreateDigestData {
id?: string
userId: string
voices?: string[]
language?: string
@ -113,18 +116,17 @@ const getPreferencesList = async (userId: string): Promise<LibraryItem[]> => {
// reason: "some older items that were interacted with"
const preferences = await Promise.all(
digestDefinition.preferenceSelectors.map(async (selector) => {
// use the selector to fetch items
const results = await searchLibraryItems(
{
query: selector.query,
size: selector.count,
},
userId
)
return results.libraryItems
})
digestDefinition.preferenceSelectors.map(
async (selector) =>
// use the selector to fetch items
await searchLibraryItems(
{
query: selector.query,
size: selector.count,
},
userId
)
)
)
// deduplicate and flatten the items
@ -137,11 +139,17 @@ const getPreferencesList = async (userId: string): Promise<LibraryItem[]> => {
return dedupedPreferences
}
/**
 * Randomly choose at most 25 candidates from the list.
 *
 * Uses a Fisher–Yates shuffle on a copy of the input. The previous
 * `candidates.sort(() => 0.5 - Math.random())` approach both mutated the
 * caller's array in place and produced a biased (non-uniform) ordering,
 * because a random comparator violates the sort comparator contract.
 */
const randomSelectCandidates = (candidates: LibraryItem[]): LibraryItem[] => {
  const pool = [...candidates] // shuffle a copy; never mutate the input
  for (let i = pool.length - 1; i > 0; i--) {
    const j = Math.floor(Math.random() * (i + 1))
    ;[pool[i], pool[j]] = [pool[j], pool[i]]
  }
  return pool.slice(0, 25)
}
// Makes multiple DB queries and combines the results
const getCandidatesList = async (
userId: string,
selectedLibraryItemIds?: string[]
): Promise<LibraryItem[]> => {
console.time('getCandidatesList')
// use the queries from the digest definitions to lookup preferences
// There should be a list of multiple queries we use. For now we can
// hardcode these queries:
@ -153,28 +161,28 @@ const getCandidatesList = async (
return findLibraryItemsByIds(selectedLibraryItemIds, userId)
}
// get the existing candidate ids from cache
const key = `digest:${userId}:existingCandidateIds`
const existingCandidateIds = await redisDataSource.redisClient?.get(key)
// // get the existing candidate ids from cache
// const key = `digest:${userId}:existingCandidateIds`
// const existingCandidateIds = await redisDataSource.redisClient?.get(key)
logger.info('existingCandidateIds: ', { existingCandidateIds })
// logger.info('existingCandidateIds: ', { existingCandidateIds })
const candidates = await Promise.all(
digestDefinition.candidateSelectors.map(async (selector) => {
// use the selector to fetch items
const results = await searchLibraryItems(
{
includeContent: true,
query: existingCandidateIds
? `(${selector.query}) -includes:${existingCandidateIds}` // exclude the existing candidates
: selector.query,
size: selector.count,
},
userId
)
return results.libraryItems
})
digestDefinition.candidateSelectors.map(
async (selector) =>
// use the selector to fetch items
await searchLibraryItems(
{
includeContent: true,
// query: existingCandidateIds
// ? `(${selector.query}) -includes:${existingCandidateIds}` // exclude the existing candidates
// : selector.query,
query: selector.query,
size: selector.count,
},
userId
)
)
)
// deduplicate and flatten the items
@ -188,24 +196,38 @@ const getCandidatesList = async (
readableContent: htmlToMarkdown(item.readableContent),
})) // convert the html content to markdown
logger.info(
'dedupedCandidates: ',
dedupedCandidates.map((item) => item.title)
)
console.timeEnd('getCandidatesList')
if (dedupedCandidates.length === 0) {
logger.info('No new candidates found')
if (existingCandidateIds) {
// reuse the existing candidates
const existingIds = existingCandidateIds.split(',')
return findLibraryItemsByIds(existingIds, userId)
}
// if (existingCandidateIds) {
// // reuse the existing candidates
// const existingIds = existingCandidateIds.split(',')
// return findLibraryItemsByIds(existingIds, userId)
// }
// return empty array if no existing candidates
return []
}
// store the ids in cache
const candidateIds = dedupedCandidates.map((item) => item.id).join(',')
await redisDataSource.redisClient?.set(key, candidateIds)
const selectedCandidates = randomSelectCandidates(dedupedCandidates)
return dedupedCandidates
logger.info(
'selectedCandidates: ',
selectedCandidates.map((item) => item.title)
)
// // store the ids in cache
// const candidateIds = selectedCandidates.map((item) => item.id).join(',')
// await redisDataSource.redisClient?.set(key, candidateIds)
return selectedCandidates
}
// Takes a list of library items, and uses a prompt to generate
@ -220,7 +242,7 @@ const createUserProfile = async (
},
})
const contextualTemplate = PromptTemplate.fromTemplate(
const contextualTemplate = ChatPromptTemplate.fromTemplate(
digestDefinition.zeroShot.userPreferencesProfilePrompt
)
@ -347,31 +369,58 @@ const chooseRankedSelections = (rankedCandidates: RankedItem[]) => {
const summarizeItems = async (
rankedCandidates: RankedItem[]
): Promise<RankedItem[]> => {
const llm = new OpenAI({
modelName: 'gpt-4-0125-preview',
configuration: {
apiKey: process.env.OPENAI_API_KEY,
},
console.time('summarizeItems')
// const llm = new OpenAI({
// modelName: 'gpt-4-0125-preview',
// configuration: {
// apiKey: process.env.OPENAI_API_KEY,
// },
// })
const llm = new ChatAnthropic({
apiKey: process.env.CLAUDE_API_KEY,
model: 'claude-3-sonnet-20240229',
})
const contextualTemplate = PromptTemplate.fromTemplate(
const contextualTemplate = ChatPromptTemplate.fromTemplate(
digestDefinition.summaryPrompt
)
const chain = contextualTemplate.pipe(llm)
// send all the ranked candidates to openAI at once in a batch
const summaries = await chain.batch(
rankedCandidates.map((item) => ({
title: item.libraryItem.title,
author: item.libraryItem.author ?? '',
content: item.libraryItem.readableContent, // markdown content
}))
// // send all the ranked candidates to openAI at once in a batch
// const summaries = await chain.batch(
// rankedCandidates.map((item) => ({
// title: item.libraryItem.title,
// author: item.libraryItem.author ?? '',
// content: item.libraryItem.readableContent, // markdown content
// }))
// )
const prompts = await Promise.all(
rankedCandidates.map(async (item) => {
try {
return await contextualTemplate.format({
title: item.libraryItem.title,
author: item.libraryItem.author ?? '',
content: item.libraryItem.readableContent, // markdown content
})
} catch (error) {
logger.error('summarizeItems error', error)
return ''
}
})
)
logger.info('prompts: ', prompts)
const summaries = await llm.batch(prompts)
logger.info('summaries: ', summaries)
summaries.forEach(
(summary, index) => (rankedCandidates[index].summary = summary)
(summary, index) =>
(rankedCandidates[index].summary = summary.content.toString())
)
console.timeEnd('summarizeItems')
return rankedCandidates
}
@ -380,6 +429,7 @@ const generateSpeechFiles = (
rankedItems: RankedItem[],
options: SSMLOptions
): SpeechFile[] => {
console.time('generateSpeechFiles')
// convert the summaries from markdown to HTML
const converter = new showdown.Converter({
backslashEscapesHTMLTags: true,
@ -398,6 +448,8 @@ const generateSpeechFiles = (
})
})
console.timeEnd('generateSpeechFiles')
return speechFiles
}
@ -405,7 +457,10 @@ const generateSpeechFiles = (
// basic checks to make sure the summaries are good.
const filterSummaries = (summaries: RankedItem[]): RankedItem[] => {
return summaries.filter(
(item) => item.summary.length < item.libraryItem.readableContent.length
(item) =>
wordsCount(item.summary) > 100 &&
wordsCount(item.summary) < 1000 &&
item.summary.length < item.libraryItem.readableContent.length
)
}
@ -434,7 +489,11 @@ const generateByline = (summaries: RankedItem[]): string =>
.map((item) => item.libraryItem.author)
.join(', ')
export const createDigestJob = async (jobData: CreateDigestJobData) => {
export const createDigest = async (jobData: CreateDigestData) => {
console.time('createDigestJob')
// generate a unique id for the digest if not provided for scheduled jobs
const digestId = jobData.id ?? uuid()
try {
digestDefinition = await fetchDigestDefinition()
@ -445,18 +504,23 @@ export const createDigestJob = async (jobData: CreateDigestJobData) => {
if (candidates.length === 0) {
logger.info('No candidates found')
return writeDigest(jobData.userId, {
id: jobData.id,
id: digestId,
jobState: TaskState.Succeeded,
title: 'No articles found',
})
}
const userProfile = await findOrCreateUserProfile(jobData.userId)
const rankedCandidates = await rankCandidates(candidates, userProfile)
const { finalSelections, rankedTopics } =
chooseRankedSelections(rankedCandidates)
// const userProfile = await findOrCreateUserProfile(jobData.userId)
// const rankedCandidates = await rankCandidates(candidates, userProfile)
// const { finalSelections, rankedTopics } =
// chooseRankedSelections(rankedCandidates)
const summaries = await summarizeItems(finalSelections)
const selections = candidates.map((item) => ({
topic: '',
libraryItem: item,
summary: '',
}))
const summaries = await summarizeItems(selections)
const filteredSummaries = filterSummaries(summaries)
@ -467,7 +531,7 @@ export const createDigestJob = async (jobData: CreateDigestJobData) => {
})
const title = generateTitle(summaries)
const digest: Digest = {
id: jobData.id,
id: digestId,
title,
content: generateContent(summaries),
jobState: TaskState.Succeeded,
@ -480,7 +544,8 @@ export const createDigestJob = async (jobData: CreateDigestJobData) => {
wordCount: speechFiles[index].wordCount,
})),
createdAt: new Date(),
description: generateDescription(summaries, rankedTopics),
description: '',
// description: generateDescription(summaries, rankedTopics),
byline: generateByline(summaries),
urlsToAudio: [],
}
@ -490,7 +555,7 @@ export const createDigestJob = async (jobData: CreateDigestJobData) => {
logger.error('createDigestJob error', error)
await writeDigest(jobData.userId, {
id: jobData.id,
id: digestId,
jobState: TaskState.Failed,
})
} finally {
@ -507,5 +572,7 @@ export const createDigestJob = async (jobData: CreateDigestJobData) => {
await sendMulticastPushNotifications(jobData.userId, message, 'reminder')
}
console.timeEnd('createDigestJob')
}
}

View File

@ -243,14 +243,13 @@ const triggerActions = async (
} catch (error) {
if (error instanceof RequiresSearchQueryError) {
logger.info('Failed to filter items by metadata, running search query')
const searchResult = await searchLibraryItems(
results = await searchLibraryItems(
{
query: `includes:${data.id} AND (${rule.filter})`,
size: 1,
},
userId
)
results = searchResult.libraryItems
} else {
logger.error('Error filtering item events', error)
await markRuleAsFailed(rule.id, userId)

View File

@ -16,7 +16,7 @@ import { appDataSource } from './data_source'
import { env } from './env'
import { TaskState } from './generated/graphql'
import { aiSummarize, AI_SUMMARIZE_JOB_NAME } from './jobs/ai-summarize'
import { createDigestJob, CREATE_DIGEST_JOB } from './jobs/ai/create_digest'
import { createDigest, CREATE_DIGEST_JOB } from './jobs/ai/create_digest'
import { bulkAction, BULK_ACTION_JOB_NAME } from './jobs/bulk_action'
import { callWebhook, CALL_WEBHOOK_JOB_NAME } from './jobs/call_webhook'
import {
@ -181,7 +181,7 @@ export const createWorker = (connection: ConnectionOptions) =>
case FORWARD_EMAIL_JOB:
return forwardEmailJob(job.data)
case CREATE_DIGEST_JOB:
return createDigestJob(job.data)
return createDigest(job.data)
default:
logger.warning(`[queue-processor] unhandled job: ${job.name}`)
}

View File

@ -74,7 +74,7 @@ import {
countLibraryItems,
createOrUpdateLibraryItem,
findLibraryItemsByPrefix,
searchLibraryItems,
searchAndCountLibraryItems,
softDeleteLibraryItem,
sortParamsToSort,
updateLibraryItem,
@ -675,7 +675,7 @@ export const searchResolver = authorized<
return { errorCodes: [SearchErrorCode.QueryTooLong] }
}
const { libraryItems, count } = await searchLibraryItems(
const { libraryItems, count } = await searchAndCountLibraryItems(
{
from: Number(startCursor),
size: first + 1, // fetch one more item to get next cursor
@ -752,7 +752,7 @@ export const updatesSinceResolver = authorized<
folder ? ' in:' + folder : ''
} sort:${sort.by}-${sort.order}`
const { libraryItems, count } = await searchLibraryItems(
const { libraryItems, count } = await searchAndCountLibraryItems(
{
from: Number(startCursor),
size: size + 1, // fetch one more item to get next cursor

View File

@ -1,6 +1,5 @@
import cors from 'cors'
import express from 'express'
import { v4 as uuid } from 'uuid'
import { env } from '../env'
import { TaskState } from '../generated/graphql'
import { CreateDigestJobSchedule } from '../jobs/ai/create_digest'
@ -66,12 +65,6 @@ export function digestRouter() {
}
try {
const user = await findActiveUser(userId)
if (!user) {
logger.info(`User not found: ${userId}`)
return res.sendStatus(401)
}
const feature = await findGrantedFeatureByName(
FeatureName.AIDigest,
userId
@ -95,9 +88,11 @@ export function digestRouter() {
// enqueue job and return job id
const result = await enqueueCreateDigest(
{
id: uuid(), // generate job id
userId,
...data,
voices: data.voices,
language: data.language,
rate: data.rate,
libraryItemIds: data.libraryItemIds,
},
data.schedule
)
@ -131,12 +126,6 @@ export function digestRouter() {
}
try {
const user = await findActiveUser(userId)
if (!user) {
logger.info(`User not found: ${userId}`)
return res.sendStatus(401)
}
const feature = await findGrantedFeatureByName(
FeatureName.AIDigest,
userId

View File

@ -6,10 +6,10 @@ import {
EntityManager,
FindOptionsWhere,
ObjectLiteral,
SelectQueryBuilder,
} from 'typeorm'
import { QueryDeepPartialEntity } from 'typeorm/query-builder/QueryPartialEntity'
import { ReadingProgressDataSource } from '../datasources/reading_progress_data_source'
import { appDataSource } from '../data_source'
import { EntityLabel } from '../entity/entity_label'
import { Highlight } from '../entity/highlight'
import { Label } from '../entity/label'
@ -18,12 +18,7 @@ import { env } from '../env'
import { BulkActionType, InputMaybe, SortParams } from '../generated/graphql'
import { createPubSubClient, EntityEvent, EntityType } from '../pubsub'
import { redisDataSource } from '../redis_data_source'
import {
authTrx,
getColumns,
getRepository,
queryBuilderToRawSql,
} from '../repository'
import { authTrx, getColumns, queryBuilderToRawSql } from '../repository'
import { libraryItemRepository } from '../repository/library_item'
import { Merge, PickTuple } from '../util'
import { deepDelete, setRecentlySavedItemInRedis } from '../utils/helpers'
@ -619,11 +614,13 @@ export const buildQueryString = (
return serialize(searchQuery)
}
export const buildQuery = (
queryBuilder: SelectQueryBuilder<LibraryItem>,
export const createSearchQueryBuilder = (
args: SearchArgs,
userId: string
userId: string,
em = appDataSource.manager
) => {
const queryBuilder = em.createQueryBuilder(LibraryItem, 'library_item')
// select all columns except content
const selects: Select[] = getColumns(libraryItemRepository)
.filter(
@ -688,44 +685,54 @@ export const buildQuery = (
orders.forEach((order) => {
queryBuilder.addOrderBy(order.by, order.order, order.nulls)
})
return queryBuilder
}
export const countLibraryItems = async (args: SearchArgs, userId: string) => {
const queryBuilder =
getRepository(LibraryItem).createQueryBuilder('library_item')
buildQuery(queryBuilder, args, userId)
return queryBuilder.getCount()
return authTrx(
async (tx) => createSearchQueryBuilder(args, userId, tx).getCount(),
undefined,
userId
)
}
export const searchLibraryItems = async (
args: SearchArgs,
userId: string
): Promise<{ libraryItems: LibraryItem[]; count: number }> => {
): Promise<LibraryItem[]> => {
const { from = 0, size = 10 } = args
if (size === 0) {
// return only count if size is 0 because limit 0 is not allowed in typeorm
return []
}
return authTrx(
async (tx) => {
const queryBuilder = tx.createQueryBuilder(LibraryItem, 'library_item')
buildQuery(queryBuilder, args, userId)
const count = await queryBuilder.getCount()
if (size === 0) {
// return only count if size is 0 because limit 0 is not allowed in typeorm
return { libraryItems: [], count }
}
// add pagination
const libraryItems = await queryBuilder.skip(from).take(size).getMany()
return { libraryItems, count }
},
async (tx) =>
createSearchQueryBuilder(args, userId, tx)
.skip(from)
.take(size)
.getMany(),
undefined,
userId
)
}
export const searchAndCountLibraryItems = async (
args: SearchArgs,
userId: string
): Promise<{ libraryItems: LibraryItem[]; count: number }> => {
const count = await countLibraryItems(args, userId)
if (count === 0) {
return { libraryItems: [], count }
}
const libraryItems = await searchLibraryItems(args, userId)
return { libraryItems, count }
}
export const findRecentLibraryItems = async (
userId: string,
limit = 1000,

View File

@ -18,7 +18,7 @@ import {
} from '../generated/graphql'
import { AISummarizeJobData, AI_SUMMARIZE_JOB_NAME } from '../jobs/ai-summarize'
import {
CreateDigestJobData,
CreateDigestData,
CreateDigestJobResponse,
CreateDigestJobSchedule,
CREATE_DIGEST_JOB,
@ -856,7 +856,7 @@ export const enqueueSendEmail = async (jobData: SendEmailJobData) => {
}
export const enqueueCreateDigest = async (
data: CreateDigestJobData,
data: CreateDigestData,
schedule?: CreateDigestJobSchedule
): Promise<CreateDigestJobResponse> => {
const queue = await getBackendQueue()
@ -864,6 +864,8 @@ export const enqueueCreateDigest = async (
throw new Error('No queue found')
}
// generate unique id for the digest
data.id = uuid()
// enqueue create digest job immediately
const jobId = `${CREATE_DIGEST_JOB}_${data.userId}`
const job = await queue.add(CREATE_DIGEST_JOB, data, {
@ -890,9 +892,9 @@ export const enqueueCreateDigest = async (
await writeDigest(data.userId, digest)
if (schedule) {
// remove existing repeated job if any
await Promise.all(
Object.keys(CRON_PATTERNS).map(async (key) => {
// remove existing repeated job if any
const isDeleted = await queue.removeRepeatable(
CREATE_DIGEST_JOB,
{
@ -909,6 +911,9 @@ export const enqueueCreateDigest = async (
)
// schedule repeated job
// delete the digest id to avoid duplication
delete data.id
const job = await queue.add(CREATE_DIGEST_JOB, data, {
attempts: 1,
priority: getJobPriority(CREATE_DIGEST_JOB),

View File

@ -6,7 +6,7 @@ import { userRepository } from '../../src/repository/user'
import { isValidSignupRequest } from '../../src/routers/auth/auth_router'
import { AuthProvider } from '../../src/routers/auth/auth_types'
import { createPendingUserToken } from '../../src/routers/auth/jwt_helpers'
import { searchLibraryItems } from '../../src/services/library_item'
import { searchAndCountLibraryItems } from '../../src/services/library_item'
import { deleteUser, updateUser } from '../../src/services/user'
import {
comparePassword,
@ -528,7 +528,7 @@ describe('auth router', () => {
'web'
).expect(200)
const user = await userRepository.findOneByOrFail({ name })
const { count } = await searchLibraryItems(
const { count } = await searchAndCountLibraryItems(
{ query: 'in:inbox sort:read-desc is:reading' },
user.id
)
@ -552,7 +552,10 @@ describe('auth router', () => {
'ios'
).expect(200)
const user = await userRepository.findOneByOrFail({ name })
const { count } = await searchLibraryItems({ query: 'in:all' }, user.id)
const { count } = await searchAndCountLibraryItems(
{ query: 'in:all' },
user.id
)
expect(count).to.eql(4)
})

View File

@ -1,4 +1,5 @@
{
"extension": ["ts"],
"spec": "test/**/*.test.ts"
"spec": "test/**/*.test.ts",
"timeout": 10000
}

View File

@ -5,6 +5,7 @@ import {
SentenceTokenizerNew,
WordPunctTokenizer,
} from 'natural'
import { isElement } from 'underscore'
// this code needs to be kept in sync with the
// frontend code in: useReadingProgressAnchor
@ -49,6 +50,7 @@ const DEFAULT_LANGUAGE = 'en-US'
const DEFAULT_VOICE = 'en-US-JennyNeural'
const DEFAULT_SECONDARY_VOICE = 'en-US-GuyNeural'
const DEFAULT_RATE = '1.1'
const ELEMENT_INDEX_ATTRIBUTE = 'data-omnivore-anchor-idx'
const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [
'omnivore-highlight-id',
@ -74,6 +76,19 @@ const TOP_LEVEL_TAGS = [
'LI',
]
const SKIP_TAGS = [
'SCRIPT',
'STYLE',
'IMG',
'FIGURE',
'FIGCAPTION',
'IFRAME',
'CODE',
]
// Read the numeric anchor index a node was tagged with (see parseDomTree).
// A missing attribute yields Number(null) === 0, matching prior behavior.
const getAnchorIndex = (element: Element): number => {
  const raw = element.getAttribute(ELEMENT_INDEX_ATTRIBUTE)
  return Number(raw)
}
function parseDomTree(pageNode: Element) {
if (!pageNode || pageNode.childNodes.length == 0) {
console.log('no child nodes found')
@ -108,7 +123,7 @@ function parseDomTree(pageNode: Element) {
visitedNodeList.forEach((node, index) => {
// We start at index 3, because the frontend starts two nodes above us
// on the #readability-page-1 element that wraps the entire content.
node.setAttribute('data-omnivore-anchor-idx', (index + 3).toString())
node.setAttribute(ELEMENT_INDEX_ATTRIBUTE, (index + 3).toString())
})
return visitedNodeList
}
@ -146,18 +161,8 @@ function emitElement(
element: Element,
isTopLevel: boolean
) {
const SKIP_TAGS = [
'SCRIPT',
'STYLE',
'IMG',
'FIGURE',
'FIGCAPTION',
'IFRAME',
'CODE',
]
const topLevelTags = ssmlTagsForTopLevelElement()
const idx = element.getAttribute('data-omnivore-anchor-idx')
const idx = getAnchorIndex(element)
let maxVisitedIdx = Number(idx)
if (isTopLevel) {
@ -166,6 +171,11 @@ function emitElement(
for (const child of Array.from(element.childNodes)) {
if (SKIP_TAGS.indexOf(child.nodeName) >= 0) {
// Skip unwanted tags and update the index
if (isElement(child)) {
const childIdx = getAnchorIndex(child)
maxVisitedIdx = Math.max(maxVisitedIdx, childIdx)
}
continue
}
@ -180,8 +190,8 @@ function emitElement(
}
emitTextNode(textItems, cleanedText, child)
}
if (child.nodeType == 1 /* Node.ELEMENT_NODE */) {
maxVisitedIdx = emitElement(textItems, child as HTMLElement, false)
if (isElement(child)) {
maxVisitedIdx = emitElement(textItems, child, false)
if (child.nodeName === 'LI') {
// add a new line after each list item
emit(textItems, '\n')
@ -265,6 +275,11 @@ export const stripEmojis = (text: string): string => {
return text.replace(emojiRegex, '').replace(/\s+/g, ' ')
}
// Drop utterances whose text is nothing but whitespace and/or basic
// punctuation — they contribute no audible content to the speech output.
const filterUtterances = (utterances: Utterance[]): Utterance[] =>
  utterances.filter((utterance) => !/^[\s.,;:!?]+$/.test(utterance.text))
const textToUtterances = ({
wordTokenizer,
idx,
@ -309,7 +324,10 @@ const textToUtterances = ({
const sentenceTokenizer = new SentenceTokenizerNew()
sentences = sentenceTokenizer.tokenize(text)
} catch (err) {
console.debug('Unable to tokenize sentences')
console.log(
'Unable to tokenize sentences, falling back to old tokenizer',
text
)
// fallback to old sentence tokenizer
const sentenceTokenizer = new SentenceTokenizer()
sentences = sentenceTokenizer.tokenize(text)
@ -369,6 +387,18 @@ const replaceSmartQuotes = (text: string): string => {
return text.replace(/[\u2018\u2019]/g, "'").replace(/[\u201C\u201D]/g, '"')
}
// Walk an element and all of its descendants, returning the largest
// anchor index found anywhere in the subtree. Used to fast-forward the
// main loop past skipped elements (and their children).
const getMaxVisitedIdx = (element: Element): number =>
  Array.from(element.childNodes)
    .filter(isElement)
    .reduce(
      (max, child) => Math.max(max, getMaxVisitedIdx(child)),
      getAnchorIndex(element)
    )
export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
const { title, content, options } = htmlInput
console.log('creating speech file with options:', options)
@ -421,6 +451,12 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
const textItems: string[] = []
const node = parsedNodes[i - 3]
// skip unwanted tags and update the index
if (SKIP_TAGS.includes(node.nodeName)) {
i = getMaxVisitedIdx(node)
continue
}
if (TOP_LEVEL_TAGS.includes(node.nodeName) || hasSignificantText(node)) {
// use paragraph as anchor
const idx = i.toString()
@ -441,10 +477,12 @@ export const htmlToSpeechFile = (htmlInput: HtmlInput): SpeechFile => {
}
}
const filteredUtterances = filterUtterances(utterances)
return {
wordCount: wordOffset,
language,
defaultVoice,
utterances,
utterances: filteredUtterances,
}
}

View File

@ -350,6 +350,41 @@ describe('convert HTML to Speech file', () => {
options: TEST_OPTIONS,
})
expect(speechFile.utterances).to.have.lengthOf(2)
expect(speechFile.utterances[1].text).to.eql('McMeekin makes an extended case that Stalin was preparing to attack Nazi Germany when Hitler attacked him, that the two dictators were basically in a race to see who could mobilize to betray the other first — and that the initial Soviet debacle in 1941 happened in part because Stalin was also pushing his military toward an offensive alignment, and they were caught in a "mid-mobilization limbo."')
expect(speechFile.utterances[1].text).to.eql(
'McMeekin makes an extended case that Stalin was preparing to attack Nazi Germany when Hitler attacked him, that the two dictators were basically in a race to see who could mobilize to betray the other first — and that the initial Soviet debacle in 1941 happened in part because Stalin was also pushing his military toward an offensive alignment, and they were caught in a "mid-mobilization limbo."'
)
})
// Elements in SKIP_TAGS (script, style, iframe, figcaption, ...) must be
// excluded from speech generation: only the <p> text should survive, even
// though the skipped nodes carry their own anchor indices.
it('skip unwanted elements', () => {
const html = `<div class="page" id="readability-page-1" data-omnivore-anchor-idx="1">
<p data-omnivore-anchor-idx="2">This is a test.</p>
<script data-omnivore-anchor-idx="3">alert('hello');</script>
<style data-omnivore-anchor-idx="4">body { color: red; }</style>
<iframe data-omnivore-anchor-idx="6" src="https://example.com">test</iframe>
<figcaption data-omnivore-anchor-idx="7">test</figcaption>
</div>`
const speechFile = htmlToSpeechFile({
content: html,
options: TEST_OPTIONS,
})
// exactly one utterance: the paragraph; everything else was skipped
expect(speechFile.utterances).to.have.lengthOf(1)
expect(speechFile.utterances[0].text).to.eql('This is a test.')
})
// filterUtterances should remove utterances that contain only punctuation
// or whitespace (here a lone "." and a blank paragraph), keeping real text.
it('filters out utterances with only punctuation or whitespace', () => {
const html = `<div class="page" id="readability-page-1" data-omnivore-anchor-idx="1">
<p data-omnivore-anchor-idx="2">This is a test.</p>
<p data-omnivore-anchor-idx="3">.</p>
<p data-omnivore-anchor-idx="4"> </p>
</div>`
const speechFile = htmlToSpeechFile({
content: html,
options: TEST_OPTIONS,
})
// only the paragraph with substantive text remains
expect(speechFile.utterances).to.have.lengthOf(1)
expect(speechFile.utterances[0].text).to.eql('This is a test.')
})
})

View File

@ -6,5 +6,5 @@
// Generate d.ts files
"declaration": true
},
"include": ["src"]
"include": ["src", "test"]
}

View File

@ -50,6 +50,20 @@
lodash "^4.17.21"
resize-observer-polyfill "^1.5.1"
"@anthropic-ai/sdk@^0.20.1":
version "0.20.7"
resolved "https://registry.yarnpkg.com/@anthropic-ai/sdk/-/sdk-0.20.7.tgz#b19b0e66ba070f928bbf583c06d76e6efdd93d5e"
integrity sha512-uyc+3WGLpe8ur6mSIKSab7P9JdBerTdmqb7popc/yROYLLCW/Ykyw4ZfjmN/cLmxjnAKnv5YUngzbPM0BJuGjg==
dependencies:
"@types/node" "^18.11.18"
"@types/node-fetch" "^2.6.4"
abort-controller "^3.0.0"
agentkeepalive "^4.2.1"
form-data-encoder "1.7.2"
formdata-node "^4.3.2"
node-fetch "^2.6.7"
web-streams-polyfill "^3.2.1"
"@anthropic-ai/sdk@^0.9.1":
version "0.9.1"
resolved "https://registry.yarnpkg.com/@anthropic-ai/sdk/-/sdk-0.9.1.tgz#b2d2b7bf05c90dce502c9a2e869066870f69ba88"
@ -3774,6 +3788,17 @@
dependencies:
lodash "^4.17.21"
"@langchain/anthropic@^0.1.16":
version "0.1.16"
resolved "https://registry.yarnpkg.com/@langchain/anthropic/-/anthropic-0.1.16.tgz#c2a9d3dd4e02df7118dd97cf2503c9bd1a4de5ad"
integrity sha512-vCbwkZ3pkMSKf67fBgNlslvuW9f3EZGBbO8Ic2etgX3xFl6L0WuMtfS26P1FCDpRwaKuC1BrCj2aLAeMzMq/Fg==
dependencies:
"@anthropic-ai/sdk" "^0.20.1"
"@langchain/core" "~0.1.56"
fast-xml-parser "^4.3.5"
zod "^3.22.4"
zod-to-json-schema "^3.22.4"
"@langchain/community@~0.0.33":
version "0.0.33"
resolved "https://registry.yarnpkg.com/@langchain/community/-/community-0.0.33.tgz#5568fe36b1e2f8947d49414d47e14a27da5b65c9"
@ -3803,6 +3828,24 @@
zod "^3.22.4"
zod-to-json-schema "^3.22.3"
"@langchain/core@~0.1.56":
version "0.1.61"
resolved "https://registry.yarnpkg.com/@langchain/core/-/core-0.1.61.tgz#9313363e04f1c6981a938b2909c44ce6fceb2736"
integrity sha512-C8OkAly+ugvXsL8TACCmFv9WTTcT4gvQaG6NbrXCOzibBCywfxxcTqEMOyg3zIKpxHEmR0DHqh0OiJRHocnsCg==
dependencies:
ansi-styles "^5.0.0"
camelcase "6"
decamelize "1.2.0"
js-tiktoken "^1.0.8"
langsmith "~0.1.7"
ml-distance "^4.0.0"
mustache "^4.2.0"
p-queue "^6.6.2"
p-retry "4"
uuid "^9.0.0"
zod "^3.22.4"
zod-to-json-schema "^3.22.3"
"@langchain/openai@^0.0.14":
version "0.0.14"
resolved "https://registry.yarnpkg.com/@langchain/openai/-/openai-0.0.14.tgz#27a6ba83f6b754391868b22f3b90cd440038acf0"
@ -15694,7 +15737,7 @@ fast-xml-parser@^4.2.2, fast-xml-parser@^4.3.0:
dependencies:
strnum "^1.0.5"
fast-xml-parser@^4.3.2:
fast-xml-parser@^4.3.2, fast-xml-parser@^4.3.5:
version "4.3.6"
resolved "https://registry.yarnpkg.com/fast-xml-parser/-/fast-xml-parser-4.3.6.tgz#190f9d99097f0c8f2d3a0e681a10404afca052ff"
integrity sha512-M2SovcRxD4+vC493Uc2GZVcZaj66CCJhWurC4viynVSTvrpErCShNcDz1lAho6n9REQKvL/ll4A4/fw6Y9z8nw==
@ -22822,6 +22865,11 @@ multimatch@5.0.0:
arrify "^2.0.1"
minimatch "^3.0.4"
mustache@^4.2.0:
version "4.2.0"
resolved "https://registry.yarnpkg.com/mustache/-/mustache-4.2.0.tgz#e5892324d60a12ec9c2a73359edca52972bf6f64"
integrity sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ==
mute-stream@0.0.8, mute-stream@~0.0.4:
version "0.0.8"
resolved "https://registry.yarnpkg.com/mute-stream/-/mute-stream-0.0.8.tgz#1630c42b2251ff81e2a283de96a5497ea92e5e0d"
@ -32217,6 +32265,11 @@ zod-to-json-schema@^3.22.3:
resolved "https://registry.yarnpkg.com/zod-to-json-schema/-/zod-to-json-schema-3.22.4.tgz#f8cc691f6043e9084375e85fb1f76ebafe253d70"
integrity sha512-2Ed5dJ+n/O3cU383xSY28cuVi0BCQhF8nYqWU5paEpl7fVdqdAmiLdqLyfblbNdfOFwFfi/mqU4O1pwc60iBhQ==
zod-to-json-schema@^3.22.4:
version "3.23.0"
resolved "https://registry.yarnpkg.com/zod-to-json-schema/-/zod-to-json-schema-3.23.0.tgz#4fc60e88d3c709eedbfaae3f92f8a7bf786469f2"
integrity sha512-az0uJ243PxsRIa2x1WmNE/pnuA05gUq/JB8Lwe1EDCCL/Fz9MgjYQ0fPlyc2Tcv6aF2ZA7WM5TWaRZVEFaAIag==
zod@^3.22.3, zod@^3.22.4:
version "3.22.4"
resolved "https://registry.yarnpkg.com/zod/-/zod-3.22.4.tgz#f31c3a9386f61b1f228af56faa9255e845cf3fff"