Files
omnivore/packages/api/src/resolvers/article/index.ts
Jackson Harper e652a6ea8c Rebased version of the elastic PR (#225)
* Add elastic to our docker compose

* add AND/OR/NOT search operations

* add elastic and create article in elastic

* change error code when elastic throws error

* add search pages in elastic

* add search by labels

* Add elastic to GitHub Action

* Update elastic version

* Fix port for elastic

* add url in search query

* Set elastic features when running tests

* add debug logs

* Use localhost instead of service hostname

* refresh elastic after create/update

* update search labels query

* add typescript support

* search pages in elastic

* fix search queries

* use elastic for saving page

* fix test failure

* update getArticle api to use elastic

* use generic get page function

* add elastic migration python script

* fix bulk helper param

* save elastic page id in article_saving_request instead of postgres article_id

* fix page archiving and deleting

* add tests for deleteArticle

* remove custom date type in elastic mappings which does not exist in older versions of elastic

* fix timestamp format issue

* add tests for save reading progress

* add tests for save file

* optimize search results

* add alias to index

* update migration script to receive env var as params

* Add failing test to validate we don't decrease reading progress

This test is failing with Elastic because we aren't fetching
the reading progress from elastic here, and are fetching it
from postgres.

* Rename readingProgress to readingProgressPercent

This is the name stored in elastic, so fixes issues pulling the
value out.

* Linting

* Add failing test for creating highlights w/elastic

This test fails because the highlight can't be looked up. Is there
a different ID we should be passing in to query for highlights,
or do we need to update the query to look for elastic_id?

* add tests code coverage threshold

* update nyc config

* include more files in test coverage

* change alias name

* update updateContent to update pages in elastic

* remove debug log

* fix createhighlight test

* search pages by alias in elastic

* update set labels and delete labels in elastic

* migration script enumeration

* make BULK_SIZE an env var

* fix pdf search indexing

* debug github action exit issue

* call pubsub when create/update/delete page in elastic

* fix json parsing bug and reduce reading data from file

* replace a deprecated pubsub api call

* debug github action exit issue

* debug github action exit issue

* add handler to upload elastic page data to GCS

* fix tests

* Use http_auth instead of basic_auth

* add index creation and existing postgres tables update in migration script

* fix a typo to connect to elastic

* rename readingProgress to readingProgressPercent

* migrate elastic_page_id in highlights and article_saving_request tables

* update migration script to include number of updated rows

* update db migration query

* read index mappings from file

* fix upload pages to gcs

* fix tests failure due to pageContext

* fix upload file id not exist error

* Handle savedAt & isArchived attributes w/out querying elastic

* Fix prettier issues

* fix content-type mismatching

* revert pageId to linkId because frontend was not deployed yet

* fix newsletters and attachment not saved in elastic

* put linkId in article for setting labels

* exclude originalHtml in the result of searching to improve performance

* exclude content in the result of searching to improve performance

* remove score sorting

* do not refresh immediately to reduce searching and indexing time

* do not replace the backup data in gcs

* fix no article id defined in articleSavingRequest

* add logging of elastic api running time

* reduce home feed pagination size to 15

* reduce home feed pagination size to 10

* stop revalidating first page

* do not use a separate api to fetch reading progress

* Remove unused comment

* get reading progress if not exists

* replace ngram tokenizer with standard tokenizer

* fix tests

* remove .env.local

* add sort keyword in searching to sort by score

Co-authored-by: Hongbo Wu <hongbo@omnivore.app>
2022-03-16 12:08:59 +08:00

798 lines
22 KiB
TypeScript

/* eslint-disable @typescript-eslint/no-unsafe-call */
/* eslint-disable @typescript-eslint/restrict-plus-operands */
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
/* eslint-disable @typescript-eslint/no-floating-promises */
import {
Article,
ArticleError,
ArticleErrorCode,
ArticlesError,
ArticleSuccess,
ContentReader,
CreateArticleError,
CreateArticleErrorCode,
CreateArticleSuccess,
FeedArticle,
MutationCreateArticleArgs,
MutationSaveArticleReadingProgressArgs,
MutationSetBookmarkArticleArgs,
MutationSetShareArticleArgs,
PageInfo,
PageType,
QueryArticleArgs,
QueryArticlesArgs,
ResolverFn,
SaveArticleReadingProgressError,
SaveArticleReadingProgressErrorCode,
SaveArticleReadingProgressSuccess,
SetBookmarkArticleError,
SetBookmarkArticleErrorCode,
SetBookmarkArticleSuccess,
SetShareArticleError,
SetShareArticleErrorCode,
SetShareArticleSuccess,
} from '../../generated/graphql'
/* eslint-disable @typescript-eslint/no-explicit-any */
import { Merge } from '../../util'
import {
getStorageFileDetails,
makeStorageFilePublic,
} from '../../utils/uploads'
import { ContentParseError } from '../../utils/errors'
import {
articleSavingRequestError,
articleSavingRequestPopulate,
authorized,
generateSlug,
stringToHash,
userDataToUser,
validatedDate,
} from '../../utils/helpers'
import {
ParsedContentPuppeteer,
parseOriginalContent,
parsePreparedContent,
} from '../../utils/parser'
import { isSiteBlockedForParse } from '../../utils/blocked'
import { Readability } from '@omnivore/readability'
import { traceAs } from '../../tracing'
import { createImageProxyUrl } from '../../utils/imageproxy'
import normalizeUrl from 'normalize-url'
import { WithDataSourcesContext } from '../types'
import { parseSearchQuery } from '../../utils/search'
import { createPageSaveRequest } from '../../services/create_page_save_request'
import { createIntercomEvent } from '../../utils/intercom'
import { analytics } from '../../utils/analytics'
import { env } from '../../env'
import {
createPage,
deletePage,
getPageById,
getPageByParam,
searchPages,
updatePage,
} from '../../elastic'
import { Page } from '../../elastic/types'
// Subset of the GraphQL Article type returned directly by these resolvers.
// The omitted fields are either computed per request (e.g. isArchived) or
// resolved by dedicated field-level resolvers further down in this file.
export type PartialArticle = Omit<
  Article,
  | 'updatedAt'
  | 'readingProgressPercent'
  | 'readingProgressAnchorIndex'
  | 'savedAt'
  | 'highlights'
  | 'contentReader'
>

// These two page types are better handled by the backend
// where we can use APIs to fetch their underlying content.
const FORCE_PUPPETEER_URLS = [
  // twitter status url regex
  /twitter\.com\/(?:#!\/)?(\w+)\/status(?:es)?\/(\d+)(?:\/.*)?/,
  // youtube url regex (youtube.com / youtu.be, watch/embed/v forms)
  /^((?:https?:)?\/\/)?((?:www|m)\.)?((?:youtube\.com|youtu.be))(\/(?:[\w-]+\?v=|embed\/|v\/)?)([\w-]+)(\S+)?$/,
]

// CreateArticleSuccess with createdArticle narrowed to PartialArticle.
export type CreateArticlesSuccessPartial = Merge<
  CreateArticleSuccess,
  { createdArticle: PartialArticle }
>
// Saves a URL (or an uploaded file / client-pre-parsed document) as a page.
// Depending on the input this either:
//  - records an upload-file page (e.g. a PDF) and makes the file public,
//  - parses the prepared document and upserts the page in elastic, or
//  - enqueues a backend (puppeteer) parse request and returns a placeholder.
export const createArticleResolver = authorized<
  CreateArticlesSuccessPartial,
  CreateArticleError,
  MutationCreateArticleArgs
>(
  async (
    _,
    {
      input: {
        url,
        preparedDocument,
        articleSavingRequestId,
        uploadFileId,
        skipParsing,
        source,
      },
    },
    ctx
  ) => {
    const {
      models,
      authTrx,
      claims: { uid },
      log,
    } = ctx

    // Track the save before doing any work; failures below still get counted.
    analytics.track({
      userId: uid,
      event: 'link_saved',
      properties: {
        url,
        source,
        env: env.server.apiEnv,
      },
    })
    await createIntercomEvent('link-saved', uid)

    // Reuse the client-supplied saving request if one exists; otherwise
    // create it under the client's id so the client can poll its status.
    const articleSavingRequest = articleSavingRequestId
      ? (await models.articleSavingRequest.get(articleSavingRequestId)) ||
        (await authTrx((tx) =>
          models.articleSavingRequest.create(
            { userId: uid, id: articleSavingRequestId },
            tx
          )
        ))
      : undefined

    const user = userDataToUser(await models.user.get(uid))

    try {
      if (isSiteBlockedForParse(url)) {
        return articleSavingRequestError(
          {
            errorCodes: [CreateArticleErrorCode.NotAllowedToParse],
          },
          ctx,
          articleSavingRequest
        )
      }

      // Fallback title derived from the URL's last path segment with the
      // extension stripped and underscores turned into spaces.
      const { pathname } = new URL(url)
      const croppedPathname = decodeURIComponent(
        pathname
          .split('/')
          [pathname.split('/').length - 1].split('.')
          .slice(0, -1)
          .join('.')
      ).replace(/_/gi, ' ')

      let parsedContent: Readability.ParseResult | null = null
      let canonicalUrl
      let userArticleUrl: string | null = null
      let uploadFileHash = null
      let domContent = null
      let pageType = PageType.Unknown

      // Placeholder response returned when the real content will arrive
      // later via the puppeteer-parse pipeline.
      const DUMMY_RESPONSE = {
        user,
        created: false,
        createdArticle: {
          id: '',
          slug: '',
          createdAt: new Date(),
          originalHtml: domContent,
          content: '',
          description: '',
          title: '',
          pageType: PageType.Unknown,
          contentReader: ContentReader.Web,
          author: '',
          url: normalizeUrl(canonicalUrl || url, {
            stripHash: true,
            stripWWW: false,
          }),
          hash: '',
          isArchived: false,
        },
      }

      if (uploadFileId) {
        /* We do not trust the values from client, lookup upload file by querying
         * with filtering on user ID and URL to verify client's uploadFileId is valid.
         */
        const uploadFile = await models.uploadFile.getWhere({
          id: uploadFileId,
          userId: uid,
        })
        if (!uploadFile) {
          return articleSavingRequestError(
            { errorCodes: [CreateArticleErrorCode.UploadFileMissing] },
            ctx,
            articleSavingRequest
          )
        }
        const uploadFileDetails = await getStorageFileDetails(
          uploadFileId,
          uploadFile.fileName
        )
        uploadFileHash = uploadFileDetails.md5Hash
        userArticleUrl = uploadFileDetails.fileUrl
        canonicalUrl = uploadFile.url
        pageType = PageType.File
      } else if (
        source !== 'puppeteer-parse' &&
        FORCE_PUPPETEER_URLS.some((regex) => regex.test(url))
      ) {
        // URLs we always defer to the backend parser (twitter/youtube),
        // unless this request already came from that parser.
        await createPageSaveRequest(uid, url, models)
        return DUMMY_RESPONSE
      } else if (!skipParsing && preparedDocument?.document) {
        const parseResults = await traceAs<Promise<ParsedContentPuppeteer>>(
          { spanName: 'article.parse' },
          async (): Promise<ParsedContentPuppeteer> => {
            return await parsePreparedContent(url, preparedDocument)
          }
        )
        parsedContent = parseResults.parsedContent
        canonicalUrl = parseResults.canonicalUrl
        domContent = parseResults.domContent
        pageType = parseOriginalContent(url, domContent)
      } else if (!preparedDocument?.document) {
        // We have a URL but no document, so we try to send this to puppeteer
        // and return a dummy response.
        await createPageSaveRequest(uid, url, models)
        return DUMMY_RESPONSE
      }

      const saveTime = new Date()
      const slug = generateSlug(parsedContent?.title || croppedPathname)

      // The elastic page document; id is filled in after create/update below.
      let articleToSave: Page = {
        id: '',
        userId: uid,
        originalHtml: domContent,
        content: parsedContent?.content || '',
        description: parsedContent?.excerpt,
        title:
          parsedContent?.title ||
          preparedDocument?.pageInfo.title ||
          croppedPathname,
        author: parsedContent?.byline,
        url: normalizeUrl(canonicalUrl || url, {
          stripHash: true,
          stripWWW: false,
        }),
        pageType: pageType,
        hash: uploadFileHash || stringToHash(parsedContent?.content || url),
        image: parsedContent?.previewImage,
        publishedAt: validatedDate(parsedContent?.publishedDate),
        uploadFileId: uploadFileId,
        slug,
        createdAt: saveTime,
        savedAt: saveTime,
        siteName: parsedContent?.siteName,
      }

      // A reminder attached to the saving request may request the page be
      // saved archived (reminder.archiveUntil — assumed truthy flag; see model).
      let archive = false
      if (articleSavingRequestId) {
        const reminder = await models.reminder.getByRequestId(
          uid,
          articleSavingRequestId
        )
        if (reminder) {
          archive = reminder.archiveUntil || false
        }
      }

      // Log without the (potentially huge) content/originalHtml payloads.
      log.info('New article saving', {
        parsedArticle: Object.assign({}, articleToSave, {
          content: undefined,
          originalHtml: undefined,
        }),
        userArticleUrl,
        labels: {
          source: 'resolver',
          resolver: 'createArticleResolver',
          userId: uid,
        },
      })

      // For uploaded files: mark the upload complete and use the public
      // storage URL as the page URL.
      let uploadFileUrlOverride = ''
      if (uploadFileId) {
        const uploadFileData = await authTrx(async (tx) => {
          return await models.uploadFile.setFileUploadComplete(uploadFileId, tx)
        })
        if (!uploadFileData || !uploadFileData.id || !uploadFileData.fileName) {
          return articleSavingRequestError(
            {
              errorCodes: [CreateArticleErrorCode.UploadFileMissing],
            },
            ctx,
            articleSavingRequest
          )
        }
        uploadFileUrlOverride = await makeStorageFilePublic(
          uploadFileData.id,
          uploadFileData.fileName
        )
      }

      // Upsert: if the user already saved this URL, refresh the existing
      // elastic page instead of creating a duplicate.
      const existingPage = await getPageByParam({
        userId: uid,
        url: articleToSave.url,
      })
      if (existingPage) {
        // update existing page in elastic
        existingPage.slug = slug
        existingPage.savedAt = saveTime
        existingPage.archivedAt = archive ? saveTime : undefined
        existingPage.url = uploadFileUrlOverride || articleToSave.url
        existingPage.hash = articleToSave.hash
        await updatePage(existingPage.id, existingPage, ctx)
        log.info('page updated in elastic', existingPage.id)
        articleToSave = existingPage
      } else {
        // create new page in elastic
        const pageId = await createPage(articleToSave, ctx)
        if (!pageId) {
          return articleSavingRequestError(
            {
              errorCodes: [CreateArticleErrorCode.ElasticError],
            },
            ctx,
            articleSavingRequest
          )
        }
        log.info('page created in elastic', articleToSave)
        articleToSave.id = pageId
      }

      const createdArticle: PartialArticle = {
        ...articleToSave,
        isArchived: !!articleToSave.archivedAt,
      }
      // Mark the saving request populated and return the saved page.
      return articleSavingRequestPopulate(
        {
          user,
          created: false,
          createdArticle: createdArticle,
        },
        ctx,
        articleSavingRequest?.id,
        createdArticle.id || undefined
      )
    } catch (error) {
      // Map parse failures to a typed error; everything else propagates.
      if (
        error instanceof ContentParseError &&
        error.message === 'UNABLE_TO_PARSE'
      ) {
        return articleSavingRequestError(
          { errorCodes: [CreateArticleErrorCode.UnableToParse] },
          ctx,
          articleSavingRequest
        )
      }
      throw error
    }
  }
)
// ArticleSuccess with its article narrowed to PartialArticle.
export type ArticleSuccessPartial = Merge<
  ArticleSuccess,
  { article: PartialArticle }
>
// Fetches a single page by slug for the authenticated user.
// Returns Unauthorized without a uid, NotFound when the slug does not match
// a page, and BadData for any unexpected failure.
export const getArticleResolver: ResolverFn<
  ArticleSuccessPartial | ArticleError,
  Record<string, unknown>,
  WithDataSourcesContext,
  QueryArticleArgs
> = async (_obj, { slug }, { claims }) => {
  try {
    if (!claims?.uid) {
      return { errorCodes: [ArticleErrorCode.Unauthorized] }
    }

    analytics.track({
      userId: claims.uid,
      event: 'link_fetched',
      properties: {
        slug,
        env: env.server.apiEnv,
      },
    })
    await createIntercomEvent('get-article', claims.uid)

    const page = await getPageByParam({ userId: claims.uid, slug })
    if (!page) {
      return { errorCodes: [ArticleErrorCode.NotFound] }
    }

    // linkId mirrors the page id for clients that still reference links by id.
    return {
      article: { ...page, isArchived: !!page.archivedAt, linkId: page.id },
    }
  } catch (error) {
    // Any unexpected failure (bad input, elastic errors) surfaces as BadData.
    return { errorCodes: [ArticleErrorCode.BadData] }
  }
}
// Relay-style connection payload for paginated search results.
type PaginatedPartialArticles = {
  edges: { cursor: string; node: PartialArticle }[]
  pageInfo: PageInfo
}
// Searches the user's saved pages in elastic with offset-based pagination.
// The cursor is a numeric offset encoded as a string; one extra item is
// fetched to determine whether a next page exists.
export const getArticlesResolver = authorized<
  PaginatedPartialArticles,
  ArticlesError,
  QueryArticlesArgs
>(async (_obj, params, { claims }) => {
  const notNullField = params.sharedOnly ? 'sharedAt' : null
  const startCursor = params.after || ''
  const first = params.first || 10

  // Perform basic sanitization. Right now we just allow alphanumeric, space and quote
  // so queries can contain phrases like "human race";
  // We can also split out terms like "label:unread".
  const searchQuery = parseSearchQuery(params.query || undefined)

  analytics.track({
    userId: claims.uid,
    event: 'search',
    properties: {
      query: searchQuery.query,
      inFilter: searchQuery.inFilter,
      readFilter: searchQuery.readFilter,
      typeFilter: searchQuery.typeFilter,
      labelFilters: searchQuery.labelFilters,
      sortParams: searchQuery.sortParams,
      env: env.server.apiEnv,
    },
  })
  await createIntercomEvent('search', claims.uid)

  const [pages, totalCount] = (await searchPages(
    {
      from: Number(startCursor),
      size: first + 1, // fetch one more item to get next cursor
      sort: searchQuery.sortParams || params.sort || undefined,
      query: searchQuery.query,
      inFilter: searchQuery.inFilter,
      readFilter: searchQuery.readFilter,
      typeFilter: searchQuery.typeFilter,
      labelFilters: searchQuery.labelFilters,
    },
    claims.uid,
    notNullField
  )) || [[], 0]

  const start =
    startCursor && !isNaN(Number(startCursor)) ? Number(startCursor) : 0
  const hasNextPage = pages.length > first
  const endCursor = String(start + pages.length - (hasNextPage ? 1 : 0))

  //TODO: refactor so that the lastCursor included
  if (hasNextPage) {
    // remove an extra if exists
    pages.pop()
  }

  const edges = pages.map((a) => {
    return {
      node: {
        ...a,
        // serve thumbnails through the image proxy
        image: a.image && createImageProxyUrl(a.image, 88, 88),
        isArchived: !!a.archivedAt,
      },
      cursor: endCursor,
    }
  })

  return {
    edges,
    pageInfo: {
      hasPreviousPage: false,
      startCursor,
      hasNextPage: hasNextPage,
      endCursor,
      totalCount,
    },
  }
})
// SetShareArticleSuccess with the feed-article relations trimmed off and the
// article narrowed to PartialArticle.
export type SetShareArticleSuccessPartial = Merge<
  SetShareArticleSuccess,
  {
    updatedFeedArticle?: Omit<
      FeedArticle,
      | 'sharedBy'
      | 'article'
      | 'highlightsCount'
      | 'annotationsCount'
      | 'reactions'
    >
    updatedFeedArticleId?: string
    updatedArticle: PartialArticle
  }
>
// Shares or unshares an article: sharing stamps sharedAt with the current
// time, unsharing clears it. Returns NotFound when either the article or the
// user's userArticle row is missing.
export const setShareArticleResolver = authorized<
  SetShareArticleSuccessPartial,
  SetShareArticleError,
  MutationSetShareArticleArgs
>(
  async (
    _,
    { input: { articleID, share, sharedComment, sharedWithHighlights } },
    { models, authTrx, claims: { uid }, log }
  ) => {
    const articleRecord = await models.article.get(articleID)
    if (!articleRecord) {
      return { errorCodes: [SetShareArticleErrorCode.NotFound] }
    }

    // Sharing sets the timestamp; unsharing nulls it out.
    const shareTimestamp = share ? new Date() : null

    // Log without the heavyweight content/originalHtml fields.
    log.info(`${share ? 'S' : 'Uns'}haring an article`, {
      article: {
        ...articleRecord,
        content: undefined,
        originalHtml: undefined,
        sharedAt: shareTimestamp,
      },
      labels: {
        source: 'resolver',
        resolver: 'setShareArticleResolver',
        articleId: articleRecord.id,
        userId: uid,
      },
    })

    const updateResult = await authTrx((tx) =>
      models.userArticle.updateByArticleId(
        uid,
        articleID,
        {
          sharedAt: shareTimestamp,
          sharedComment,
          sharedWithHighlights,
        },
        tx
      )
    )
    if (!updateResult) {
      return { errorCodes: [SetShareArticleErrorCode.NotFound] }
    }

    // Make sure article.id instead of userArticle.id has passed. We use it for cache updates
    const updatedArticle = {
      ...updateResult,
      ...articleRecord,
      postedByViewer: !!shareTimestamp,
    }
    const updatedFeedArticle = shareTimestamp
      ? { ...updateResult, sharedAt: shareTimestamp }
      : undefined

    return {
      updatedFeedArticleId: updateResult.id,
      updatedFeedArticle,
      updatedArticle,
    }
  }
)
// SetBookmarkArticleSuccess with its article narrowed to PartialArticle.
export type SetBookmarkArticleSuccessPartial = Merge<
  SetBookmarkArticleSuccess,
  { bookmarkedArticle: PartialArticle }
>
// Bookmarks or un-bookmarks a page.
// bookmark=false deletes the page from elastic and unshares its highlights;
// bookmark=true writes the user's ownership and a fresh slug onto the page.
export const setBookmarkArticleResolver = authorized<
  SetBookmarkArticleSuccessPartial,
  SetBookmarkArticleError,
  MutationSetBookmarkArticleArgs
>(
  async (
    _,
    { input: { articleID, bookmark } },
    { models, authTrx, claims: { uid }, log, pubsub }
  ) => {
    const article = await getPageById(articleID)
    if (!article) {
      return { errorCodes: [SetBookmarkArticleErrorCode.NotFound] }
    }
    if (!bookmark) {
      // Verify the page belongs to this user before deleting it.
      const userArticleRemoved = await getPageByParam({
        userId: uid,
        _id: articleID,
      })
      if (!userArticleRemoved) {
        return { errorCodes: [SetBookmarkArticleErrorCode.NotFound] }
      }
      await deletePage(userArticleRemoved.id, { pubsub })

      // Shared highlights on a removed page are unshared as well.
      const highlightsUnshared = await authTrx(async (tx) => {
        return models.highlight.unshareAllHighlights(articleID, uid, tx)
      })
      log.info('Article unbookmarked', {
        article: Object.assign({}, article, {
          content: undefined,
          originalHtml: undefined,
        }),
        highlightsUnshared: highlightsUnshared.length,
        labels: {
          source: 'resolver',
          resolver: 'setBookmarkArticleResolver',
          userId: uid,
          articleID,
        },
      })
      // Make sure article.id instead of userArticle.id has passed. We use it for cache updates
      return {
        bookmarkedArticle: {
          ...userArticleRemoved,
          isArchived: false,
          savedByViewer: false,
          postedByViewer: false,
        },
      }
    } else {
      try {
        // Claim the page for this user and regenerate its slug.
        const userArticle: Partial<Page> = {
          userId: uid,
          slug: generateSlug(article.title),
        }
        await updatePage(articleID, userArticle, { pubsub })
        log.info('Article bookmarked', {
          article: Object.assign({}, article, {
            content: undefined,
            originalHtml: undefined,
          }),
          labels: {
            source: 'resolver',
            resolver: 'setBookmarkArticleResolver',
            userId: uid,
          },
        })
        // Make sure article.id instead of userArticle.id has passed. We use it for cache updates
        return {
          bookmarkedArticle: {
            ...userArticle,
            ...article,
            isArchived: false,
            savedByViewer: true,
            postedByViewer: false,
          },
        }
      } catch (error) {
        // NOTE(review): every failure here is reported as BookmarkExists —
        // presumably a duplicate-bookmark conflict is the expected error,
        // but other update failures are masked too; verify intent.
        return { errorCodes: [SetBookmarkArticleErrorCode.BookmarkExists] }
      }
    }
  }
)
// SaveArticleReadingProgressSuccess with its article narrowed to PartialArticle.
export type SaveArticleReadingProgressSuccessPartial = Merge<
  SaveArticleReadingProgressSuccess,
  { updatedArticle: PartialArticle }
>
// Persists a reader's progress on a page. Progress can only move forward
// (or be reset to exactly zero); backwards updates are silently kept at the
// stored values and not written to elastic.
export const saveArticleReadingProgressResolver = authorized<
  SaveArticleReadingProgressSuccessPartial,
  SaveArticleReadingProgressError,
  MutationSaveArticleReadingProgressArgs
>(
  async (
    _,
    { input: { id, readingProgressPercent, readingProgressAnchorIndex } },
    { claims: { uid }, pubsub }
  ) => {
    const page = await getPageByParam({ userId: uid, _id: id })
    if (!page) {
      return { errorCodes: [SaveArticleReadingProgressErrorCode.NotFound] }
    }

    // Reject anything that is not a number in [0, 100].
    const isInvalidPercent =
      (!readingProgressPercent && readingProgressPercent !== 0) ||
      readingProgressPercent < 0 ||
      readingProgressPercent > 100
    if (isInvalidPercent) {
      return { errorCodes: [SaveArticleReadingProgressErrorCode.BadData] }
    }

    // If setting to zero we accept the update, otherwise we require it
    // be greater than the current reading progress.
    const shouldUpdate =
      readingProgressPercent === 0 ||
      (page.readingProgressPercent || 0) < readingProgressPercent ||
      (page.readingProgressAnchorIndex || 0) < readingProgressAnchorIndex

    const updatedArticle = Object.assign(page, {
      readingProgressPercent: shouldUpdate
        ? readingProgressPercent
        : page.readingProgressPercent,
      readingProgressAnchorIndex: shouldUpdate
        ? readingProgressAnchorIndex
        : page.readingProgressAnchorIndex,
    })

    if (shouldUpdate) {
      await updatePage(id, updatedArticle, { pubsub })
    }

    return {
      updatedArticle: {
        ...updatedArticle,
        isArchived: !!updatedArticle.archivedAt,
      },
    }
  }
)
// Field resolver for Article.readingProgressPercent.
// Uses the value already present on the parent article when available,
// otherwise falls back to the page stored in elastic; anonymous viewers
// always see 0.
export const getReadingProgressForArticleResolver: ResolverFn<
  number | { errorCodes: string[] },
  Article,
  WithDataSourcesContext,
  Record<string, unknown>
> = async (article, _params, { claims }) => {
  if (!claims?.uid) {
    return 0
  }

  const { readingProgressPercent } = article
  if (readingProgressPercent !== undefined && readingProgressPercent !== null) {
    return readingProgressPercent
  }

  const page = await getPageByParam({ userId: claims.uid, _id: article.id })
  return page?.readingProgressPercent || 0
}
// Field resolver for Article.readingProgressAnchorIndex.
// Mirrors getReadingProgressForArticleResolver: prefer the parent article's
// value, fall back to the elastic page, and return 0 for anonymous viewers.
export const getReadingProgressAnchorIndexForArticleResolver: ResolverFn<
  number | { errorCodes: string[] },
  Article,
  WithDataSourcesContext,
  Record<string, unknown>
> = async (article, _params, { claims }) => {
  if (!claims?.uid) {
    return 0
  }

  const { readingProgressAnchorIndex } = article
  if (
    readingProgressAnchorIndex !== undefined &&
    readingProgressAnchorIndex !== null
  ) {
    return readingProgressAnchorIndex
  }

  const page = await getPageByParam({ userId: claims.uid, _id: article.id })
  return page?.readingProgressAnchorIndex || 0
}