294 lines
7.9 KiB
TypeScript
294 lines
7.9 KiB
TypeScript
import { Readability } from '@omnivore/readability'
|
|
import { PubsubClient } from '../datalayer/pubsub'
|
|
import { addHighlightToPage } from '../elastic/highlights'
|
|
import { createPage, getPageByParam, updatePage } from '../elastic/pages'
|
|
import { ArticleSavingRequestStatus, Page, PageType } from '../elastic/types'
|
|
import { homePageURL } from '../env'
|
|
import {
|
|
HighlightType,
|
|
Maybe,
|
|
PreparedDocumentInput,
|
|
SaveErrorCode,
|
|
SavePageInput,
|
|
SaveResult,
|
|
} from '../generated/graphql'
|
|
import { DataModels } from '../resolvers/types'
|
|
import { enqueueThumbnailTask } from '../utils/createTask'
|
|
import {
|
|
cleanUrl,
|
|
generateSlug,
|
|
stringToHash,
|
|
TWEET_URL_REGEX,
|
|
validatedDate,
|
|
wordsCount,
|
|
} from '../utils/helpers'
|
|
import { logger } from '../utils/logger'
|
|
import { parsePreparedContent } from '../utils/parser'
|
|
import { createPageSaveRequest } from './create_page_save_request'
|
|
import { createLabels } from './labels'
|
|
|
|
/**
 * Per-request context passed to the save helpers visible in this file.
 */
type SaveContext = {
  /** Id of the user the request runs for; written as `userId` on saving requests and highlights. */
  uid: string
  /** Data models; the code here uses `models.articleSavingRequest`. */
  models: DataModels
  /** Pub/sub client, forwarded to `createPageSaveRequest`. */
  pubsub: PubsubClient
  /**
   * Optional flag that is not read directly by the code here.
   * NOTE(review): presumably consumed by the helpers that receive the whole
   * context (page create/update, highlights) — confirm.
   */
  refresh?: boolean
}
|
|
|
|
/**
 * Identity of the user a page is being saved for.
 */
type SaverUserData = {
  /** Username; becomes a path segment of the saved page's URL. */
  username: string
  /** User id stored on the page and used to look up an existing page. */
  userId: string
}
|
|
|
|
// URL patterns (tweets and YouTube videos) that are always handed off to a
// backend page save request unless the input already came from
// `puppeteer-parse` — these are sites where we can use APIs to fetch their
// underlying content.
const FORCE_PUPPETEER_URLS = [
  TWEET_URL_REGEX,
  // every `.` in the host names is escaped so that only the literal domains
  // `youtube.com` and `youtu.be` match
  /^((?:https?:)?\/\/)?((?:www|m)\.)?((?:youtube\.com|youtu\.be))(\/(?:[\w-]+\?v=|embed\/|v\/)?)([\w-]+)(\S+)?$/,
]
|
|
|
|
/**
 * Derives a slug and a human-readable title fallback from a page URL.
 *
 * The fallback ("cropped pathname") is the last path segment of the URL with
 * its file extension removed, percent-decoded, and with underscores replaced
 * by spaces. A last segment that contains no `.` yields an empty string.
 *
 * @param url - absolute URL of the page; `new URL` throws if it cannot be parsed.
 * @param title - optional title; when truthy it is slugified instead of the
 *   cropped pathname.
 * @returns a `[slug, croppedPathname]` pair.
 */
const createSlug = (
  url: string,
  title?: Maybe<string> | undefined
): [string, string] => {
  const { pathname } = new URL(url)

  // `split` always returns at least one element, so `pop` cannot be undefined
  // here; the fallback only satisfies the type checker
  const lastSegment = pathname.split('/').pop() ?? ''
  const withoutExtension = lastSegment.split('.').slice(0, -1).join('.')

  let decoded: string
  try {
    decoded = decodeURIComponent(withoutExtension)
  } catch {
    // malformed percent-encoding (e.g. a lone `%`) makes decodeURIComponent
    // throw a URIError; keep the raw segment instead of failing the save
    decoded = withoutExtension
  }
  const croppedPathname = decoded.replace(/_/g, ' ')

  return [generateSlug(title || croppedPathname), croppedPathname]
}
|
|
|
|
/**
 * Tells whether a save must be handed off to the backend.
 *
 * @param input - the save request.
 * @returns `true` when the input did not come from `puppeteer-parse` and its
 *   URL matches one of the FORCE_PUPPETEER_URLS patterns.
 */
const shouldParseInBackend = (input: SavePageInput): boolean => {
  // content produced by puppeteer-parse is never sent back for parsing
  if (input.source === 'puppeteer-parse') {
    return false
  }

  for (const pattern of FORCE_PUPPETEER_URLS) {
    if (pattern.test(input.url)) {
      return true
    }
  }
  return false
}
|
|
|
|
/**
 * Creates an article saving request for the current user.
 *
 * @param ctx - save context; supplies the `articleSavingRequest` model and the user id.
 * @param clientRequestId - id chosen by the client, used as the request's id.
 * @returns whatever `articleSavingRequest.create` returns.
 */
export const createSavingRequest = (
  ctx: SaveContext,
  clientRequestId: string
) => {
  const { articleSavingRequest } = ctx.models
  const request = { userId: ctx.uid, id: clientRequestId }

  return articleSavingRequest.create(request)
}
|
|
|
|
/**
 * Saves a page for a user from content that was already captured by the client.
 *
 * Steps:
 *  1. Parse the prepared content and build the page document to store.
 *  2. If the URL must be parsed in the backend (see `shouldParseInBackend`),
 *     create a page save request; otherwise update the page that already
 *     exists for this user and URL, or create a new one.
 *  3. Enqueue a thumbnail task, unless the source is `csv-importer`. A failure
 *     here is logged and does not fail the save.
 *  4. If the parser found highlight data, attach it to the page.
 *
 * Errors thrown while parsing, building the slug or creating labels are not
 * caught here and propagate to the caller.
 *
 * @param ctx - save context (pubsub client, models, acting user id).
 * @param saver - id and username of the user the page is saved for.
 * @param input - the save request.
 * @returns the page id (as `clientRequestId`) and page URL on success, or
 *   `errorCodes` with a `message` when a step fails.
 */
export const savePage = async (
  ctx: SaveContext,
  saver: SaverUserData,
  input: SavePageInput
): Promise<SaveResult> => {
  const parseResult = await parsePreparedContent(
    input.url,
    {
      document: input.originalContent,
      pageInfo: {
        title: input.title,
        canonicalUrl: input.url,
      },
    },
    input.parseResult
  )
  const [newSlug, croppedPathname] = createSlug(input.url, input.title)
  let slug = newSlug
  let pageId = input.clientRequestId

  // builds the success payload from the current page id and slug
  const savedResult = (): SaveResult => ({
    clientRequestId: pageId,
    url: `${homePageURL()}/${saver.username}/${slug}`,
  })

  const articleToSave = parsedContentToPage({
    url: input.url,
    title: input.title,
    userId: saver.userId,
    pageId,
    slug,
    croppedPathname,
    parsedContent: parseResult.parsedContent,
    pageType: parseResult.pageType,
    originalHtml: parseResult.domContent,
    canonicalUrl: parseResult.canonicalUrl,
    rssFeedUrl: input.rssFeedUrl,
    saveTime: input.savedAt ? new Date(input.savedAt) : undefined,
    publishedAt: input.publishedAt ? new Date(input.publishedAt) : undefined,
  })

  // save state
  articleToSave.archivedAt =
    input.state === ArticleSavingRequestStatus.Archived ? new Date() : null
  // add labels to page
  articleToSave.labels = input.labels
    ? await createLabels(ctx, input.labels)
    : undefined

  // always parse in backend if the url is in the force puppeteer list
  if (shouldParseInBackend(input)) {
    try {
      await createPageSaveRequest({
        userId: saver.userId,
        url: articleToSave.url,
        pubsub: ctx.pubsub,
        articleSavingRequestId: input.clientRequestId,
        archivedAt: articleToSave.archivedAt,
        labels: articleToSave.labels,
      })
    } catch (e) {
      // keep the cause in the logs; the caller only gets a generic error code
      logger.error('Failed to create page save request', e)
      return {
        errorCodes: [SaveErrorCode.Unknown],
        message: 'Failed to create page save request',
      }
    }
  } else {
    // check if the page already exists
    const existingPage = await getPageByParam({
      userId: saver.userId,
      url: articleToSave.url,
    })
    if (existingPage) {
      // from here on, results must refer to the page that is already stored,
      // not to the id/slug generated for this request
      pageId = existingPage.id
      slug = existingPage.slug

      // we don't want to update an rss feed page if rss-feeder is trying to re-save it
      if (
        existingPage.rssFeedUrl &&
        existingPage.rssFeedUrl === input.rssFeedUrl
      ) {
        return savedResult()
      }

      const updated = await updatePage(
        existingPage.id,
        {
          // update the page with the new content
          ...articleToSave,
          id: pageId, // we don't want to update the id
          slug, // we don't want to update the slug
          createdAt: existingPage.createdAt, // we don't want to update the createdAt
        },
        ctx
      )
      if (!updated) {
        return {
          errorCodes: [SaveErrorCode.Unknown],
          message: 'Failed to update existing page',
        }
      }
    } else {
      const newPageId = await createPage(articleToSave, ctx)
      if (!newPageId) {
        return {
          errorCodes: [SaveErrorCode.Unknown],
          message: 'Failed to create new page',
        }
      }
      pageId = newPageId
    }
  }

  // create a task to update thumbnail and pre-cache all images
  if (input.source !== 'csv-importer') {
    // we don't want to create thumbnail for imported pages
    try {
      const taskId = await enqueueThumbnailTask(saver.userId, slug)
      logger.info('Created thumbnail task', { taskId })
    } catch (e) {
      // best effort: a missing thumbnail must not fail the save
      logger.error('Failed to create thumbnail task', e)
    }
  }

  if (parseResult.highlightData) {
    const highlight = {
      updatedAt: new Date(),
      createdAt: new Date(),
      userId: ctx.uid,
      elasticPageId: pageId,
      // fields from the parsed highlight override the defaults above
      ...parseResult.highlightData,
      type: HighlightType.Highlight,
    }

    if (!(await addHighlightToPage(pageId, highlight, ctx))) {
      return {
        errorCodes: [SaveErrorCode.EmbeddedHighlightFailed],
        message: 'Failed to save highlight',
      }
    }
  }

  return savedResult()
}
|
|
|
|
/**
 * Converts parsed content into an elastic `Page` document.
 *
 * Fallbacks applied here:
 *  - `title`: the first truthy value among the explicit title, the parsed
 *    title, the prepared document's title, the cropped pathname and the parsed
 *    site name; otherwise the url.
 *  - `url`: the canonical url when truthy, otherwise the url, passed through `cleanUrl`.
 *  - `hash`: the upload file hash when truthy, otherwise a hash of the parsed
 *    content (or of the url when there is no content).
 *  - `createdAt` / `savedAt`: the validated save time, otherwise the current time.
 *
 * The page is returned in the `Succeeded` state with reading progress at zero.
 */
export const parsedContentToPage = (args: {
  url: string
  userId: string
  slug: string
  croppedPathname: string
  pageType: PageType
  parsedContent: Readability.ParseResult | null
  originalHtml?: string | null
  pageId?: string | null
  title?: string | null
  preparedDocument?: PreparedDocumentInput | null
  canonicalUrl?: string | null
  uploadFileHash?: string | null
  uploadFileId?: string | null
  saveTime?: Date
  rssFeedUrl?: string | null
  publishedAt?: Date | null
}): Page => {
  const {
    url,
    userId,
    originalHtml,
    pageId,
    parsedContent,
    slug,
    croppedPathname,
    title,
    preparedDocument,
    canonicalUrl,
    pageType,
    uploadFileHash,
    uploadFileId,
    saveTime,
    rssFeedUrl,
    publishedAt,
  } = args

  const articleHtml = parsedContent?.content
  const resolvedTitle =
    title ||
    parsedContent?.title ||
    preparedDocument?.pageInfo.title ||
    croppedPathname ||
    parsedContent?.siteName ||
    url

  const page: Page = {
    id: pageId || '',
    slug,
    userId,
    originalHtml,
    content: articleHtml || '',
    description: parsedContent?.excerpt,
    title: resolvedTitle,
    author: parsedContent?.byline ?? undefined,
    url: cleanUrl(canonicalUrl || url),
    pageType,
    hash: uploadFileHash || stringToHash(articleHtml || url),
    image: parsedContent?.previewImage ?? undefined,
    publishedAt: validatedDate(
      publishedAt || parsedContent?.publishedDate || undefined
    ),
    uploadFileId,
    // a freshly converted page starts unread
    readingProgressPercent: 0,
    readingProgressAnchorIndex: 0,
    state: ArticleSavingRequestStatus.Succeeded,
    createdAt: validatedDate(saveTime) || new Date(),
    savedAt: validatedDate(saveTime) || new Date(),
    siteName: parsedContent?.siteName ?? undefined,
    language: parsedContent?.language ?? undefined,
    siteIcon: parsedContent?.siteIcon ?? undefined,
    wordsCount: wordsCount(parsedContent?.textContent || ''),
    rssFeedUrl: rssFeedUrl || undefined,
  }

  return page
}
|