Merge pull request #2336 from omnivore-app/fix/thumbnailer-task-queue
fix/thumbnailer task queue
This commit is contained in:
@ -56,7 +56,9 @@ const createHttpTaskWithToken = async ({
|
||||
}
|
||||
|
||||
// Construct the fully qualified queue name.
|
||||
priority === 'low' && (queue = `${queue}-low`)
|
||||
if (priority === 'low') {
|
||||
queue = `${queue}-low`
|
||||
}
|
||||
|
||||
const parent = client.queuePath(project, location, queue)
|
||||
// Convert message to buffer.
|
||||
@ -534,6 +536,7 @@ export const enqueueThumbnailTask = async (
|
||||
payload,
|
||||
taskHandlerUrl: env.queue.thumbnailTaskHandlerUrl,
|
||||
requestHeaders: headers,
|
||||
queue: 'omnivore-thumbnail-queue',
|
||||
})
|
||||
|
||||
if (!createdTasks || !createdTasks[0].name) {
|
||||
|
||||
@ -223,15 +223,29 @@ const sendSavePageMutation = async (userId: string, input: unknown) => {
|
||||
})
|
||||
|
||||
const auth = (await signToken({ uid: userId }, JWT_SECRET)) as string
|
||||
const response = await axios.post(`${REST_BACKEND_ENDPOINT}/graphql`, data, {
|
||||
headers: {
|
||||
Cookie: `auth=${auth};`,
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
})
|
||||
try {
|
||||
const response = await axios.post(
|
||||
`${REST_BACKEND_ENDPOINT}/graphql`,
|
||||
data,
|
||||
{
|
||||
headers: {
|
||||
Cookie: `auth=${auth};`,
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
timeout: 30000, // 30s
|
||||
}
|
||||
)
|
||||
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
|
||||
return !!response.data.data.savePage
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
|
||||
return !!response.data.data.savePage
|
||||
} catch (error) {
|
||||
if (axios.isAxiosError(error)) {
|
||||
console.error('save page mutation error', error.message)
|
||||
} else {
|
||||
console.error(error)
|
||||
}
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
const contentHandler = async (
|
||||
|
||||
@ -46,6 +46,8 @@ const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/pl
|
||||
|
||||
const IMPORTER_METRICS_COLLECTOR_URL = process.env.IMPORTER_METRICS_COLLECTOR_URL;
|
||||
|
||||
const REQUEST_TIMEOUT = 30000; // 30 seconds
|
||||
|
||||
const userAgentForUrl = (url) => {
|
||||
try {
|
||||
const u = new URL(url);
|
||||
@ -69,7 +71,8 @@ const fetchContentWithScrapingBee = async (url) => {
|
||||
'render_js': 'false',
|
||||
'premium_proxy': 'true',
|
||||
'country_code':'us'
|
||||
}
|
||||
},
|
||||
timeout: REQUEST_TIMEOUT,
|
||||
})
|
||||
|
||||
const dom = parseHTML(response.data).document;
|
||||
@ -131,14 +134,19 @@ const getBrowserPromise = (async () => {
|
||||
})();
|
||||
|
||||
const uploadToSignedUrl = async ({ id, uploadSignedUrl }, contentType, contentObjUrl) => {
|
||||
const stream = await axios.get(contentObjUrl, { responseType: 'stream' });
|
||||
return await axios.put(uploadSignedUrl, stream.data, {
|
||||
headers: {
|
||||
'Content-Type': contentType,
|
||||
},
|
||||
maxBodyLength: 1000000000,
|
||||
maxContentLength: 100000000,
|
||||
})
|
||||
try {
|
||||
const stream = await axios.get(contentObjUrl, { responseType: 'stream' });
|
||||
return axios.put(uploadSignedUrl, stream.data, {
|
||||
headers: {
|
||||
'Content-Type': contentType,
|
||||
},
|
||||
maxBodyLength: 1000000000,
|
||||
maxContentLength: 100000000,
|
||||
});
|
||||
} catch (error) {
|
||||
console.error('error uploading to signed url', error.message);
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
const getUploadIdAndSignedUrl = async (userId, url, articleSavingRequestId) => {
|
||||
@ -164,21 +172,39 @@ const getUploadIdAndSignedUrl = async (userId, url, articleSavingRequestId) => {
|
||||
}
|
||||
});
|
||||
|
||||
const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data,
|
||||
try {
|
||||
const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data,
|
||||
{
|
||||
headers: {
|
||||
Cookie: `auth=${auth};`,
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
timeout: REQUEST_TIMEOUT,
|
||||
});
|
||||
return response.data.data.uploadFileRequest;
|
||||
|
||||
if (response.data.data.uploadFileRequest.errorCodes && response.data.data.uploadFileRequest.errorCodes.length > 0) {
|
||||
console.error('Error while getting upload id and signed url', response.data.data.uploadFileRequest.errorCodes[0]);
|
||||
return null;
|
||||
}
|
||||
|
||||
return response.data.data.uploadFileRequest;
|
||||
} catch (e) {
|
||||
console.error('error getting upload id and signed url', e.message);
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
const uploadPdf = async (url, userId, articleSavingRequestId) => {
|
||||
validateUrlString(url);
|
||||
|
||||
const uploadResult = await getUploadIdAndSignedUrl(userId, url, articleSavingRequestId);
|
||||
await uploadToSignedUrl(uploadResult, 'application/pdf', url);
|
||||
if (!uploadResult) {
|
||||
throw new Error('error while getting upload id and signed url');
|
||||
}
|
||||
const uploaded = await uploadToSignedUrl(uploadResult, 'application/pdf', url);
|
||||
if (!uploaded) {
|
||||
throw new Error('error while uploading pdf');
|
||||
}
|
||||
return uploadResult.id;
|
||||
};
|
||||
|
||||
@ -202,14 +228,26 @@ const sendCreateArticleMutation = async (userId, input) => {
|
||||
});
|
||||
|
||||
const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
|
||||
const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data,
|
||||
try {
|
||||
const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data,
|
||||
{
|
||||
headers: {
|
||||
Cookie: `auth=${auth};`,
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
timeout: REQUEST_TIMEOUT,
|
||||
});
|
||||
return response.data.data.createArticle;
|
||||
|
||||
if (response.data.data.createArticle.errorCodes && response.data.data.createArticle.errorCodes.length > 0) {
|
||||
console.error('error while creating article', response.data.data.createArticle.errorCodes[0]);
|
||||
return null;
|
||||
}
|
||||
|
||||
return response.data.data.createArticle;
|
||||
} catch (error) {
|
||||
console.error('error creating article', error.message);
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
const sendSavePageMutation = async (userId, input) => {
|
||||
@ -231,14 +269,26 @@ const sendSavePageMutation = async (userId, input) => {
|
||||
});
|
||||
|
||||
const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
|
||||
const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data,
|
||||
try {
|
||||
const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data,
|
||||
{
|
||||
headers: {
|
||||
Cookie: `auth=${auth};`,
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
timeout: REQUEST_TIMEOUT,
|
||||
});
|
||||
return response.data.data.savePage;
|
||||
|
||||
if (response.data.data.savePage.errorCodes && response.data.data.savePage.errorCodes.length > 0) {
|
||||
console.error('error while saving page', response.data.data.savePage.errorCodes[0]);
|
||||
return null;
|
||||
}
|
||||
|
||||
return response.data.data.savePage;
|
||||
} catch (error) {
|
||||
console.error('error saving page', error.message);
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
const saveUploadedPdf = async (userId, url, uploadFileId, articleSavingRequestId) => {
|
||||
@ -265,9 +315,10 @@ const sendImportStatusUpdate = async (userId, taskId, status) => {
|
||||
'Authorization': auth,
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
timeout: REQUEST_TIMEOUT,
|
||||
});
|
||||
} catch (e) {
|
||||
console.error('Error while sending import status update', e);
|
||||
console.error('error while sending import status update', e);
|
||||
}
|
||||
};
|
||||
|
||||
@ -318,7 +369,7 @@ async function fetchContent(req, res) {
|
||||
console.info('error with handler: ', e);
|
||||
}
|
||||
|
||||
let context, page, finalUrl;
|
||||
let context, page, finalUrl, statusCode = 200;
|
||||
try {
|
||||
if ((!content || !title) && contentType !== 'application/pdf') {
|
||||
const result = await retrievePage(url, logRecord, functionStartTime);
|
||||
@ -332,7 +383,13 @@ async function fetchContent(req, res) {
|
||||
|
||||
if (contentType === 'application/pdf') {
|
||||
const uploadedFileId = await uploadPdf(finalUrl, userId, articleSavingRequestId);
|
||||
await saveUploadedPdf(userId, finalUrl, uploadedFileId, articleSavingRequestId);
|
||||
const uploadedPdf = await saveUploadedPdf(userId, finalUrl, uploadedFileId, articleSavingRequestId);
|
||||
if (!uploadedPdf) {
|
||||
statusCode = 500;
|
||||
logRecord.error = 'error while saving uploaded pdf';
|
||||
} else {
|
||||
importStatus = 'imported';
|
||||
}
|
||||
} else {
|
||||
if (!content || !title) {
|
||||
const result = await retrieveHtml(page, logRecord);
|
||||
@ -347,19 +404,39 @@ async function fetchContent(req, res) {
|
||||
} else {
|
||||
console.info('using prefetched content and title');
|
||||
}
|
||||
|
||||
logRecord.fetchContentTime = Date.now() - functionStartTime;
|
||||
}
|
||||
} catch (e) {
|
||||
logRecord.error = e.message;
|
||||
console.error(`Error while retrieving page`, logRecord);
|
||||
|
||||
// fallback to scrapingbee for non pdf content
|
||||
if (contentType !== 'application/pdf') {
|
||||
const fetchStartTime = Date.now();
|
||||
const sbResult = await fetchContentWithScrapingBee(url);
|
||||
content = sbResult.domContent;
|
||||
title = sbResult.title;
|
||||
logRecord.fetchContentTime = Date.now() - fetchStartTime;
|
||||
}
|
||||
} finally {
|
||||
// close browser context if it was opened
|
||||
if (context) {
|
||||
await context.close();
|
||||
}
|
||||
// save non pdf content
|
||||
if (contentType !== 'application/pdf') {
|
||||
// parse content if it is not empty
|
||||
let readabilityResult = null;
|
||||
if (content) {
|
||||
const document = parseHTML(content).document;
|
||||
|
||||
let document = parseHTML(content).document;
|
||||
// preParse content
|
||||
const preParsedDom = (await preParseContent(url, document)) || document;
|
||||
|
||||
readabilityResult = await getReadabilityResult(url, preParsedDom);
|
||||
const preParsedDom = await preParseContent(url, document)
|
||||
if (preParsedDom) {
|
||||
document = preParsedDom
|
||||
}
|
||||
readabilityResult = await getReadabilityResult(url, document);
|
||||
}
|
||||
|
||||
|
||||
const apiResponse = await sendSavePageMutation(userId, {
|
||||
url,
|
||||
clientRequestId: articleSavingRequestId,
|
||||
@ -369,61 +446,23 @@ async function fetchContent(req, res) {
|
||||
state,
|
||||
labels,
|
||||
});
|
||||
|
||||
logRecord.totalTime = Date.now() - functionStartTime;
|
||||
logRecord.result = apiResponse.createArticle;
|
||||
}
|
||||
|
||||
importStatus = 'imported';
|
||||
} catch (e) {
|
||||
logRecord.error = e.message;
|
||||
console.error(`Error while retrieving page`, logRecord);
|
||||
|
||||
// fallback to scrapingbee
|
||||
const sbResult = await fetchContentWithScrapingBee(url);
|
||||
const content = sbResult.domContent;
|
||||
const title = sbResult.title;
|
||||
logRecord.fetchContentTime = Date.now() - functionStartTime;
|
||||
|
||||
let readabilityResult = null;
|
||||
if (content) {
|
||||
let document = parseHTML(content).document;
|
||||
|
||||
// preParse content
|
||||
const preParsedDom = await preParseContent(url, document)
|
||||
if (preParsedDom) {
|
||||
document = preParsedDom
|
||||
if (!apiResponse) {
|
||||
logRecord.error = 'error while saving page';
|
||||
statusCode = 500;
|
||||
} else {
|
||||
importStatus = readabilityResult ? 'imported' : 'failed';
|
||||
}
|
||||
|
||||
readabilityResult = await getReadabilityResult(url, document);
|
||||
}
|
||||
|
||||
const apiResponse = await sendSavePageMutation(userId, {
|
||||
url,
|
||||
clientRequestId: articleSavingRequestId,
|
||||
title,
|
||||
originalContent: content,
|
||||
parseResult: readabilityResult,
|
||||
state,
|
||||
labels,
|
||||
});
|
||||
|
||||
logRecord.totalTime = Date.now() - functionStartTime;
|
||||
logRecord.result = apiResponse.createArticle;
|
||||
|
||||
importStatus = 'failed';
|
||||
} finally {
|
||||
if (context) {
|
||||
await context.close();
|
||||
}
|
||||
console.info(`parse-page`, logRecord);
|
||||
|
||||
// send import status to update the metrics
|
||||
if (taskId) {
|
||||
if (taskId && importStatus) {
|
||||
await sendImportStatusUpdate(userId, taskId, importStatus);
|
||||
}
|
||||
|
||||
res.sendStatus(200);
|
||||
res.sendStatus(statusCode);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -40,8 +40,12 @@ Sentry.GCPFunction.init({
|
||||
})
|
||||
|
||||
const signToken = promisify(jwt.sign)
|
||||
const REQUEST_TIMEOUT = 30000 // 30s
|
||||
|
||||
const articleQuery = async (userId: string, slug: string): Promise<Page> => {
|
||||
const articleQuery = async (
|
||||
userId: string,
|
||||
slug: string
|
||||
): Promise<Page | null> => {
|
||||
const JWT_SECRET = process.env.JWT_SECRET
|
||||
const REST_BACKEND_ENDPOINT = process.env.REST_BACKEND_ENDPOINT
|
||||
|
||||
@ -71,18 +75,28 @@ const articleQuery = async (userId: string, slug: string): Promise<Page> => {
|
||||
})
|
||||
const auth = (await signToken({ uid: userId }, JWT_SECRET)) as string
|
||||
|
||||
const response = await axios.post<ArticleResponse>(
|
||||
`${REST_BACKEND_ENDPOINT}/graphql`,
|
||||
data,
|
||||
{
|
||||
headers: {
|
||||
Cookie: `auth=${auth};`,
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
}
|
||||
)
|
||||
try {
|
||||
const response = await axios.post<ArticleResponse>(
|
||||
`${REST_BACKEND_ENDPOINT}/graphql`,
|
||||
data,
|
||||
{
|
||||
headers: {
|
||||
Cookie: `auth=${auth};`,
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
timeout: REQUEST_TIMEOUT,
|
||||
}
|
||||
)
|
||||
|
||||
return response.data.data.article.article
|
||||
return response.data.data.article.article
|
||||
} catch (error) {
|
||||
if (axios.isAxiosError(error)) {
|
||||
console.error('article query error', error.message)
|
||||
} else {
|
||||
console.error(error)
|
||||
}
|
||||
return null
|
||||
}
|
||||
}
|
||||
|
||||
const updatePageMutation = async (
|
||||
@ -119,18 +133,28 @@ const updatePageMutation = async (
|
||||
})
|
||||
|
||||
const auth = (await signToken({ uid: userId }, JWT_SECRET)) as string
|
||||
const response = await axios.post<UpdatePageResponse>(
|
||||
`${REST_BACKEND_ENDPOINT}/graphql`,
|
||||
data,
|
||||
{
|
||||
headers: {
|
||||
Cookie: `auth=${auth};`,
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
}
|
||||
)
|
||||
try {
|
||||
const response = await axios.post<UpdatePageResponse>(
|
||||
`${REST_BACKEND_ENDPOINT}/graphql`,
|
||||
data,
|
||||
{
|
||||
headers: {
|
||||
Cookie: `auth=${auth};`,
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
timeout: REQUEST_TIMEOUT,
|
||||
}
|
||||
)
|
||||
|
||||
return !!response.data.data.updatePage
|
||||
return !!response.data.data.updatePage
|
||||
} catch (error) {
|
||||
if (axios.isAxiosError(error)) {
|
||||
console.error('update page mutation error', error.message)
|
||||
} else {
|
||||
console.error(error)
|
||||
}
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
const isThumbnailRequest = (body: any): body is ThumbnailRequest => {
|
||||
@ -142,6 +166,8 @@ const getImageSize = async (url: string): Promise<[number, number] | null> => {
|
||||
// get image file by url
|
||||
const response = await axios.get(url, {
|
||||
responseType: 'arraybuffer',
|
||||
timeout: 5000, // 5s
|
||||
maxContentLength: 10000000, // 10mb
|
||||
})
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/no-unsafe-argument
|
||||
@ -265,7 +291,11 @@ export const thumbnailHandler = Sentry.GCPFunction.wrapHttpFunction(
|
||||
}
|
||||
|
||||
const page = await articleQuery(uid, slug)
|
||||
console.debug('find page', page.id)
|
||||
if (!page) {
|
||||
console.info('page not found')
|
||||
return res.status(200).send('NOT_FOUND')
|
||||
}
|
||||
|
||||
// update page with thumbnail if not already set
|
||||
if (page.image) {
|
||||
console.debug('thumbnail already set')
|
||||
|
||||
Reference in New Issue
Block a user