Make pdf-handler subscribe to a pubsub notification for cloud storage

This commit is contained in:
Hongbo Wu
2023-04-20 16:31:06 +08:00
parent ebc760e522
commit 0dc8724fcb
2 changed files with 77 additions and 47 deletions

View File

@ -29,6 +29,7 @@
"@google-cloud/functions-framework": "3.1.2",
"@google-cloud/pubsub": "^3.4.1",
"@google-cloud/storage": "^5.18.1",
"@sentry/serverless": "^7.30.0",
"axios": "^0.27.2",
"concurrently": "^7.0.0",
"pdfjs-dist": "^2.9.359"

View File

@ -1,11 +1,13 @@
import {
EventFunction,
CloudFunctionsContext,
} from '@google-cloud/functions-framework/build/src/functions'
import { GetSignedUrlConfig, Storage } from '@google-cloud/storage'
import { PubSub } from '@google-cloud/pubsub'
import { GetSignedUrlConfig, Storage } from '@google-cloud/storage'
import * as Sentry from '@sentry/serverless'
import { parsePdf } from './pdf'
// Initialise the Sentry GCP Functions integration once, at module load.
Sentry.GCPFunction.init({
// The DSN is taken from the SENTRY_DSN environment variable.
dsn: process.env.SENTRY_DSN,
// NOTE(review): a sample rate of 0 presumably turns off performance tracing
// while leaving error reporting on; confirm against the Sentry docs.
tracesSampleRate: 0,
})
// Module-level Pub/Sub and Cloud Storage clients, created once at load time.
const pubsub = new PubSub()
const storage = new Storage()
// Presumably the Pub/Sub topic that updatePageContent publishes to; the usage
// is not visible in this excerpt, so verify.
const CONTENT_UPDATE_TOPIC = 'updatePageContent'
@ -16,18 +18,16 @@ interface StorageEventData {
contentType: string
}
/**
 * Type guard for a decoded storage event payload.
 *
 * Accepts only non-null objects whose `name`, `bucket` and `contentType`
 * are all strings. Checking the types, and not just the presence of the keys,
 * keeps the predicate honest: callers go on to invoke string methods such as
 * `startsWith` and `toLowerCase` on these fields.
 *
 * @param event - Arbitrary value, typically the result of `JSON.parse`.
 * @returns `true` when `event` can safely be treated as `StorageEventData`.
 */
function isStorageEventData(event: any): event is StorageEventData {
  // The `in` operator throws a TypeError on null and on primitives, and
  // JSON.parse can legitimately produce either (`null`, `42`, `"text"`).
  if (typeof event !== 'object' || event === null) {
    return false
  }
  const candidate = event as Record<string, unknown>
  return (
    typeof candidate.name === 'string' &&
    typeof candidate.bucket === 'string' &&
    typeof candidate.contentType === 'string'
  )
}
// Ensure this is a finalize event and that it is stored in the `u/` directory and is a PDF
const shouldHandle = (data: StorageEventData, ctx: CloudFunctionsContext) => {
if (ctx.eventType !== 'google.storage.object.finalize') {
return false
}
if (
!data.name.startsWith('u/') ||
data.contentType.toLowerCase() != 'application/pdf'
) {
return false
}
return true
/**
 * Decides whether a storage object should be processed.
 *
 * An object qualifies when its name begins with `u/` and its content type,
 * compared case-insensitively, is `application/pdf`.
 *
 * @param data - Storage event payload describing the object.
 * @returns `true` when both conditions hold, otherwise `false`.
 */
const shouldHandle = (data: StorageEventData) => {
  const hasExpectedPrefix = data.name.startsWith('u/')
  if (!hasExpectedPrefix) {
    // Bail out before looking at the content type at all.
    return false
  }
  const normalizedType = data.contentType.toLowerCase()
  return normalizedType === 'application/pdf'
}
const getDocumentUrl = async (
@ -69,37 +69,66 @@ export const updatePageContent = (
})
}
export const pdfHandler: EventFunction = async (event, context) => {
const data = event as StorageEventData
const ctx = context as CloudFunctionsContext
if (shouldHandle(data, ctx)) {
console.log('handling pdf data', data)
const url = await getDocumentUrl(data)
console.log('PDF url: ', url)
if (!url) {
console.log('Could not fetch PDF', data.bucket, data.name)
return
const getStorageEventData = (
pubSubMessage: string
): StorageEventData | undefined => {
try {
const str = Buffer.from(pubSubMessage, 'base64').toString().trim()
const obj = JSON.parse(str) as unknown
if (isStorageEventData(obj)) {
return obj
}
const parsed = await parsePdf(url)
const res = await updatePageContent(
data.name,
parsed.content,
parsed.title,
parsed.author,
parsed.description
)
console.log(
'publish result',
res,
'title',
parsed.title,
'author',
parsed.author
)
} else {
console.log('not handling pdf data', data)
} catch (err) {
console.log('error deserializing event: ', { pubSubMessage, err })
}
return undefined
}
/**
 * HTTP entry point, wrapped with `Sentry.GCPFunction.wrapHttpFunction`.
 *
 * Expects a request body of the shape `{ message: { data: <base64 string> } }`.
 * The `data` string is decoded by `getStorageEventData`; when the decoded
 * event passes `shouldHandle`, the PDF is fetched, parsed and its content is
 * sent on through `updatePageContent`.
 *
 * Responses:
 * - `200 'ok'` when there is no usable message, the message cannot be decoded,
 *   the event is not one to handle, or processing finished.
 * - `404 'Could not fetch PDF'` when `getDocumentUrl` yields no URL.
 * - `500 'Error handling event'` when processing throws.
 *
 * NOTE(review): no event-type filtering happens here; `shouldHandle` looks
 * only at the object name and content type. Presumably the notification is
 * configured to deliver finalize events only; confirm.
 */
export const pdfHandler = Sentry.GCPFunction.wrapHttpFunction(
  async (req, res) => {
    /* eslint-disable @typescript-eslint/no-unsafe-member-access */
    // Optional chaining never throws here, whereas `'message' in req.body`
    // raises a TypeError for a missing or primitive body (and likewise for a
    // primitive `message`). The typeof check replaces an unchecked `as string`.
    const pubSubMessage: unknown = req.body?.message?.data
    if (typeof pubSubMessage !== 'string') {
      console.log('no pubsub message')
      res.send('ok')
      return
    }

    const data = getStorageEventData(pubSubMessage)
    if (!data) {
      // Undecodable payloads are acknowledged with 200, as before.
      res.send('ok')
      return
    }

    try {
      if (shouldHandle(data)) {
        console.log('handling pdf data', data)
        const url = await getDocumentUrl(data)
        console.log('PDF url: ', url)
        if (!url) {
          console.log('Could not fetch PDF', data.bucket, data.name)
          return res.status(404).send('Could not fetch PDF')
        }
        const parsed = await parsePdf(url)
        const result = await updatePageContent(
          data.name,
          parsed.content,
          parsed.title,
          parsed.author,
          parsed.description
        )
        console.log(
          'publish result',
          result,
          'title',
          parsed.title,
          'author',
          parsed.author
        )
      } else {
        console.log('not handling pdf data', data)
      }
    } catch (err) {
      console.log('error handling event', { err, data })
      return res.status(500).send('Error handling event')
    }

    res.send('ok')
  }
)