Parse Bloomberg newsletter

This commit is contained in:
Hongbo Wu
2022-02-17 20:14:06 +08:00
parent bd8178a464
commit e3839647e1
3 changed files with 31 additions and 15 deletions

View File

@ -6,7 +6,13 @@ export class SubstackHandler {
shouldPrehandle = (url: URL, dom: DOMWindow): boolean => {
const host = this.name + '.com'
// check if url ends with bloomberg.com
return url.hostname.endsWith(host)
return (
url.hostname.endsWith(host) ||
dom.document
.querySelector('.logo-image')
?.getAttribute('alt')
?.toLowerCase() === this.name
)
}
prehandle = (url: URL, dom: DOMWindow): Promise<DOMWindow> => {

View File

@ -7,10 +7,11 @@ const EMAIL_FORWARDING_SENDER_ADDRESSES = [
'Gmail Team <forwarding-noreply@google.com>',
]
const NEWSLETTER_SENDER_REGEX =
'<.+@((axios.com)|(mail.bloombergbusiness.com))>'
const CONFIRMATION_CODE_PATTERN = '^\\(#\\d+\\)'
const AXIOS_URL_PATTERN = 'View in browser at <.+>'
const BLOOMBERG_URL_PATTERN = '<a class="view-in-browser__url".+>'
/<.+@((axios.com)|(mail.bloombergbusiness.com))>/
// Matches a "(#<digits>)" confirmation code at the very start of the tested text.
// In a regex literal each metacharacter is escaped with a single backslash; the
// doubled backslashes required in a string pattern would instead match literal
// "\" characters here and the pattern would never match a real code.
const CONFIRMATION_CODE_PATTERN = /^\(#\d+\)/
// Captures (group 1) the text of the first anchor that follows
// "View in browser at". The opening tag is limited to `[^>]*` and the capture is
// lazy so that additional anchors later on the same line are not swallowed.
const AXIOS_URL_PATTERN = /View in browser at <a[^>]*>(.*?)<\/a>/
// Captures (group 1) the href value of an anchor whose opening tag begins with
// class="view-in-browser__url" followed by a single space and the href attribute.
// The href value may be wrapped in either single or double quotes.
const BLOOMBERG_URL_PATTERN =
/<a class="view-in-browser__url" href=["']([^"']*)["']/
export const handleConfirmation = async (email: string, subject: string) => {
console.log('confirmation email')
@ -116,19 +117,15 @@ export const getNewsletterUrl = (
}
// axios newsletter url from html
let re = new RegExp(AXIOS_URL_PATTERN)
let matches = html.match(re)
let matches = html.match(AXIOS_URL_PATTERN)
if (matches) {
const match = matches[0]
return match.slice(match.indexOf('>') + 1, match.lastIndexOf('<'))
return matches[1]
}
// bloomberg newsletter url from html
re = new RegExp(BLOOMBERG_URL_PATTERN)
matches = html.match(re)
matches = html.match(BLOOMBERG_URL_PATTERN)
if (matches) {
const match = matches[0]
return match.slice(match.indexOf('href=') + 1, match.lastIndexOf('style'))
return matches[1]
}
return undefined
}

View File

@ -46,10 +46,23 @@ describe('Newsletter email test', () => {
it('returns url when email is from Axios', () => {
const rawUrl = ''
const html = `View in browser at <a>https://axios.com/blog/2019/02/28/the-best-way-to-build-a-web-app</a>`
const html = `View in browser at <a>https://axios.com/blog/the-best-way-to-build-a-web-app</a>`
expect(getNewsletterUrl(rawUrl, html)).to.equal(
'https://axios.com/blog/2019/02/28/the-best-way-to-build-a-web-app'
'https://axios.com/blog/the-best-way-to-build-a-web-app'
)
})
it('returns url when email is from Bloomberg', () => {
// link target that should be extracted from the anchor's href attribute
const expectedUrl = 'https://www.bloomberg.com/news/google-is-now-a-partner'
const html = `
<a class="view-in-browser__url" href="${expectedUrl}">
View in browser
</a>
`
// an empty raw URL is passed so the result must be derived from the html body
const rawUrl = ''
const actualUrl = getNewsletterUrl(rawUrl, html)
expect(actualUrl).to.equal(expectedUrl)
})
})