Merge pull request #1373 from omnivore-app/fix/substack-embedded-tweets
Handle embedded tweets in substack emails
This commit is contained in:
File diff suppressed because one or more lines are too long
@ -38,11 +38,17 @@ describe('parseMetadata', async () => {
|
||||
|
||||
describe('parsePreparedContent', async () => {
|
||||
it('gets published date when JSONLD fails to load', async () => {
|
||||
nock('https://stratechery.com:443', {"encodedQueryParams":true})
|
||||
.get('/wp-json/oembed/1.0/embed')
|
||||
.query({"url":"https%3A%2F%2Fstratechery.com%2F2016%2Fits-a-tesla%2F"})
|
||||
.reply(401)
|
||||
|
||||
const html = load('./test/utils/data/stratechery-blog-post.html')
|
||||
const result = await parsePreparedContent('https://blog.omnivore.app/', {
|
||||
document: html,
|
||||
pageInfo: {},
|
||||
})
|
||||
|
||||
expect(result.parsedContent?.publishedDate?.getTime()).to.equal(
|
||||
new Date('2016-04-05T15:27:51+00:00').getTime()
|
||||
)
|
||||
|
||||
@ -9,14 +9,15 @@ export class SubstackHandler extends ContentHandler {
|
||||
|
||||
/**
 * Decides whether this handler should pre-process the page before parsing.
 *
 * Returns true when either:
 *  - the URL's hostname ends with `<this.name>.com` (presumably
 *    `substack.com` -- confirm against the handler's `name`), or
 *  - the first image inside `.email-body` is served from that host or from
 *    the Substack CDN (`substackcdn.com`).
 *
 * @param url absolute URL of the page; `new URL` throws on an invalid value
 * @param dom parsed document to inspect
 */
shouldPreParse(url: string, dom: Document): boolean {
  const host = this.name + '.com'
  const cdnHost = 'substackcdn.com'

  if (new URL(url).hostname.endsWith(host)) {
    return true
  }

  // Test each host separately: `includes(host || cdnHost)` only ever tests
  // `host`, because a non-empty string short-circuits the `||`.
  const src = dom.querySelector('.email-body img')?.getAttribute('src')
  return !!src && (src.includes(host) || src.includes(cdnHost))
}
|
||||
|
||||
@ -34,6 +35,8 @@ export class SubstackHandler extends ContentHandler {
|
||||
body?.querySelector('.container-border')?.remove()
|
||||
body?.querySelector('.footer')?.remove()
|
||||
|
||||
dom = this.fixupStaticTweets(dom)
|
||||
|
||||
return Promise.resolve(dom)
|
||||
}
|
||||
|
||||
@ -65,13 +68,20 @@ export class SubstackHandler extends ContentHandler {
|
||||
}
|
||||
// If the article has a header link and Substack icons, it's probably a newsletter
|
||||
const href = this.findNewsletterHeaderHref(dom)
|
||||
const heartIcon = dom.querySelector(
|
||||
const oldHeartIcon = dom.querySelector(
|
||||
'table tbody td span a img[src*="HeartIcon"]'
|
||||
)
|
||||
const recommendIcon = dom.querySelector(
|
||||
const oldRecommendIcon = dom.querySelector(
|
||||
'table tbody td span a img[src*="RecommendIconRounded"]'
|
||||
)
|
||||
return Promise.resolve(!!(href && (heartIcon || recommendIcon)))
|
||||
const heartIcon = dom.querySelector('a img[src*="LucideHeart"]')
|
||||
const commentsIcon = dom.querySelector('a img[src*="LucideComments"]')
|
||||
return Promise.resolve(
|
||||
!!(
|
||||
href &&
|
||||
(oldHeartIcon || oldRecommendIcon || heartIcon || commentsIcon)
|
||||
)
|
||||
)
|
||||
}
|
||||
|
||||
async parseNewsletterUrl(
|
||||
@ -85,4 +95,38 @@ export class SubstackHandler extends ContentHandler {
|
||||
}
|
||||
return this.findNewsletterUrl(html)
|
||||
}
|
||||
|
||||
/**
 * Rewrites static tweets (`div[class="tweet static"]`) so their class names
 * carry the `_omnivore-static-` prefix and their inline styles are removed
 * (styling is handled in the reader).
 *
 * The document is modified in place and also returned.
 *
 * @param dom document to rewrite
 * @returns the same document instance
 */
fixupStaticTweets(dom: Document): Document {
  const preClassName = '_omnivore-static-'
  const staticTweets = Array.from(
    dom.querySelectorAll('div[class="tweet static"]')
  )

  for (const tweet of staticTweets) {
    tweet.className = preClassName + 'tweet'
    tweet.removeAttribute('style')

    // Visit every descendant: prefix classes that start with `tweet-` and
    // drop inline styles.
    for (const node of Array.from(tweet.querySelectorAll('*'))) {
      // Read the attribute rather than `className`: on SVG elements
      // `className` is an SVGAnimatedString, not a string, so calling
      // `startsWith` on it would throw.
      const className = node.getAttribute('class') ?? ''
      if (className.startsWith('tweet-')) {
        node.setAttribute('class', preClassName + className)
      }
      node.removeAttribute('style')
    }
  }

  return dom
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -16,6 +16,7 @@ import { ConvertkitHandler } from '../src/newsletters/convertkit-handler'
|
||||
import { GhostHandler } from '../src/newsletters/ghost-handler'
|
||||
import { CooperPressHandler } from '../src/newsletters/cooper-press-handler'
|
||||
import { getNewsletterHandler } from '../src'
|
||||
import { parseHTML } from 'linkedom'
|
||||
|
||||
chai.use(chaiAsPromised)
|
||||
chai.use(chaiString)
|
||||
@ -129,6 +130,41 @@ describe('Newsletter email test', () => {
|
||||
expect(handler).to.be.undefined
|
||||
})
|
||||
|
||||
it('returns SubstackHandler for substack newsletter with static tweets', async () => {
  // Only the HTML body is supplied; every header field is left empty, so
  // detection has to rely on the markup alone.
  const fixturePath = './test/data/substack-with-static-tweets-newsletter.html'
  const detected = await getNewsletterHandler({
    html: load(fixturePath),
    postHeader: '',
    from: '',
    unSubHeader: '',
  })

  expect(detected).to.be.instanceOf(SubstackHandler)
})
|
||||
|
||||
it('fixes up static tweets in Substack newsletters', async () => {
  const url = 'https://astralcodexten.substack.com/p/nick-cammarata-on-jhana'
  const fixturePath = './test/data/substack-with-static-tweets-newsletter.html'
  const html = load(fixturePath)

  // Detection uses the HTML only; all header fields are empty.
  const emailInput = { html, postHeader: '', from: '', unSubHeader: '' }
  const handler = await getNewsletterHandler(emailInput)
  expect(handler).to.be.instanceOf(SubstackHandler)

  const dom = parseHTML(html).document
  expect(handler?.shouldPreParse(url, dom)).to.be.true

  // After pre-parsing, each static tweet should carry the renamed class.
  const preparsed = await handler?.preParse(url, dom)
  const renamedTweets = preparsed
    ? Array.from(
        preparsed.querySelectorAll('div[class="_omnivore-static-tweet"]')
      )
    : []

  expect(renamedTweets.length).to.eq(7)
})
|
||||
|
||||
it('returns BeehiivHandler for beehiiv.com newsletter', async () => {
|
||||
const html = load('./test/data/beehiiv-newsletter.html')
|
||||
const handler = await getNewsletterHandler({
|
||||
|
||||
@ -414,6 +414,10 @@ Readability.prototype = {
|
||||
* @return void
|
||||
*/
|
||||
_cleanClasses: function (node) {
|
||||
if (node.className.startsWith("_omnivore")) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (this.EMBEDS_CLASSES.includes(node.className) || this.hasEmbed(node)) {
|
||||
return;
|
||||
}
|
||||
@ -598,7 +602,7 @@ Readability.prototype = {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (node.parentNode && ["DIV", "SECTION"].includes(node.tagName) && !(node.id && node.id.startsWith("readability"))) {
|
||||
if (node.parentNode && ["DIV", "SECTION"].includes(node.tagName) && !this._isOmnivoreNode(node) && !(node.id && node.id.startsWith("readability"))) {
|
||||
if (this._isElementWithoutContent(node)) {
|
||||
node = this._removeAndGetNext(node);
|
||||
continue;
|
||||
@ -1136,6 +1140,11 @@ Readability.prototype = {
|
||||
while (node) {
|
||||
var matchString = node.className + " " + node.id;
|
||||
|
||||
if (this._isOmnivoreNode(node)) {
|
||||
node = this._getNextNode(node);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!this._isProbablyVisible(node)) {
|
||||
this.log("Removing hidden node - " + matchString);
|
||||
node = this._removeAndGetNext(node);
|
||||
@ -1337,7 +1346,7 @@ Readability.prototype = {
|
||||
var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate));
|
||||
candidate.readability.contentScore = candidateScore;
|
||||
|
||||
this.log("Candidate:", candidate, "with score " + candidateScore);
|
||||
this.log("Candidate:", candidate.nodeName, candidate.className, "with score " + candidateScore);
|
||||
|
||||
for (var t = 0; t < this._nbTopCandidates; t++) {
|
||||
var aTopCandidate = topCandidates[t];
|
||||
@ -1465,7 +1474,7 @@ Readability.prototype = {
|
||||
var sibling = siblings[s];
|
||||
var append = false;
|
||||
|
||||
this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : "");
|
||||
this.log("Looking at sibling node:", sibling.nodeName, sibling.className, sibling.readability ? ("with score " + sibling.readability.contentScore) : "");
|
||||
this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : "Unknown");
|
||||
|
||||
if (sibling === topCandidate) {
|
||||
@ -1496,7 +1505,7 @@ Readability.prototype = {
|
||||
}
|
||||
|
||||
if (append) {
|
||||
this.log("Appending node:", sibling);
|
||||
this.log("Appending node:", sibling.nodeName);
|
||||
|
||||
if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) {
|
||||
// We have a node that isn't a common block level element, like a form or td tag.
|
||||
@ -2556,12 +2565,12 @@ Readability.prototype = {
|
||||
if (imgHeight && imgWidhth && imgHeight === imgWidhth) {
|
||||
if (elem.tagName.toLowerCase() === 'svg') {
|
||||
if(imgHeight <= 21){
|
||||
this.log(`Removing small square SVG: ${imgWidhth}x${imgHeight}`, elem, `className: ${elem.className}`, `src: ${elem.src}`);
|
||||
this.log(`Removing small square SVG: ${imgWidhth}x${imgHeight}`, `className: ${elem.className}`, `src: ${elem.src}`);
|
||||
elem.parentNode.removeChild(elem);
|
||||
}
|
||||
return;
|
||||
} else if(imgHeight <= 80) {
|
||||
this.log(`Removing small square image: ${imgWidhth}x${imgHeight}`, elem, `className: ${elem.className}`, `src: ${elem.src}`);
|
||||
this.log(`Removing small square image: ${imgWidhth}x${imgHeight}`, `className: ${elem.className}`, `src: ${elem.src}`);
|
||||
elem.parentNode.removeChild(elem);
|
||||
return;
|
||||
}
|
||||
@ -2765,6 +2774,7 @@ Readability.prototype = {
|
||||
}
|
||||
|
||||
var haveToRemove =
|
||||
!this._isOmnivoreNode(node) && (
|
||||
(img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) ||
|
||||
(!isList && li > p) ||
|
||||
(input > Math.floor(p/3)) ||
|
||||
@ -2774,10 +2784,11 @@ Readability.prototype = {
|
||||
// Some websites, like https://substack.com, have their own custom styling of tweets,
|
||||
// so we exempt that particular case from this removal rule by checking for the "tweet" class name
|
||||
(weight >= 25 && linkDensity > 0.5 && !(node.className === "tweet" && linkDensity === 1)) ||
|
||||
((embedCount === 1 && contentLength < 75) || embedCount > 1);
|
||||
((embedCount === 1 && contentLength < 75) || embedCount > 1))
|
||||
|
||||
if (haveToRemove)
|
||||
this.log("Cleaning Conditionally", { node, className: node.className, children: Array.from(node.children).map(ch => ch.tagName) });
|
||||
if (haveToRemove) {
|
||||
this.log("Cleaning Conditionally", { className: node.className, children: Array.from(node.children).map(ch => ch.tagName) });
|
||||
}
|
||||
|
||||
return haveToRemove;
|
||||
}
|
||||
@ -2785,6 +2796,22 @@ Readability.prototype = {
|
||||
});
|
||||
},
|
||||
|
||||
_isOmnivoreNode: function(node) {
|
||||
const prefix = '_omnivore'
|
||||
var walk = node
|
||||
|
||||
while (walk) {
|
||||
if (walk.className && walk.className.startsWith && walk.className.startsWith(prefix)) {
|
||||
return true
|
||||
}
|
||||
if (walk.className && walk.className.hasOwnProperty && walk.className.hasOwnProperty(prefix)) {
|
||||
return true
|
||||
}
|
||||
walk = walk.parentElement
|
||||
}
|
||||
return false
|
||||
},
|
||||
|
||||
/**
|
||||
* Clean out elements that match the specified conditions
|
||||
*
|
||||
|
||||
@ -361,3 +361,90 @@ on smaller screens we display the note icon
|
||||
.article-inner-css .morning-brew-markets td:nth-child(4) {
|
||||
width: 20%;
|
||||
}
|
||||
|
||||
/* Container for a static tweet (class assigned by the Substack handler). */
._omnivore-static-tweet {
  background: #ffffff;
  display: flex;
  flex-direction: column;
  gap: 12px;
  max-width: 550px;
  margin: 32px auto;
  border: 1px solid #e0e0e0;
  direction: ltr;
  /* A non-zero length needs a unit; a bare `8` is invalid and gets dropped. */
  border-radius: 8px;
  padding: 16px;
  box-sizing: border-box;
  -webkit-font-smoothing: subpixel-antialiased;
}
|
||||
|
||||
/* Layout rules for the parts of a static tweet. */
._omnivore-static-tweet-header {
  display: flex;
  align-items: center;
  flex-direction: row;
  gap: 12px;
}

._omnivore-static-tweet-link-top {
  display: flex;
  flex-direction: column;
  gap: 12px;
  text-decoration: none;
}

._omnivore-static-tweet-header-avatar {
  -ms-interpolation-mode: bicubic;
  border: none !important;
  border-radius: 50%;
  float: left;
  height: 48px;
  width: 48px;
  margin: 0;
  margin-right: 12px;
  max-width: 100%;
  vertical-align: middle;
}

._omnivore-static-tweet-link-bottom {
  display: flex;
  flex-direction: column;
  text-decoration: none;
  white-space: pre-wrap;
}

._omnivore-static-tweet-footer {
  display: flex;
  flex-direction: column;
  align-items: flex-start;
}

._omnivore-static-tweet-footer hr {
  background: #e0e0e0;
  border: none;
  height: 1px;
  margin: 12px 0;
  padding: 0;
  width: 100%;
}

._omnivore-static-tweet-author-handle {
  display: block;
}

._omnivore-static-tweet-ufi {
  display: flex;
  gap: 24px;
  align-items: center;
}
|
||||
|
||||
/* Each selector in a list stands alone, so the parent must be repeated;
   a bare `.retweets` would match that class anywhere in the reader. */
._omnivore-static-tweet-ufi .likes,
._omnivore-static-tweet-ufi .retweets {
  display: flex;
  gap: 4px;
  text-decoration: none;
}
|
||||
Reference in New Issue
Block a user