Merge pull request #1373 from omnivore-app/fix/substack-embedded-tweets

Handle embedded tweets in substack emails
This commit is contained in:
Jackson Harper
2022-11-01 15:04:03 +08:00
committed by GitHub
7 changed files with 3485 additions and 15 deletions

File diff suppressed because one or more lines are too long

View File

@ -38,11 +38,17 @@ describe('parseMetadata', async () => {
describe('parsePreparedContent', async () => {
it('gets published date when JSONLD fails to load', async () => {
nock('https://stratechery.com:443', {"encodedQueryParams":true})
.get('/wp-json/oembed/1.0/embed')
.query({"url":"https%3A%2F%2Fstratechery.com%2F2016%2Fits-a-tesla%2F"})
.reply(401)
const html = load('./test/utils/data/stratechery-blog-post.html')
const result = await parsePreparedContent('https://blog.omnivore.app/', {
document: html,
pageInfo: {},
})
expect(result.parsedContent?.publishedDate?.getTime()).to.equal(
new Date('2016-04-05T15:27:51+00:00').getTime()
)

View File

@ -9,14 +9,15 @@ export class SubstackHandler extends ContentHandler {
/**
 * Decide whether this handler should pre-parse the given page.
 *
 * The page qualifies when the URL's hostname ends with `this.name + '.com'`,
 * or when the first image found inside `.email-body` has a `src` that
 * mentions either that host or the `substackcdn.com` CDN host.
 *
 * @param url - absolute URL of the page; it is passed to `new URL`, which
 *   throws on a malformed value
 * @param dom - parsed document to inspect
 * @returns true when the page should be pre-parsed by this handler
 */
shouldPreParse(url: string, dom: Document): boolean {
  const host = this.name + '.com'
  const cdnHost = 'substackcdn.com'

  // check if the url's hostname ends with the handler's host
  if (new URL(url).hostname.endsWith(host)) {
    return true
  }

  // or has a profile image hosted at that host or at substackcdn.com
  const imageSrc = dom.querySelector('.email-body img')?.getAttribute('src')
  if (!imageSrc) {
    return false
  }

  // Each host must be tested on its own: `includes(host || cdnHost)` only ever
  // searches for `host`, because a non-empty string short-circuits `||`.
  return imageSrc.includes(host) || imageSrc.includes(cdnHost)
}
@ -34,6 +35,8 @@ export class SubstackHandler extends ContentHandler {
body?.querySelector('.container-border')?.remove()
body?.querySelector('.footer')?.remove()
dom = this.fixupStaticTweets(dom)
return Promise.resolve(dom)
}
@ -65,13 +68,20 @@ export class SubstackHandler extends ContentHandler {
}
// If the article has a header link and Substack icons, it's probably a newsletter
const href = this.findNewsletterHeaderHref(dom)
const heartIcon = dom.querySelector(
const oldHeartIcon = dom.querySelector(
'table tbody td span a img[src*="HeartIcon"]'
)
const recommendIcon = dom.querySelector(
const oldRecommendIcon = dom.querySelector(
'table tbody td span a img[src*="RecommendIconRounded"]'
)
return Promise.resolve(!!(href && (heartIcon || recommendIcon)))
const heartIcon = dom.querySelector('a img[src*="LucideHeart"]')
const commentsIcon = dom.querySelector('a img[src*="LucideComments"]')
return Promise.resolve(
!!(
href &&
(oldHeartIcon || oldRecommendIcon || heartIcon || commentsIcon)
)
)
}
async parseNewsletterUrl(
@ -85,4 +95,38 @@ export class SubstackHandler extends ContentHandler {
}
return this.findNewsletterUrl(html)
}
/**
 * Rewrite statically rendered tweets so the reader can style them.
 *
 * Every `div` whose class attribute is exactly `tweet static` gets the class
 * `_omnivore-static-tweet` and loses its inline `style`. Every descendant
 * element also loses its inline `style`, and any descendant whose class
 * string starts with `tweet-` gets the `_omnivore-static-` prefix prepended.
 *
 * @param dom - document to modify in place
 * @returns the same document instance that was passed in
 */
fixupStaticTweets(dom: Document): Document {
  const preClassName = '_omnivore-static-'
  const staticTweets = Array.from(
    dom.querySelectorAll('div[class="tweet static"]')
  )

  for (const tweet of staticTweets) {
    tweet.className = preClassName + 'tweet'
    tweet.removeAttribute('style')

    // get all descendants, rename their class, remove style
    // attributes (style will be handled in the reader)
    for (const node of Array.from(tweet.querySelectorAll('*'))) {
      // className is typed as string, but SVG elements expose an object
      // (SVGAnimatedString) instead, so guard before calling string methods.
      const className: unknown = node.className
      if (typeof className === 'string' && className.startsWith('tweet-')) {
        node.className = preClassName + className
      }
      node.removeAttribute('style')
    }
  }

  return dom
}
}

File diff suppressed because it is too large Load Diff

View File

@ -16,6 +16,7 @@ import { ConvertkitHandler } from '../src/newsletters/convertkit-handler'
import { GhostHandler } from '../src/newsletters/ghost-handler'
import { CooperPressHandler } from '../src/newsletters/cooper-press-handler'
import { getNewsletterHandler } from '../src'
import { parseHTML } from 'linkedom'
chai.use(chaiAsPromised)
chai.use(chaiString)
@ -129,6 +130,41 @@ describe('Newsletter email test', () => {
expect(handler).to.be.undefined
})
it('returns SubstackHandler for substack newsletter with static tweets', async () => {
  // Only the HTML body is supplied; every header field is left blank, so the
  // handler has to be picked from the markup alone.
  const fixturePath = './test/data/substack-with-static-tweets-newsletter.html'
  const detected = await getNewsletterHandler({
    html: load(fixturePath),
    postHeader: '',
    from: '',
    unSubHeader: '',
  })

  expect(detected).to.be.instanceOf(SubstackHandler)
})
it('fixes up static tweets in Substack newsletters', async () => {
  const articleUrl =
    'https://astralcodexten.substack.com/p/nick-cammarata-on-jhana'
  const emailHtml = load(
    './test/data/substack-with-static-tweets-newsletter.html'
  )

  // Handler detection works from the HTML alone; header fields stay blank.
  const substack = await getNewsletterHandler({
    html: emailHtml,
    postHeader: '',
    from: '',
    unSubHeader: '',
  })
  expect(substack).to.be.instanceOf(SubstackHandler)

  const doc = parseHTML(emailHtml).document
  expect(substack?.shouldPreParse(articleUrl, doc)).to.be.true

  // After pre-parsing, each static tweet should carry the rewritten class.
  const cleaned = await substack?.preParse(articleUrl, doc)
  const rewrittenTweets = cleaned?.querySelectorAll(
    'div[class="_omnivore-static-tweet"]'
  )
  const tweetCount = rewrittenTweets ? Array.from(rewrittenTweets).length : 0
  expect(tweetCount).to.eq(7)
})
it('returns BeehiivHandler for beehiiv.com newsletter', async () => {
const html = load('./test/data/beehiiv-newsletter.html')
const handler = await getNewsletterHandler({

View File

@ -414,6 +414,10 @@ Readability.prototype = {
* @return void
*/
_cleanClasses: function (node) {
if (node.className.startsWith("_omnivore")) {
return;
}
if (this.EMBEDS_CLASSES.includes(node.className) || this.hasEmbed(node)) {
return;
}
@ -598,7 +602,7 @@ Readability.prototype = {
continue;
}
if (node.parentNode && ["DIV", "SECTION"].includes(node.tagName) && !(node.id && node.id.startsWith("readability"))) {
if (node.parentNode && ["DIV", "SECTION"].includes(node.tagName) && !this._isOmnivoreNode(node) && !(node.id && node.id.startsWith("readability"))) {
if (this._isElementWithoutContent(node)) {
node = this._removeAndGetNext(node);
continue;
@ -1136,6 +1140,11 @@ Readability.prototype = {
while (node) {
var matchString = node.className + " " + node.id;
if (this._isOmnivoreNode(node)) {
node = this._getNextNode(node);
continue;
}
if (!this._isProbablyVisible(node)) {
this.log("Removing hidden node - " + matchString);
node = this._removeAndGetNext(node);
@ -1337,7 +1346,7 @@ Readability.prototype = {
var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate));
candidate.readability.contentScore = candidateScore;
this.log("Candidate:", candidate, "with score " + candidateScore);
this.log("Candidate:", candidate.nodeName, candidate.className, "with score " + candidateScore);
for (var t = 0; t < this._nbTopCandidates; t++) {
var aTopCandidate = topCandidates[t];
@ -1465,7 +1474,7 @@ Readability.prototype = {
var sibling = siblings[s];
var append = false;
this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : "");
this.log("Looking at sibling node:", sibling.nodeName, sibling.className, sibling.readability ? ("with score " + sibling.readability.contentScore) : "");
this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : "Unknown");
if (sibling === topCandidate) {
@ -1496,7 +1505,7 @@ Readability.prototype = {
}
if (append) {
this.log("Appending node:", sibling);
this.log("Appending node:", sibling.nodeName);
if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) {
// We have a node that isn't a common block level element, like a form or td tag.
@ -2556,12 +2565,12 @@ Readability.prototype = {
if (imgHeight && imgWidhth && imgHeight === imgWidhth) {
if (elem.tagName.toLowerCase() === 'svg') {
if(imgHeight <= 21){
this.log(`Removing small square SVG: ${imgWidhth}x${imgHeight}`, elem, `className: ${elem.className}`, `src: ${elem.src}`);
this.log(`Removing small square SVG: ${imgWidhth}x${imgHeight}`, `className: ${elem.className}`, `src: ${elem.src}`);
elem.parentNode.removeChild(elem);
}
return;
} else if(imgHeight <= 80) {
this.log(`Removing small square image: ${imgWidhth}x${imgHeight}`, elem, `className: ${elem.className}`, `src: ${elem.src}`);
this.log(`Removing small square image: ${imgWidhth}x${imgHeight}`, `className: ${elem.className}`, `src: ${elem.src}`);
elem.parentNode.removeChild(elem);
return;
}
@ -2765,6 +2774,7 @@ Readability.prototype = {
}
var haveToRemove =
!this._isOmnivoreNode(node) && (
(img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) ||
(!isList && li > p) ||
(input > Math.floor(p/3)) ||
@ -2774,10 +2784,11 @@ Readability.prototype = {
// some website like https://substack.com might have their custom styling of tweets
// we should omit ignoring their particular case by checking against "tweet" classname
(weight >= 25 && linkDensity > 0.5 && !(node.className === "tweet" && linkDensity === 1)) ||
((embedCount === 1 && contentLength < 75) || embedCount > 1);
((embedCount === 1 && contentLength < 75) || embedCount > 1))
if (haveToRemove)
this.log("Cleaning Conditionally", { node, className: node.className, children: Array.from(node.children).map(ch => ch.tagName) });
if (haveToRemove) {
this.log("Cleaning Conditionally", { className: node.className, children: Array.from(node.children).map(ch => ch.tagName) });
}
return haveToRemove;
}
@ -2785,6 +2796,22 @@ Readability.prototype = {
});
},
_isOmnivoreNode: function(node) {
const prefix = '_omnivore'
var walk = node
while (walk) {
if (walk.className && walk.className.startsWith && walk.className.startsWith(prefix)) {
return true
}
if (walk.className && walk.className.hasOwnProperty && walk.className.hasOwnProperty(prefix)) {
return true
}
walk = walk.parentElement
}
return false
},
/**
* Clean out elements that match the specified conditions
*

View File

@ -361,3 +361,90 @@ on smaller screens we display the note icon
.article-inner-css .morning-brew-markets td:nth-child(4) {
width: 20%;
}
/*
 * Static tweet cards: rules for elements whose class names carry the
 * `_omnivore-static-tweet` prefix.
 */
._omnivore-static-tweet {
  background: #ffffff;
  display: flex;
  flex-direction: column;
  gap: 12px;
  max-width: 550px;
  margin: 32px auto;
  border: 1px solid #e0e0e0;
  direction: ltr;
  /* A non-zero length needs a unit; a bare `8` is invalid and gets dropped. */
  border-radius: 8px;
  padding: 16px;
  box-sizing: border-box;
  -webkit-font-smoothing: subpixel-antialiased;
}

/* Header row: avatar next to the author details. */
._omnivore-static-tweet-header {
  display: flex;
  align-items: center;
  flex-direction: row;
  gap: 12px;
}

._omnivore-static-tweet-link-top {
  display: flex;
  flex-direction: column;
  gap: 12px;
  text-decoration: none;
}

/* Round 48x48 avatar. */
._omnivore-static-tweet-header-avatar {
  -ms-interpolation-mode: bicubic;
  border: none !important;
  border-radius: 50%;
  float: left;
  height: 48px;
  width: 48px;
  margin: 0;
  margin-right: 12px;
  max-width: 100%;
  vertical-align: middle;
}

._omnivore-static-tweet-link-bottom {
  display: flex;
  flex-direction: column;
  text-decoration: none;
  white-space: pre-wrap;
}

._omnivore-static-tweet-footer {
  display: flex;
  flex-direction: column;
  align-items: flex-start;
}

/* Thin divider line inside the footer. */
._omnivore-static-tweet-footer hr {
  background: #e0e0e0;
  border: none;
  height: 1px;
  margin: 12px 0;
  padding: 0;
  width: 100%;
}

._omnivore-static-tweet-author-handle {
  display: block;
}

/* Row holding the like and retweet links. */
._omnivore-static-tweet-ufi {
  display: flex;
  gap: 24px;
  align-items: center;
}

/* Both selectors are scoped to the tweet row so `.retweets` cannot match
   unrelated elements elsewhere in an article. */
._omnivore-static-tweet-ufi .likes,
._omnivore-static-tweet-ufi .retweets {
  display: flex;
  gap: 4px;
  text-decoration: none;
}