Fix code blocks not formatted correctly in articles from WeChat official accounts

This commit is contained in:
Hongbo Wu
2024-01-25 16:17:33 +08:00
parent 04fd8d2e5d
commit fd7c2ffb49
4 changed files with 14574 additions and 18 deletions

View File

@ -307,23 +307,28 @@ export const parsePreparedContent = async (
// Format code blocks
// TODO: we probably want to move this type of thing
// to the handlers, and have some concept of postHandle
const codeBlocks = newDocumentElement.querySelectorAll<HTMLElement>(
'pre[class^="prism-"], pre[class^="language-"], pre[class^="code-snippet"], code'
const codeBlocks = newDocumentElement.querySelectorAll(
'pre[class^="prism-"], pre[class^="language-"], code'
)
if (codeBlocks.length > 0) {
codeBlocks.forEach((e) => {
const att = hljs.highlightAuto(e.innerText)
const code = document.createElement('code')
const langClass =
`hljs language-${att.language}` +
(att.second_best?.language
? ` language-${att.second_best?.language}`
: '')
code.setAttribute('class', langClass)
code.innerHTML = att.value
e.replaceWith(code)
})
}
codeBlocks.forEach((e) => {
if (!e.textContent) {
return e.parentNode?.removeChild(e)
}
// replace <br> or <p> or </p> with \n
e.innerHTML = e.innerHTML.replace(/<(br|p|\/p)>/g, '\n')
const att = hljs.highlightAuto(e.textContent)
const code = document.createElement('code')
const langClass =
`hljs language-${att.language}` +
(att.second_best?.language
? ` language-${att.second_best?.language}`
: '')
code.setAttribute('class', langClass)
code.innerHTML = att.value
e.replaceWith(code)
})
highlightData = findEmbeddedHighlight(newDocumentElement)

View File

@ -206,7 +206,7 @@ Readability.prototype = {
unlikelyCandidates: /\bad\b|ai2html|banner|breadcrumbs|breadcrumb|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager(?!ow)|popup|yom-remote|copyright|keywords|outline|infinite-list|beta|recirculation|site-index|hide-for-print|post-end-share-cta|post-end-cta-full|post-footer|post-head|post-tag|li-date|main-navigation|programtic-ads|outstream_article|hfeed|comment-holder|back-to-top|show-up-next|onward-journey|topic-tracker|list-nav|block-ad-entity|adSpecs|gift-article-button|modal-title|in-story-masthead|share-tools|standard-dock|expanded-dock|margins-h|subscribe-dialog|icon|bumped|dvz-social-media-buttons|post-toc|mobile-menu|mobile-navbar|tl_article_header|mvp(-post)*-(add-story|soc(-mob)*-wrap)|w-condition-invisible|rich-text-block main w-richtext|rich-text-block_ataglance at-a-glance test w-richtext|PostsPage-commentsSection|hide-text/i,
// okMaybeItsACandidate: /and|article(?!-breadcrumb)|body|column|content|main|shadow|post-header/i,
get okMaybeItsACandidate() {
return new RegExp(`and|(?<!${this.articleNegativeLookAheadCandidates.source})article(?!-(${this.articleNegativeLookBehindCandidates.source}))|body|column|content|^(?!main-navigation|main-header)main|shadow|post-header|hfeed site|blog-posts hfeed|container-banners|menu-opacity|header-with-anchor-widget|commentOnSelection`, 'i')
return new RegExp(`and|(?<!${this.articleNegativeLookAheadCandidates.source})article(?!-(${this.articleNegativeLookBehindCandidates.source}))|body|column|content|^(?!main-navigation|main-header)main|shadow|post-header|hfeed site|blog-posts hfeed|container-banners|menu-opacity|header-with-anchor-widget|commentOnSelection|highlight--with-header`, 'i')
},
positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story|tweet(-\w+)?|instagram|image|container-banners|player|commentOnSelection/i,
@ -261,7 +261,7 @@ Readability.prototype = {
"SUP", "TEXTAREA", "TIME", "VAR", "WBR"
],
// These are the classes that readability sets itself.
// These are the classes that we want to keep.
CLASSES_TO_PRESERVE: [
"page", "twitter-tweet", "tweet-placeholder", "instagram-placeholder", "morning-brew-markets", "prism-code"
],
@ -3082,6 +3082,7 @@ Readability.prototype = {
this._removeScripts(this._doc);
this._prepDocument();
console.log(this._doc.body.innerHTML);
var metadata = this._getArticleMetadata(jsonLd);
this._articleTitle = metadata.title;

File diff suppressed because it is too large Load Diff