From 02280b576b9d01577ae40decd7e5bd121ed20dde Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Thu, 18 Jul 2024 10:33:06 +0800 Subject: [PATCH 1/3] fix: wechat official account articles published date not captured because its format is changed --- packages/content-handler/src/websites/weixin-qq-handler.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/content-handler/src/websites/weixin-qq-handler.ts b/packages/content-handler/src/websites/weixin-qq-handler.ts index 4ac93c4e3..454728655 100644 --- a/packages/content-handler/src/websites/weixin-qq-handler.ts +++ b/packages/content-handler/src/websites/weixin-qq-handler.ts @@ -15,7 +15,7 @@ export class WeixinQqHandler extends ContentHandler { // Retrieve the publish time const publishTime = dom.querySelector('#publish_time')?.textContent if (publishTime) { - const dateTimeFormat = 'yyyy-LL-dd HH:mm' + const dateTimeFormat = 'yyyy年LL月dd日 HH:mm' // published time is in UTC+8 const publishTimeISO = DateTime.fromFormat(publishTime, dateTimeFormat, { zone: 'Asia/Shanghai', From f2b3a66b72c17411fd96e31d52ae3ed5ac72c938 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Thu, 18 Jul 2024 11:18:51 +0800 Subject: [PATCH 2/3] remove metadata and cover image from content --- .../src/websites/weixin-qq-handler.ts | 19 ++++++++++++++----- packages/readabilityjs/Readability.js | 2 ++ 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/packages/content-handler/src/websites/weixin-qq-handler.ts b/packages/content-handler/src/websites/weixin-qq-handler.ts index 454728655..8dac19e51 100644 --- a/packages/content-handler/src/websites/weixin-qq-handler.ts +++ b/packages/content-handler/src/websites/weixin-qq-handler.ts @@ -27,16 +27,25 @@ export class WeixinQqHandler extends ContentHandler { metaNode.setAttribute('content', publishTimeISO) dom.querySelector('head')?.appendChild(metaNode) } - // This replace the class name of the article info to preserve the block - dom - .querySelector('.rich_media_meta_list') - ?.setAttribute('class', '_omnivore_rich_media_meta_list') - // This removes the title + const author = ( + dom.querySelector('#js_author_name') || dom.querySelector('#js_name') + )?.textContent?.trim() + if (author) { + const authorNode = dom.createElement('meta') + authorNode.setAttribute('name', 'author') + authorNode.setAttribute('content', author) + dom.querySelector('head')?.appendChild(authorNode) + } + + // This removes the title, metadata and cover image dom.querySelector('.rich_media_title')?.remove() + dom.querySelector('.rich_media_meta_list')?.remove() + dom.querySelector('#js_row_immersive_cover_img')?.remove() // This removes the profile info dom.querySelector('.profile_container')?.remove() + dom.querySelector('.profile_card_container')?.remove() // This removes the footer dom.querySelector('#content_bottom_area')?.remove() diff --git a/packages/readabilityjs/Readability.js b/packages/readabilityjs/Readability.js index 2d57ba5ce..c97260ed7 100644 --- a/packages/readabilityjs/Readability.js +++ b/packages/readabilityjs/Readability.js @@ -2004,6 +2004,8 @@ Readability.prototype = { metadata.byline = jsonld.byline || values["dc:creator"] || values["dcterm:creator"] || + values["og:article:author"] || + values["twitter:creator"] || values["author"]; // get description From d9b088cc01bbddb4d3a3d1c2e72b9cac882e5538 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Thu, 18 Jul 2024 12:22:21 +0800 Subject: [PATCH 3/3] fix tests --- packages/readabilityjs/Readability.js | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/readabilityjs/Readability.js b/packages/readabilityjs/Readability.js index c97260ed7..b42bc7c58 100644 --- a/packages/readabilityjs/Readability.js +++ b/packages/readabilityjs/Readability.js @@ -2005,7 +2005,6 @@ Readability.prototype = { values["dc:creator"] || values["dcterm:creator"] || values["og:article:author"] || - values["twitter:creator"] || values["author"]; // get description