Merge pull request #2813 from omnivore-app/fix/published-date
get published date from url and time elements
This commit is contained in:
@ -76,6 +76,22 @@ const extractPublishedDateFromAuthor = (author)=> {
|
||||
return [authorName, null];
|
||||
};
|
||||
|
||||
/**
 * Extract a published date from a URL that embeds it as yyyy/mm/dd or
 * yyyy-mm-dd (e.g. https://example.com/2023-09-26/story.html).
 *
 * Candidates are scanned left to right and the first one that names a real
 * calendar date is returned. Candidates that are not real dates (month 13,
 * February 31st, ...) are skipped instead of being silently rolled over by
 * the Date constructor into a different day.
 *
 * @param {string} url The URL to inspect; may be empty, null or undefined.
 * @returns {Date|null} Local midnight of the matched day, or null when the
 *   URL contains no valid date.
 */
const extractPublishedDateFromUrl = (url) => {
  if (!url) return null;

  // The leading (?:^|\D) and trailing (?!\d) keep us from matching a slice of
  // a longer digit run such as an article id. The regex is created per call,
  // so the stateful lastIndex of the /g flag never leaks between calls.
  const regex = /(?:^|\D)(\d{4})[\/-](\d{2})[\/-](\d{2})(?!\d)/g;
  const text = String(url);

  let match = regex.exec(text);
  while (match !== null) {
    const year = parseInt(match[1], 10);
    const month = parseInt(match[2], 10) - 1; // January is 0 in JavaScript Date
    const day = parseInt(match[3], 10);

    const date = new Date(year, month, day);
    // Date normalises out-of-range parts (and maps years 0-99 to 19xx), so a
    // candidate is a real date only if it survives the round trip unchanged.
    if (
      date.getFullYear() === year &&
      date.getMonth() === month &&
      date.getDate() === day
    ) {
      return date;
    }

    match = regex.exec(text);
  }

  return null;
};
|
||||
|
||||
/**
|
||||
* Public constructor.
|
||||
* @param {Document} doc The document to parse.
|
||||
@ -1081,6 +1097,18 @@ Readability.prototype = {
|
||||
}
|
||||
// we don't want to check for dates inside links, so skip anchor elements
|
||||
if (node.tagName.toLowerCase() === 'a') return
|
||||
// get the datetime from time element
|
||||
if (node.tagName.toLowerCase() === 'time') {
|
||||
const datetime = node.getAttribute('datetime')
|
||||
if (datetime) {
|
||||
const date = new Date(datetime)
|
||||
if (!isNaN(date)) {
|
||||
this._articlePublishedDate = date
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Searching for the real date in the text content
|
||||
const content = node.textContent.trim()
|
||||
let dateFound
|
||||
@ -3056,7 +3084,11 @@ Readability.prototype = {
|
||||
return null;
|
||||
|
||||
const byline = metadata.byline || this._articleByline;
|
||||
const [author, publishedAt] = extractPublishedDateFromAuthor(byline);
|
||||
const [author, publishedDateFromAuthor] = extractPublishedDateFromAuthor(byline);
|
||||
const publishedDate = metadata.publishedDate ||
|
||||
extractPublishedDateFromUrl(this._documentURI) ||
|
||||
publishedDateFromAuthor ||
|
||||
this._articlePublishedDate;
|
||||
|
||||
this._postProcessContent(articleContent);
|
||||
|
||||
@ -3092,7 +3124,7 @@ Readability.prototype = {
|
||||
siteName: metadata.siteName,
|
||||
siteIcon: metadata.siteIcon,
|
||||
previewImage: metadata.previewImage,
|
||||
publishedDate: metadata.publishedDate || publishedAt || this._articlePublishedDate,
|
||||
publishedDate,
|
||||
language: this._getLanguage(metadata.locale || this._languageCode),
|
||||
};
|
||||
}
|
||||
|
||||
@ -20,6 +20,12 @@
|
||||
<a href="./test-pages/electrek/distiller.html" target="iframe_b">[dom-distiller]</a>
|
||||
</li>
|
||||
|
||||
<li>caixin<br />
|
||||
<a href="./test-pages/caixin/source.html" target="iframe_b">[source]</a>
|
||||
<a href="./test-pages/caixin/expected.html" target="iframe_b">[readability]</a>
|
||||
<a href="./test-pages/caixin/distiller.html" target="iframe_b">[dom-distiller]</a>
|
||||
</li>
|
||||
|
||||
<li>news.utexas<br />
|
||||
<a href="./test-pages/news.utexas/source.html" target="iframe_b">[source]</a>
|
||||
<a href="./test-pages/news.utexas/expected.html" target="iframe_b">[readability]</a>
|
||||
|
||||
20
packages/readabilityjs/test/test-pages/caixin/distiller.html
Normal file
20
packages/readabilityjs/test/test-pages/caixin/distiller.html
Normal file
@ -0,0 +1,20 @@
|
||||
<div><p>
|
||||
<b>【财新网】</b>9月26日,汽车服务平台<a href="https://s.ccxe.com.cn/entities/companies/202035144" target="_blank">途虎养车</a>正式在港交所主板挂牌上市。途虎养车( <a href="09690.HKM">09690.HK</a> )上市发行价为28港元/股,此前公司披露的发行价区间为28港元/股至31港元/股,即实际发行价为区间下限。当日,途虎养车收报29.5港元/股,较发行价涨5.36%,市值为239.6亿港元。
|
||||
</p><p>
|
||||
途虎养车上市不易。途虎养车2022年1月即在港交所递表,2022年8月、2023年3月两次重新递交上市申请材料,终于在2023年8月23日通过聆讯。
|
||||
</p><img src="https://file.caixin.com/images/m/lineContent.png"/><div>
|
||||
|
||||
|
||||
</div><div>
|
||||
后获取已订阅的阅读权限
|
||||
</div><div>
|
||||
财新通会员<br/>
|
||||
可畅读全文
|
||||
</div><div>
|
||||
</div><img src="https://showimg.caixin.com/dolphinfile/caixin/2023/06/15688_1_16865563181267.jpg"/><img src="https://file.caixin.com/static/mh5/images/conponclose.png"/><p>
|
||||
推荐进入<a href="https://cxdata.caixin.com/index" target="_blank">财新数据库</a>,可随时查阅公司股价走势、结构人员变化等投资信息。
|
||||
</p><div>
|
||||
责任编辑:屈运栩 | 版面编辑:刘潇(ZN028)
|
||||
</div><dt>
|
||||
话题:
|
||||
</dt></div>
|
||||
@ -0,0 +1,11 @@
|
||||
{
|
||||
"title": "途虎养车港交所挂牌 腾讯为最大外部股东",
|
||||
"byline": "文|财新 余聪",
|
||||
"dir": null,
|
||||
"excerpt": "途虎养车 腾讯国内汽车服务市场高度分散,2022年,途虎养车取得汽车服务收入115亿元,市场份额0.9%",
|
||||
"siteName": "fakehost",
|
||||
"previewImage": "https://img.caixin.com/2023-09-26/169572084568190_560_373.jpg",
|
||||
"publishedDate": "2023-09-25T16:00:00.000Z",
|
||||
"language": "English",
|
||||
"readerable": true
|
||||
}
|
||||
45
packages/readabilityjs/test/test-pages/caixin/expected.html
Normal file
45
packages/readabilityjs/test/test-pages/caixin/expected.html
Normal file
@ -0,0 +1,45 @@
|
||||
<DIV class="page" id="readability-page-1">
|
||||
<div id="the_content">
|
||||
<div id="conTit">
|
||||
<h2> 途虎养车港交所挂牌 腾讯为最大外部股东 <img src="https://file.caixin.com/webchannel/all/img/icon_key.png">
|
||||
</h2>
|
||||
<!--baidu begin-->
|
||||
<!--baidu end-->
|
||||
<div id="artInfo">
|
||||
<!-- tt.s 12/17 -->
|
||||
<p> 文|财新 余聪 </p><!-- tt.e 12/17 -->
|
||||
<p> 2023年09月26日 17:22 </p>
|
||||
<!-- tt.s 12/17 来源于
|
||||
<a href=http://www.caixin.com target=_blank>财新网</a>
|
||||
tt.e 12/17-->
|
||||
<!-- 新版音频播放器代码 begin -->
|
||||
<!-- 新版音频播放器代码 end -->
|
||||
<img id="swit" height="26" src="http://file.caixin.com/images/content/PC.jpg"> 试听
|
||||
</div>
|
||||
<p> 国内汽车服务市场高度分散,2022年,途虎养车取得汽车服务收入115亿元,市场份额0.9% </p>
|
||||
</div>
|
||||
<div id="Main_Content_Val">
|
||||
<p> <b>【财新网】</b>9月26日,汽车服务平台<a href="https://s.ccxe.com.cn/entities/companies/202035144" target="_blank">途虎养车</a>正式在港交所主板挂牌上市。途虎养车( <a onclick="return false" href="http://fakehost/test/09690.HKM">09690.HK</a> )上市发行价为28港元/股,此前公司披露的发行价区间为28港元/股至31港元/股,即实际发行价为区间下限。当日,途虎养车收报29.5港元/股,较发行价涨5.36%,市值为239.6亿港元。 </p>
|
||||
<p> 途虎养车上市不易。途虎养车2022年1月即在港交所递表,2022年8月、2023年3月两次重新递交上市申请材料,终于在2023年8月23日通过聆讯。 </p>
|
||||
</div>
|
||||
<!--杂志购买 begin-->
|
||||
<!--全站公用文章页收费框碎片-->
|
||||
<div id="chargeWall">
|
||||
<p><img src="https://file.caixin.com/images/m/lineContent.png"></p>
|
||||
<p>登录 后获取已订阅的阅读权限 </p>
|
||||
<!---->
|
||||
<!---->
|
||||
<!---->
|
||||
<!---->
|
||||
<!---->
|
||||
<!---->
|
||||
</div>
|
||||
<div id="pay-box">
|
||||
<p><img src="https://file.caixin.com/static/mh5/images/conponclose.png"></p>
|
||||
</div>
|
||||
<!--<script src="//file.caixin.com/pkg/cx-pay-layer/js/chunk-vendors.js"></script>-->
|
||||
<!--杂志购买 end-->
|
||||
<p> 推荐进入<a href="https://cxdata.caixin.com/index" target="_blank">财新数据库</a>,可随时查阅公司股价走势、结构人员变化等投资信息。 </p>
|
||||
<p> 责任编辑:屈运栩 | 版面编辑:刘潇(ZN028) </p>
|
||||
</div>
|
||||
</DIV>
|
||||
2275
packages/readabilityjs/test/test-pages/caixin/source.html
Normal file
2275
packages/readabilityjs/test/test-pages/caixin/source.html
Normal file
File diff suppressed because one or more lines are too long
1
packages/readabilityjs/test/test-pages/caixin/url.txt
Normal file
1
packages/readabilityjs/test/test-pages/caixin/url.txt
Normal file
@ -0,0 +1 @@
|
||||
https://www.caixin.com/2023-09-26/102112537.html
|
||||
@ -4,7 +4,7 @@
|
||||
"dir": null,
|
||||
"excerpt": "The Sept. 27, 2022 episode of “The Ezra Klein Show”",
|
||||
"siteName": "fakehost",
|
||||
"siteIcon": "/vi-assets/static-assets/favicon-d2483f10ef688e6f89e23806b9700298.ico",
|
||||
"siteIcon": "http://fakehost/vi-assets/static-assets/favicon-d2483f10ef688e6f89e23806b9700298.ico",
|
||||
"previewImage": "https://static01.nyt.com/newsgraphics/images/icons/defaultPromoCrop.png",
|
||||
"publishedDate": "2022-09-27T16:25:17.221Z",
|
||||
"language": "English",
|
||||
|
||||
@ -5,8 +5,6 @@
|
||||
<article id="story">
|
||||
<header>
|
||||
<p> The Ezra Klein Show </p>
|
||||
<p><time datetime="2022-09-27T12:25:17-04:00">Sept. 27, 2022</time>
|
||||
</p>
|
||||
</header>
|
||||
<section name="articleBody">
|
||||
<div>
|
||||
|
||||
@ -30,8 +30,6 @@
|
||||
</figcaption>
|
||||
</figure>
|
||||
</div>
|
||||
<p><time datetime="2022-10-28T05:00:25-04:00"><span>Oct. 28, 2022</span></time>
|
||||
</p>
|
||||
</header>
|
||||
<section name="articleBody">
|
||||
<div>
|
||||
|
||||
Reference in New Issue
Block a user