Better content extraction of title and description for GitHub URLs

This commit is contained in:
Jackson Harper
2022-11-15 18:43:55 +08:00
parent 2c994e941b
commit 726f69484f

View File

@ -12,10 +12,29 @@ export class GitHubHandler extends ContentHandler {
async preParse(url: string, dom: Document): Promise<Document> {
const body = dom.querySelector('body')
const head = dom.querySelector('head')
const article = dom.querySelector('article')
const twitterTitle = dom.querySelector(`meta[name='twitter:title']`)
const linkAuthor = dom.querySelector(`span[itemprop='author']`)
if (body && article) {
body?.replaceChildren(article)
body.replaceChildren(article)
// Attempt to set the author also. This is available on repo homepages
// but not on things like PRs. Ideally we want PRs and issues to have
// author set to the author of the PR/issue.
if (linkAuthor && linkAuthor.textContent) {
const author = dom.createElement('span')
author.setAttribute('rel', 'author')
author.innerHTML = linkAuthor.textContent
article.appendChild(author)
}
}
// Strip the "GitHub - <org>/" prefix from the twitter:title meta content.
// The pattern is greedy, so it removes everything through the last "/".
const twitterTitleContent = twitterTitle?.getAttribute('content')
if (twitterTitle && twitterTitleContent) {
twitterTitle.setAttribute('content', twitterTitleContent.replace(/GitHub - .*\//, ''))
}
return Promise.resolve(dom)