From 726f69484fe67f27747ffe087fb06ac7f1a68cc2 Mon Sep 17 00:00:00 2001 From: Jackson Harper Date: Tue, 15 Nov 2022 18:43:55 +0800 Subject: [PATCH] Better content extraction of title and description for GitHub URLs --- .../src/websites/github-handler.ts | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/packages/content-handler/src/websites/github-handler.ts b/packages/content-handler/src/websites/github-handler.ts index c31fe2224..92da72bcd 100644 --- a/packages/content-handler/src/websites/github-handler.ts +++ b/packages/content-handler/src/websites/github-handler.ts @@ -12,10 +12,29 @@ export class GitHubHandler extends ContentHandler { async preParse(url: string, dom: Document): Promise { const body = dom.querySelector('body') + const head = dom.querySelector('head') const article = dom.querySelector('article') + const twitterTitle = dom.querySelector(`meta[name='twitter:title']`) + const linkAuthor = dom.querySelector(`span[itemprop='author']`) if (body && article) { - body?.replaceChildren(article) + body.replaceChildren(article) + + // Attempt to set the author also. This is available on repo homepages + // but not on things like PRs. Ideally we want PRs and issues to have + // author set to the author of the PR/issue. + if (linkAuthor && linkAuthor.textContent) { + const author = dom.createElement('span') + author.setAttribute('rel', 'author') + author.innerHTML = linkAuthor.textContent + article.appendChild(author) + } + } + + // Remove the GitHub - and repo org from the title + const twitterTitleContent = twitterTitle?.getAttribute('content') + if (twitterTitle && twitterTitleContent) { + twitterTitle.setAttribute('content', twitterTitleContent.replace(/GitHub - .*\//, '')) } return Promise.resolve(dom)