diff --git a/packages/content-handler/src/websites/nitter-handler.ts b/packages/content-handler/src/websites/nitter-handler.ts index fbd0e1e7b..f5c5a8eab 100644 --- a/packages/content-handler/src/websites/nitter-handler.ts +++ b/packages/content-handler/src/websites/nitter-handler.ts @@ -1,10 +1,10 @@ +import { parseHTML } from 'linkedom' import _, { truncate } from 'lodash' import { DateTime } from 'luxon' import { Browser, BrowserContext } from 'puppeteer-core' import { ContentHandler, PreHandleResult } from '../content-handler' interface Tweet { - id: string url: string author: { username: string @@ -37,159 +37,154 @@ export class NitterHandler extends ContentHandler { async getTweets(browser: Browser, username: string, tweetId: string) { const url = `${this.ADDRESS}/${username}/status/${tweetId}` - async function genTweets(): Promise { + async function genTweets(address: string): Promise { + function authorParser(header: Element) { + const avatar = header + .querySelector('.tweet-avatar img') + ?.getAttribute('src') + if (!avatar) { + return null + } + const name = header.querySelector('.fullname')?.getAttribute('title') + if (!name) { + return null + } + return { + avatar, + name, + } + } + + function dateParser(tweetDate: string) { + const validDateTime = tweetDate.replace(' · ', ' ') + + return new Date(validDateTime).toISOString() + } + + function attachmentParser(attachments: Element | null) { + if (!attachments) return { photos: [], videos: [] } + + const photos = Array.from(attachments.querySelectorAll('img')).map( + (i) => i.getAttribute('src') ?? '' + ) + const videos = Array.from(attachments.querySelectorAll('source')).map( + (i) => i.getAttribute('src') ?? '' + ) + return { + photos, + videos, + } + } + + function parseTweet(tweet: Element): Tweet | null { + const header = tweet.querySelector('.tweet-header') + if (!header) { + return null + } + const author = authorParser(header) + if (!author) { + return null + } + + const body = tweet.querySelector('.tweet-body') + if (!body) { + return null + } + + const tweetDate = body + .querySelector('.tweet-date a') + ?.getAttribute('title') + if (!tweetDate) { + return null + } + const createdAt = dateParser(tweetDate) + + const content = body.querySelector('.tweet-content') + if (!content) { + return null + } + const text = content.textContent ?? '' + const urls = Array.from(content.querySelectorAll('a')).map((a) => ({ + url: a.getAttribute('href') ?? '', + display_url: a.textContent ?? '', + })) + + const attachments = body.querySelector('.attachments') + const { photos, videos } = attachmentParser(attachments) + + return { + author: { + username, + name: author.name, + profileImageUrl: author.avatar, + }, + createdAt, + text, + url, + entities: { + urls, + photos, + videos, + }, + } + } + let context: BrowserContext | undefined try { + const tweets: Tweet[] = [] + context = await browser.createIncognitoBrowserContext() const page = await context.newPage() - - // Modify this variable to control the size of viewport - const deviceScaleFactor = 0.2 - const height = Math.floor(2000 / deviceScaleFactor) - const width = Math.floor(1700 / deviceScaleFactor) - await page.setViewport({ width, height, deviceScaleFactor }) - await page.goto(url, { waitUntil: 'networkidle2', timeout: 30000, // 30 seconds }) - const tweets = (await page.evaluate( - async (username, id, url) => { - async function waitFor(ms: number) { - return new Promise((resolve) => setTimeout(resolve, ms)) + const html = await page.content() + const document = parseHTML(html).document + + // get the main thread including tweets and threads + const mainThread = document.querySelector('.main-thread') + if (!mainThread) { + return [] + } + const timelineItems = Array.from( + mainThread.querySelectorAll('.timeline-item') + ) + for (let i = 0; i < timelineItems.length; i++) { + const item = timelineItems[i] + if (item.classList.contains('more-replies')) { + const newUrl = item.querySelector('a')?.getAttribute('href') + if (!newUrl) { + break } - function authorParser(header: Element) { - const avatar = header - .querySelector('.tweet-avatar img') - ?.getAttribute('src') - if (!avatar) { - return null - } - const name = header - .querySelector('.fullname') - ?.getAttribute('title') - if (!name) { - return null - } - return { - avatar, - name, - } - } + // go to new url and wait for it to load + await page.goto(`${address}${newUrl}`, { + waitUntil: 'networkidle2', + timeout: 30000, // 30 seconds + }) - function dateParser(tweetDate: string) { - const validDateTime = tweetDate.replace(' · ', ' ') - - return new Date(validDateTime).toISOString() - } - - function attachmentParser(attachments: Element | null) { - if (!attachments) return { photos: [], videos: [] } - - const photos = Array.from( - attachments.querySelectorAll('img') - ).map((i) => i.getAttribute('src') ?? '') - const videos = Array.from( - attachments.querySelectorAll('source') - ).map((i) => i.getAttribute('src') ?? '') - return { - photos, - videos, - } - } - - function parseTweet(tweet: Element): Tweet | null { - const header = tweet.querySelector('.tweet-header') - if (!header) { - return null - } - const author = authorParser(header) - if (!author) { - return null - } - - const body = tweet.querySelector('.tweet-body') - if (!body) { - return null - } - - const tweetDate = body - .querySelector('.tweet-date a') - ?.getAttribute('title') - if (!tweetDate) { - return null - } - const createdAt = dateParser(tweetDate) - - const content = body.querySelector('.tweet-content') - if (!content) { - return null - } - const text = content.textContent ?? '' - const urls = Array.from(content.querySelectorAll('a')).map( - (a) => ({ - url: a.getAttribute('href') ?? '', - display_url: a.textContent ?? '', - }) - ) - - const attachments = body.querySelector('.attachments') - const { photos, videos } = attachmentParser(attachments) - - return { - id, - author: { - username, - name: author.name, - profileImageUrl: author.avatar, - }, - createdAt, - text, - url, - entities: { - urls, - photos, - videos, - }, - } - } - - const tweets: Tweet[] = [] - // get the main thread including tweets and threads - const mainThread = document.querySelector('.main-thread') - if (!mainThread) { - return [] - } - const timelineItems = Array.from( - mainThread.querySelectorAll('.timeline-item') + const document = parseHTML(await page.content()).document + const nextThread = document.querySelector( + '.main-thread .after-tweet' ) - for (let i = 0; i < timelineItems.length; i++) { - const item = timelineItems[i] - if (item.classList.contains('show-more')) { - // click the show more button - ;(item as HTMLAnchorElement).click() - await waitFor(2000) - - // get the new timeline items and add them to the list - const newTimelineItems = Array.from( - mainThread.querySelectorAll('.timeline-item') - ) - timelineItems.push(...newTimelineItems) - continue - } - - const tweet = parseTweet(item) - tweet && tweets.push(tweet) + if (!nextThread) { + break } - return tweets - }, - username, - tweetId, - url - )) as Tweet[] + // get the new timeline items and add them to the list + const newTimelineItems = Array.from( + nextThread.querySelectorAll('.timeline-item') + ) + + timelineItems.push(...newTimelineItems) + continue + } + + const tweet = parseTweet(item) + tweet && tweets.push(tweet) + } return tweets } catch (error) { @@ -203,7 +198,7 @@ export class NitterHandler extends ContentHandler { } } - return genTweets() + return genTweets(this.ADDRESS) } parseTweetUrl = (url: string) => {