go to the next thread page to scrape long thread

This commit is contained in:
Hongbo Wu
2023-06-20 19:17:34 +08:00
parent 89f89875d6
commit a109d0383f

View File

@ -1,10 +1,10 @@
import { parseHTML } from 'linkedom'
import _, { truncate } from 'lodash'
import { DateTime } from 'luxon'
import { Browser, BrowserContext } from 'puppeteer-core'
import { ContentHandler, PreHandleResult } from '../content-handler'
interface Tweet {
id: string
url: string
author: {
username: string
@ -37,159 +37,154 @@ export class NitterHandler extends ContentHandler {
async getTweets(browser: Browser, username: string, tweetId: string) {
const url = `${this.ADDRESS}/${username}/status/${tweetId}`
async function genTweets(): Promise<Tweet[]> {
async function genTweets(address: string): Promise<Tweet[]> {
function authorParser(header: Element) {
const avatar = header
.querySelector('.tweet-avatar img')
?.getAttribute('src')
if (!avatar) {
return null
}
const name = header.querySelector('.fullname')?.getAttribute('title')
if (!name) {
return null
}
return {
avatar,
name,
}
}
function dateParser(tweetDate: string) {
const validDateTime = tweetDate.replace(' · ', ' ')
return new Date(validDateTime).toISOString()
}
function attachmentParser(attachments: Element | null) {
if (!attachments) return { photos: [], videos: [] }
const photos = Array.from(attachments.querySelectorAll('img')).map(
(i) => i.getAttribute('src') ?? ''
)
const videos = Array.from(attachments.querySelectorAll('source')).map(
(i) => i.getAttribute('src') ?? ''
)
return {
photos,
videos,
}
}
function parseTweet(tweet: Element): Tweet | null {
const header = tweet.querySelector('.tweet-header')
if (!header) {
return null
}
const author = authorParser(header)
if (!author) {
return null
}
const body = tweet.querySelector('.tweet-body')
if (!body) {
return null
}
const tweetDate = body
.querySelector('.tweet-date a')
?.getAttribute('title')
if (!tweetDate) {
return null
}
const createdAt = dateParser(tweetDate)
const content = body.querySelector('.tweet-content')
if (!content) {
return null
}
const text = content.textContent ?? ''
const urls = Array.from(content.querySelectorAll('a')).map((a) => ({
url: a.getAttribute('href') ?? '',
display_url: a.textContent ?? '',
}))
const attachments = body.querySelector('.attachments')
const { photos, videos } = attachmentParser(attachments)
return {
author: {
username,
name: author.name,
profileImageUrl: author.avatar,
},
createdAt,
text,
url,
entities: {
urls,
photos,
videos,
},
}
}
let context: BrowserContext | undefined
try {
const tweets: Tweet[] = []
context = await browser.createIncognitoBrowserContext()
const page = await context.newPage()
// Modify this variable to control the size of viewport
const deviceScaleFactor = 0.2
const height = Math.floor(2000 / deviceScaleFactor)
const width = Math.floor(1700 / deviceScaleFactor)
await page.setViewport({ width, height, deviceScaleFactor })
await page.goto(url, {
waitUntil: 'networkidle2',
timeout: 30000, // 30 seconds
})
const tweets = (await page.evaluate(
async (username, id, url) => {
async function waitFor(ms: number) {
return new Promise((resolve) => setTimeout(resolve, ms))
const html = await page.content()
const document = parseHTML(html).document
// get the main thread including tweets and threads
const mainThread = document.querySelector('.main-thread')
if (!mainThread) {
return []
}
const timelineItems = Array.from(
mainThread.querySelectorAll('.timeline-item')
)
for (let i = 0; i < timelineItems.length; i++) {
const item = timelineItems[i]
if (item.classList.contains('more-replies')) {
const newUrl = item.querySelector('a')?.getAttribute('href')
if (!newUrl) {
break
}
function authorParser(header: Element) {
const avatar = header
.querySelector('.tweet-avatar img')
?.getAttribute('src')
if (!avatar) {
return null
}
const name = header
.querySelector('.fullname')
?.getAttribute('title')
if (!name) {
return null
}
return {
avatar,
name,
}
}
// go to new url and wait for it to load
await page.goto(`${address}${newUrl}`, {
waitUntil: 'networkidle2',
timeout: 30000, // 30 seconds
})
function dateParser(tweetDate: string) {
const validDateTime = tweetDate.replace(' · ', ' ')
return new Date(validDateTime).toISOString()
}
function attachmentParser(attachments: Element | null) {
if (!attachments) return { photos: [], videos: [] }
const photos = Array.from(
attachments.querySelectorAll('img')
).map((i) => i.getAttribute('src') ?? '')
const videos = Array.from(
attachments.querySelectorAll('source')
).map((i) => i.getAttribute('src') ?? '')
return {
photos,
videos,
}
}
function parseTweet(tweet: Element): Tweet | null {
const header = tweet.querySelector('.tweet-header')
if (!header) {
return null
}
const author = authorParser(header)
if (!author) {
return null
}
const body = tweet.querySelector('.tweet-body')
if (!body) {
return null
}
const tweetDate = body
.querySelector('.tweet-date a')
?.getAttribute('title')
if (!tweetDate) {
return null
}
const createdAt = dateParser(tweetDate)
const content = body.querySelector('.tweet-content')
if (!content) {
return null
}
const text = content.textContent ?? ''
const urls = Array.from(content.querySelectorAll('a')).map(
(a) => ({
url: a.getAttribute('href') ?? '',
display_url: a.textContent ?? '',
})
)
const attachments = body.querySelector('.attachments')
const { photos, videos } = attachmentParser(attachments)
return {
id,
author: {
username,
name: author.name,
profileImageUrl: author.avatar,
},
createdAt,
text,
url,
entities: {
urls,
photos,
videos,
},
}
}
const tweets: Tweet[] = []
// get the main thread including tweets and threads
const mainThread = document.querySelector('.main-thread')
if (!mainThread) {
return []
}
const timelineItems = Array.from(
mainThread.querySelectorAll('.timeline-item')
const document = parseHTML(await page.content()).document
const nextThread = document.querySelector(
'.main-thread .after-tweet'
)
for (let i = 0; i < timelineItems.length; i++) {
const item = timelineItems[i]
if (item.classList.contains('show-more')) {
// click the show more button
;(item as HTMLAnchorElement).click()
await waitFor(2000)
// get the new timeline items and add them to the list
const newTimelineItems = Array.from(
mainThread.querySelectorAll('.timeline-item')
)
timelineItems.push(...newTimelineItems)
continue
}
const tweet = parseTweet(item)
tweet && tweets.push(tweet)
if (!nextThread) {
break
}
return tweets
},
username,
tweetId,
url
)) as Tweet[]
// get the new timeline items and add them to the list
const newTimelineItems = Array.from(
nextThread.querySelectorAll('.timeline-item')
)
timelineItems.push(...newTimelineItems)
continue
}
const tweet = parseTweet(item)
tweet && tweets.push(tweet)
}
return tweets
} catch (error) {
@ -203,7 +198,7 @@ export class NitterHandler extends ContentHandler {
}
}
return genTweets()
return genTweets(this.ADDRESS)
}
parseTweetUrl = (url: string) => {