go to the next thread page to scrape long thread
This commit is contained in:
@ -1,10 +1,10 @@
|
||||
import { parseHTML } from 'linkedom'
|
||||
import _, { truncate } from 'lodash'
|
||||
import { DateTime } from 'luxon'
|
||||
import { Browser, BrowserContext } from 'puppeteer-core'
|
||||
import { ContentHandler, PreHandleResult } from '../content-handler'
|
||||
|
||||
interface Tweet {
|
||||
id: string
|
||||
url: string
|
||||
author: {
|
||||
username: string
|
||||
@ -37,159 +37,154 @@ export class NitterHandler extends ContentHandler {
|
||||
async getTweets(browser: Browser, username: string, tweetId: string) {
|
||||
const url = `${this.ADDRESS}/${username}/status/${tweetId}`
|
||||
|
||||
async function genTweets(): Promise<Tweet[]> {
|
||||
async function genTweets(address: string): Promise<Tweet[]> {
|
||||
function authorParser(header: Element) {
|
||||
const avatar = header
|
||||
.querySelector('.tweet-avatar img')
|
||||
?.getAttribute('src')
|
||||
if (!avatar) {
|
||||
return null
|
||||
}
|
||||
const name = header.querySelector('.fullname')?.getAttribute('title')
|
||||
if (!name) {
|
||||
return null
|
||||
}
|
||||
return {
|
||||
avatar,
|
||||
name,
|
||||
}
|
||||
}
|
||||
|
||||
function dateParser(tweetDate: string) {
|
||||
const validDateTime = tweetDate.replace(' · ', ' ')
|
||||
|
||||
return new Date(validDateTime).toISOString()
|
||||
}
|
||||
|
||||
function attachmentParser(attachments: Element | null) {
|
||||
if (!attachments) return { photos: [], videos: [] }
|
||||
|
||||
const photos = Array.from(attachments.querySelectorAll('img')).map(
|
||||
(i) => i.getAttribute('src') ?? ''
|
||||
)
|
||||
const videos = Array.from(attachments.querySelectorAll('source')).map(
|
||||
(i) => i.getAttribute('src') ?? ''
|
||||
)
|
||||
return {
|
||||
photos,
|
||||
videos,
|
||||
}
|
||||
}
|
||||
|
||||
function parseTweet(tweet: Element): Tweet | null {
|
||||
const header = tweet.querySelector('.tweet-header')
|
||||
if (!header) {
|
||||
return null
|
||||
}
|
||||
const author = authorParser(header)
|
||||
if (!author) {
|
||||
return null
|
||||
}
|
||||
|
||||
const body = tweet.querySelector('.tweet-body')
|
||||
if (!body) {
|
||||
return null
|
||||
}
|
||||
|
||||
const tweetDate = body
|
||||
.querySelector('.tweet-date a')
|
||||
?.getAttribute('title')
|
||||
if (!tweetDate) {
|
||||
return null
|
||||
}
|
||||
const createdAt = dateParser(tweetDate)
|
||||
|
||||
const content = body.querySelector('.tweet-content')
|
||||
if (!content) {
|
||||
return null
|
||||
}
|
||||
const text = content.textContent ?? ''
|
||||
const urls = Array.from(content.querySelectorAll('a')).map((a) => ({
|
||||
url: a.getAttribute('href') ?? '',
|
||||
display_url: a.textContent ?? '',
|
||||
}))
|
||||
|
||||
const attachments = body.querySelector('.attachments')
|
||||
const { photos, videos } = attachmentParser(attachments)
|
||||
|
||||
return {
|
||||
author: {
|
||||
username,
|
||||
name: author.name,
|
||||
profileImageUrl: author.avatar,
|
||||
},
|
||||
createdAt,
|
||||
text,
|
||||
url,
|
||||
entities: {
|
||||
urls,
|
||||
photos,
|
||||
videos,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
let context: BrowserContext | undefined
|
||||
try {
|
||||
const tweets: Tweet[] = []
|
||||
|
||||
context = await browser.createIncognitoBrowserContext()
|
||||
const page = await context.newPage()
|
||||
|
||||
// Modify this variable to control the size of viewport
|
||||
const deviceScaleFactor = 0.2
|
||||
const height = Math.floor(2000 / deviceScaleFactor)
|
||||
const width = Math.floor(1700 / deviceScaleFactor)
|
||||
await page.setViewport({ width, height, deviceScaleFactor })
|
||||
|
||||
await page.goto(url, {
|
||||
waitUntil: 'networkidle2',
|
||||
timeout: 30000, // 30 seconds
|
||||
})
|
||||
|
||||
const tweets = (await page.evaluate(
|
||||
async (username, id, url) => {
|
||||
async function waitFor(ms: number) {
|
||||
return new Promise((resolve) => setTimeout(resolve, ms))
|
||||
const html = await page.content()
|
||||
const document = parseHTML(html).document
|
||||
|
||||
// get the main thread including tweets and threads
|
||||
const mainThread = document.querySelector('.main-thread')
|
||||
if (!mainThread) {
|
||||
return []
|
||||
}
|
||||
const timelineItems = Array.from(
|
||||
mainThread.querySelectorAll('.timeline-item')
|
||||
)
|
||||
for (let i = 0; i < timelineItems.length; i++) {
|
||||
const item = timelineItems[i]
|
||||
if (item.classList.contains('more-replies')) {
|
||||
const newUrl = item.querySelector('a')?.getAttribute('href')
|
||||
if (!newUrl) {
|
||||
break
|
||||
}
|
||||
|
||||
function authorParser(header: Element) {
|
||||
const avatar = header
|
||||
.querySelector('.tweet-avatar img')
|
||||
?.getAttribute('src')
|
||||
if (!avatar) {
|
||||
return null
|
||||
}
|
||||
const name = header
|
||||
.querySelector('.fullname')
|
||||
?.getAttribute('title')
|
||||
if (!name) {
|
||||
return null
|
||||
}
|
||||
return {
|
||||
avatar,
|
||||
name,
|
||||
}
|
||||
}
|
||||
// go to new url and wait for it to load
|
||||
await page.goto(`${address}${newUrl}`, {
|
||||
waitUntil: 'networkidle2',
|
||||
timeout: 30000, // 30 seconds
|
||||
})
|
||||
|
||||
function dateParser(tweetDate: string) {
|
||||
const validDateTime = tweetDate.replace(' · ', ' ')
|
||||
|
||||
return new Date(validDateTime).toISOString()
|
||||
}
|
||||
|
||||
function attachmentParser(attachments: Element | null) {
|
||||
if (!attachments) return { photos: [], videos: [] }
|
||||
|
||||
const photos = Array.from(
|
||||
attachments.querySelectorAll('img')
|
||||
).map((i) => i.getAttribute('src') ?? '')
|
||||
const videos = Array.from(
|
||||
attachments.querySelectorAll('source')
|
||||
).map((i) => i.getAttribute('src') ?? '')
|
||||
return {
|
||||
photos,
|
||||
videos,
|
||||
}
|
||||
}
|
||||
|
||||
function parseTweet(tweet: Element): Tweet | null {
|
||||
const header = tweet.querySelector('.tweet-header')
|
||||
if (!header) {
|
||||
return null
|
||||
}
|
||||
const author = authorParser(header)
|
||||
if (!author) {
|
||||
return null
|
||||
}
|
||||
|
||||
const body = tweet.querySelector('.tweet-body')
|
||||
if (!body) {
|
||||
return null
|
||||
}
|
||||
|
||||
const tweetDate = body
|
||||
.querySelector('.tweet-date a')
|
||||
?.getAttribute('title')
|
||||
if (!tweetDate) {
|
||||
return null
|
||||
}
|
||||
const createdAt = dateParser(tweetDate)
|
||||
|
||||
const content = body.querySelector('.tweet-content')
|
||||
if (!content) {
|
||||
return null
|
||||
}
|
||||
const text = content.textContent ?? ''
|
||||
const urls = Array.from(content.querySelectorAll('a')).map(
|
||||
(a) => ({
|
||||
url: a.getAttribute('href') ?? '',
|
||||
display_url: a.textContent ?? '',
|
||||
})
|
||||
)
|
||||
|
||||
const attachments = body.querySelector('.attachments')
|
||||
const { photos, videos } = attachmentParser(attachments)
|
||||
|
||||
return {
|
||||
id,
|
||||
author: {
|
||||
username,
|
||||
name: author.name,
|
||||
profileImageUrl: author.avatar,
|
||||
},
|
||||
createdAt,
|
||||
text,
|
||||
url,
|
||||
entities: {
|
||||
urls,
|
||||
photos,
|
||||
videos,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
const tweets: Tweet[] = []
|
||||
// get the main thread including tweets and threads
|
||||
const mainThread = document.querySelector('.main-thread')
|
||||
if (!mainThread) {
|
||||
return []
|
||||
}
|
||||
const timelineItems = Array.from(
|
||||
mainThread.querySelectorAll('.timeline-item')
|
||||
const document = parseHTML(await page.content()).document
|
||||
const nextThread = document.querySelector(
|
||||
'.main-thread .after-tweet'
|
||||
)
|
||||
for (let i = 0; i < timelineItems.length; i++) {
|
||||
const item = timelineItems[i]
|
||||
if (item.classList.contains('show-more')) {
|
||||
// click the show more button
|
||||
;(item as HTMLAnchorElement).click()
|
||||
await waitFor(2000)
|
||||
|
||||
// get the new timeline items and add them to the list
|
||||
const newTimelineItems = Array.from(
|
||||
mainThread.querySelectorAll('.timeline-item')
|
||||
)
|
||||
timelineItems.push(...newTimelineItems)
|
||||
continue
|
||||
}
|
||||
|
||||
const tweet = parseTweet(item)
|
||||
tweet && tweets.push(tweet)
|
||||
if (!nextThread) {
|
||||
break
|
||||
}
|
||||
|
||||
return tweets
|
||||
},
|
||||
username,
|
||||
tweetId,
|
||||
url
|
||||
)) as Tweet[]
|
||||
// get the new timeline items and add them to the list
|
||||
const newTimelineItems = Array.from(
|
||||
nextThread.querySelectorAll('.timeline-item')
|
||||
)
|
||||
|
||||
timelineItems.push(...newTimelineItems)
|
||||
continue
|
||||
}
|
||||
|
||||
const tweet = parseTweet(item)
|
||||
tweet && tweets.push(tweet)
|
||||
}
|
||||
|
||||
return tweets
|
||||
} catch (error) {
|
||||
@ -203,7 +198,7 @@ export class NitterHandler extends ContentHandler {
|
||||
}
|
||||
}
|
||||
|
||||
return genTweets()
|
||||
return genTweets(this.ADDRESS)
|
||||
}
|
||||
|
||||
parseTweetUrl = (url: string) => {
|
||||
|
||||
Reference in New Issue
Block a user