-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTweetScraper.py
129 lines (107 loc) · 4.74 KB
/
TweetScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import calendar
from datetime import date, timedelta
from time import sleep

import pandas as pd
import snscrape.modules.twitter as sntwitter
from tqdm.auto import tqdm
class TweetScraper:
    """
    Scrape English-language, non-reply tweets for one year, a calendar month at a time.

    Each call to ``grab_tweets`` walks the days of one month, collects at most
    ``num_tweets_per_day`` tweets per day and writes them to ``<month>-<year>.csv``.
    """

    # CSV header. The order must match the values returned by extract_tweet_data.
    COLUMNS = (
        'id', 'url', 'date', 'content',
        'likeCount', 'replyCount', 'retweetCount',
        'quoteCount', 'sourceLabel', 'links', 'media',
        'quotedTweet', 'mentionedUsers', 'coordinates', 'place',
        'hashtags', 'cashtags', 'card', 'vibe', 'username',
        'UserDescription', 'UserFavouritesCount', 'followersCount',
        'friendsCount', 'location', 'verified', 'protected', 'mediaCount',
    )

    def __init__(self, year: int, num_tweets_per_day: int):
        """
        Store the scraping configuration.

        Parameters:
        - year (int): The year for which to scrape tweets.
        - num_tweets_per_day (int): The maximum number of tweets to keep per day.
        """
        self.year = year
        self.num_tweets_per_day = num_tweets_per_day

    def grab_tweets(self, month: int) -> None:
        """
        Scrape every day of the given month and save the result to a CSV file.

        Parameters:
        - month (int): The month number (1-12).

        Raises:
        - ValueError: If ``month`` is not a valid month number.
        """
        days_in_month = self.calculate_days_in_month(month)
        tweets_list = []
        since = date(self.year, month, 1)
        # The context manager closes the progress bar even if scraping fails part-way.
        with tqdm(total=days_in_month * self.num_tweets_per_day) as pbar:
            for _ in range(days_in_month):
                until = since + timedelta(days=1)
                tweets_list.extend(self.scrape_tweets_for_day(since, until, pbar))
                since = until
        self.save_to_csv(tweets_list, month)

    def calculate_days_in_month(self, month: int) -> int:
        """
        Return the number of days in the given month of ``self.year``.

        Parameters:
        - month (int): The month number (1-12).

        Returns:
        - int: The number of days in the month (leap years are accounted for).

        Raises:
        - ValueError: If ``month`` is outside 1-12 (raised by ``calendar.monthrange``).
        """
        return calendar.monthrange(self.year, month)[1]

    def scrape_tweets_for_day(self, since, until, pbar):
        """
        Scrape up to ``num_tweets_per_day`` non-reply English tweets for one day.

        Parameters:
        - since (date): The start date.
        - until (date): The end date (exclusive).
        - pbar (tqdm): The progress bar object; advanced by one per collected tweet.

        Returns:
        - list: One row (see ``extract_tweet_data``) per collected tweet.
        """
        query = f'since:{since.isoformat()} until:{until.isoformat()} lang:en'
        results = sntwitter.TwitterSearchScraper(query).get_items()
        tweets_list = []
        # The helper stops pulling from the scraper as soon as the quota is met,
        # instead of iterating over every remaining search result for the day.
        for tweet in self._iter_top_level_tweets(results, self.num_tweets_per_day):
            tweets_list.append(self.extract_tweet_data(tweet))
            # Short pause after each collected tweet (presumably to throttle requests).
            sleep(0.01)
            pbar.update(1)
        return tweets_list

    @staticmethod
    def _iter_top_level_tweets(tweets, limit):
        """
        Yield at most ``limit`` tweets that are not replies, then stop consuming ``tweets``.

        Parameters:
        - tweets (iterable): Tweet objects exposing ``inReplyToTweetId`` and ``inReplyToUser``.
        - limit (int): The maximum number of tweets to yield.
        """
        if limit <= 0:
            return
        taken = 0
        for tweet in tweets:
            if tweet.inReplyToTweetId is not None or tweet.inReplyToUser is not None:
                continue
            yield tweet
            taken += 1
            if taken >= limit:
                # Return before asking the source for another item.
                return

    @staticmethod
    def extract_tweet_data(tweet):
        """
        Flatten a tweet object into a row of values.

        Parameters:
        - tweet (Tweet): The tweet object.

        Returns:
        - list: The tweet and user fields, in the same order as ``COLUMNS``.
        """
        user = tweet.user
        return [tweet.id, tweet.url, tweet.date, tweet.content,
                tweet.likeCount, tweet.replyCount, tweet.retweetCount,
                tweet.quoteCount, tweet.sourceLabel, tweet.links, tweet.media,
                tweet.quotedTweet, tweet.mentionedUsers, tweet.coordinates, tweet.place,
                tweet.hashtags, tweet.cashtags, tweet.card, tweet.vibe,
                user.username, user.description, user.favouritesCount,
                user.followersCount, user.friendsCount, user.location,
                user.verified, user.protected, user.mediaCount]

    def save_to_csv(self, tweets_list, month):
        """
        Write the scraped rows to ``<month>-<year>.csv`` in the current directory.

        Parameters:
        - tweets_list (list): Rows as produced by ``extract_tweet_data``.
        - month (int): The month number, used in the file name.
        """
        df = pd.DataFrame(tweets_list, columns=list(self.COLUMNS))
        df.to_csv(f'{month}-{self.year}.csv', encoding='utf-8', index=False)
# Example usage: scrape every month of one year, writing one CSV file per month.
year = 2022
num_tweets_per_day = 3000

# Run the scrape only when this file is executed as a script, so that importing
# TweetScraper from another module does not start a scrape.
if __name__ == "__main__":
    scraper = TweetScraper(year, num_tweets_per_day)
    for month in range(1, 13):
        scraper.grab_tweets(month)