subreddit_downloader_pushshift_api.py
from sys import argv
from time import sleep, time

import pandas as pd
import requests


def get_pushshift(subreddit_name=None, before=None, after=None, searchType='submission'):
    """Query the Pushshift API and return the JSON payload together with the
    created_utc of the oldest item in the batch (None when no results remain)."""
    suffix = ''
    if before is not None:
        suffix += f'&before={before}'
    if after is not None:
        suffix += f'&after={after}'
    if subreddit_name is not None:
        suffix += f'&subreddit={subreddit_name}'
    # Pushshift caps the number of results per request, so fewer than 1500 items
    # may come back; the pagination loops below pick up where each batch ends.
    url = f'https://api.pushshift.io/reddit/search/{searchType}?sort=desc&size=1500{suffix}'
    # print('loading ' + url)
    r = requests.get(url)
    data = r.json()
    if len(data['data']) > 0:
        prev_end_date = data['data'][-1]['created_utc']
    else:
        prev_end_date = None
    return (data, prev_end_date)
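# Illustrative example (hypothetical values, not from an actual run): calling
#   get_pushshift(subreddit_name='learnpython', before=1600000000, after='5y', searchType='comment')
# sends a GET request to
#   https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1600000000&after=5y&subreddit=learnpython
# and returns the parsed JSON plus the created_utc of the oldest comment in that batch.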
if __name__ == "__main__":
    try:
        subreddit_name = str(argv[1]).strip()
    except IndexError:
        print('error: please pass the subreddit name as the first argument, e.g. python subreddit_downloader_pushshift_api.py [subreddit name]')
        raise SystemExit(1)

    # scraping posts
    """
    scraped columns:
    data.
        id, title, author, selftext, link_flair_text, created_utc, num_comments, url (attached url), permalink
    """
    print(f"Scraping posts in subreddit '{subreddit_name}'")
    df_posts = pd.DataFrame([], columns=['id', 'title', 'author', 'selftext', 'link_flair_text', 'created_utc', 'num_comments', 'url', 'permalink'])
    prev_end_date = 9999999999  # start far in the future and page backwards in time
    while prev_end_date is not None:
        submissions, prev_end_date = get_pushshift(subreddit_name=subreddit_name, before=prev_end_date-1, after='5y', searchType='submission')
        if prev_end_date is not None:
            for post in submissions['data']:
                df_posts.loc[len(df_posts)] = [
                    post.get('id', ''), post.get('title', ''), post.get('author', ''), post.get('selftext', ''), post.get('link_flair_text', ''),
                    post.get('created_utc', ''), post.get('num_comments', ''), post.get('url', ''), post.get('permalink', '')
                ]
        sleep(1)  # be polite to the API between requests
    # scraping comments
    """
    scraped columns:
    data.
        id, parent_id, author, body, created_utc, permalink
    """
    print(f"Scraping comments in subreddit '{subreddit_name}'")
    df_comments = pd.DataFrame([], columns=['id', 'parent_id', 'author', 'body', 'created_utc', 'permalink'])
    prev_end_date = 9999999999
    while prev_end_date is not None:
        comments, prev_end_date = get_pushshift(subreddit_name=subreddit_name, before=prev_end_date-1, after='5y', searchType='comment')
        if prev_end_date is not None:
            for comment in comments['data']:
                df_comments.loc[len(df_comments)] = [
                    comment.get('id', ''), comment.get('parent_id', ''), comment.get('author', ''), comment.get('body', ''),
                    comment.get('created_utc', ''), comment.get('permalink', '')
                ]
        sleep(1)
    # saving to one Excel file with a sheet for posts and a sheet for comments
    time_current = round(time())
    path_save = f"subreddit_{subreddit_name}_{time_current}.xlsx"
    with pd.ExcelWriter(path_save) as writer:
        df_posts.to_excel(writer, sheet_name='posts', index=False)
        df_comments.to_excel(writer, sheet_name='comments', index=False)
    print(f"Saved results to '{path_save}'")