-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathreddit_scrape.py
More file actions
83 lines (69 loc) · 2.55 KB
/
reddit_scrape.py
File metadata and controls
83 lines (69 loc) · 2.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import datetime
import requests
import json
import csv
def getPushshiftData(query, after, before, sub):
    """Fetch one page of submissions from the Pushshift search endpoint.

    Args:
        query: Text sent as the ``title`` search parameter.
        after: Value sent as the ``after`` parameter (the script in this
            file passes Unix epoch seconds).
        before: Value sent as the ``before`` parameter (same format).
        sub: Subreddit name sent as the ``subreddit`` parameter.

    Returns:
        The value stored under the ``data`` key of the JSON response.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        KeyError: If the response JSON has no ``data`` key.
    """
    # Imported locally so this function does not depend on edits elsewhere.
    from urllib.parse import urlencode

    # Percent-encode every value so queries containing spaces, '&', '#', etc.
    # cannot corrupt the query string.
    params = urlencode({
        'title': query,
        'after': after,
        'before': before,
        'subreddit': sub,
    })
    url = 'https://api.pushshift.io/reddit/search/submission/?' + params
    print(url)
    # A timeout prevents a stalled connection from hanging the script forever.
    r = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    r.raise_for_status()
    return r.json()['data']
def collectSubData(subm):
    """Record the fields of interest from one submission in ``subStats``.

    Reads title, url, flair, author, id, score, creation time, comment count
    and permalink from ``subm`` and stores them as a single tuple, wrapped in
    a one-element list, under the submission id in the module-level
    ``subStats`` dict.

    Args:
        subm: Mapping describing one submission.

    Raises:
        KeyError: If any field other than ``link_flair_text`` is missing.
    """
    title, link = subm['title'], subm['url']
    # Flair is the only optional field; absent flair is stored as "NaN".
    flair_text = subm.get('link_flair_text', "NaN")
    author, post_id, score = subm['author'], subm['id'], subm['score']
    # fromtimestamp() without a tz yields a naive datetime in local time.
    posted_at = datetime.datetime.fromtimestamp(subm['created_utc'])
    comment_count, permalink = subm['num_comments'], subm['permalink']

    row = (post_id, title, link, author, score, posted_at, comment_count,
           permalink, flair_text)
    # Readers of subStats index the stored value with [0], so keep the
    # tuple wrapped in a list.
    subStats[post_id] = [row]
def updateSubs_file():
    """Prompt for a filename and write every ``subStats`` entry to it as CSV.

    The filename is read from standard input. The file is opened for writing
    as UTF-8 and receives a header row followed by one row per entry of the
    module-level ``subStats`` dict (the first element of each stored list).
    The number of entries is then printed.
    """
    column_names = ["Post ID", "Title", "Url", "Author", "Score", "Publish Date", "Total No. of Comments", "Permalink",
                    "Flair"]
    output_dir = ""

    print("input filename of submission file, please add .csv")
    target_path = output_dir + input()

    with open(target_path, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.writer(handle, delimiter=',')
        writer.writerow(column_names)
        # Each stored value is a one-element list holding the row tuple.
        writer.writerows(entries[0] for entries in subStats.values())

    written = len(subStats)
    print(f"{written} submissions have been uploaded")
# Subreddit to query
sub = 'news'
# Search window, as Unix epoch seconds
before = '1451606400'  # 2016-01-01 00:00:00 UTC
after = '1420070400'  # 2015-01-01 00:00:00 UTC
query = "amazon"
subCount = 0
subStats = {}

data = getPushshiftData(query, after, before, sub)
# Keep requesting pages until one comes back empty, moving 'after' forward
# to the creation time of the last submission seen on each page.
while data:
    for submission in data:
        collectSubData(submission)
        subCount += 1
    print(len(data))
    last_created = data[-1]['created_utc']
    print(str(datetime.datetime.fromtimestamp(last_created)))
    # Next request starts from the creation time of the last submission.
    after = last_created
    data = getPushshiftData(query, after, before, sub)

print(len(data))
print(str(len(subStats)) + " submissions have added to list")
# Only report first/last entries when something was collected; indexing an
# empty list here would raise IndexError before the CSV step.
if subStats:
    entries = list(subStats.values())
    first_row = entries[0][0]
    last_row = entries[-1][0]
    print("1st entry is:")
    print(first_row[1] + " created: " + str(first_row[5]))
    print("Last entry is:")
    print(last_row[1] + " created: " + str(last_row[5]))
updateSubs_file()