# reddit-scraper.py
import argparse
import datetime
import dateutil.parser
import json
import logging
import nltk.tokenize
import os
import praw
import prawcore
import re
import requests
import sys
import time
from multiprocessing import Pool
try:
import reddit_config as config
except ImportError:
print("You should create a file named reddit_config.py before running this script.")
print(
"Have a look at the README or just rename reddit_config.py and put your Reddit"
)
print("API credentials in there.")
exit(1)
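# A minimal reddit_config.py is expected to define the fields used below; an
# illustrative sketch (placeholder values, not real credentials):
#
#     CLIENT_ID = "your-app-client-id"
#     CLIENT_SECRET = "your-app-client-secret"
#     USER_AGENT = "reddit-scraper (by u/your-username)"
#     SKIP_DELETED = True   # skip posts deleted by their author
#     SKIP_REMOVED = True   # skip posts removed by mods/bots
#     PRINT_USERS = False   # prefix each line with its author's username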
# ~~~ Global constants ~~~ #
# Output encoding
ENCODING = "utf8"
# API endpoint
PUSHSHIFT_ENDPOINT = "https://api.pushshift.io/reddit/search/submission"
# Skip posts deleted by the author?
SKIP_DELETED = config.SKIP_DELETED
# Skip posts deleted by mods/bots?
SKIP_REMOVED = config.SKIP_REMOVED
# Prepend each line with its author's username?
PRINT_USERS = config.PRINT_USERS
# Word tokenizer
TOKENIZER = nltk.tokenize.WordPunctTokenizer().tokenize
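# Note: sentence splitting below uses nltk.tokenize.sent_tokenize, which relies on
# the "punkt" tokenizer data; if it is missing, run nltk.download("punkt") once.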
# Maximum number of retries when the APIs return an error
MAX_RETRIES = 5
# Configuration file field separator
CONFIG_FIELD_SEPARATOR = "\t"
# Default user for posts with no author
DEFAULT_NO_AUTHOR = "[ DELETED_AUTHOR ]"
# Separator for username/post
AUTHOR_SEP = " : "
# Logging
logging.basicConfig(
format="%(asctime)s\t%(levelname)s\t%(module)s\t%(message)s", level=logging.DEBUG
)
# Reduce logging noise from imported libraries
logging.getLogger("urllib3").setLevel(logging.INFO)
logging.getLogger("prawcore").setLevel(logging.INFO)
logging.getLogger("requests").setLevel(logging.INFO)
def conflate_spaces(text):
"""
Conflates space characters into a single space in a string.
    :param text: the string to clean up.
    :return: the input string, with the space characters conflated.
"""
return re.sub(r"\s+", " ", text)
def remove_markdown(text):
"""
    Removes markdown formatting from the input text.
    :param text: the text to clean up.
    :return: the cleaned-up text.
"""
    # Replace markdown links [text](url) with their link text
text = re.sub(r"\[([^]]+)\][ ]?\(([^)]+)\)", r"\g<1>", text)
# Remove star-marked bold and italic
# We apply the regex three times to remove
# *italic*, **bold**, ***bold and italic***
text = re.sub(r"\*([^]]+)\*", r"\g<1>", text)
text = re.sub(r"\*([^]]+)\*", r"\g<1>", text)
text = re.sub(r"\*([^]]+)\*", r"\g<1>", text)
# Remove underline-marked bold and italic
text = re.sub(r"_([^]]+)_", r"\g<1>", text)
text = re.sub(r"_([^]]+)_", r"\g<1>", text)
text = re.sub(r"_([^]]+)_", r"\g<1>", text)
# Remove code
text = re.sub(r"`([^]]+)`", r"\g<1>", text)
# Remove strikethrough
text = re.sub(r"~~([^]]+)~~", r"\g<1>", text)
# Remove spoilers
text = re.sub(r">!([^]]+)!<", r"\g<1>", text)
return text
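# Illustrative example of the cleanup above:
#   remove_markdown("**bold** and [a link](http://example.com)")  ->  "bold and a link"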
def scrape_comment_tree(comment):
"""
    Recursively scrapes a comment and all its replies (and each reply's replies, and so on).
    :param comment: the `praw` Comment instance to scrape.
    :return: a list containing the comment body and the bodies of all its replies.
"""
comments = []
    if PRINT_USERS:
        author = comment.author.name if comment.author else DEFAULT_NO_AUTHOR
        comments.append(author + AUTHOR_SEP + comment.body)
    else:
        comments.append(comment.body)
for reply in comment.replies:
comments.extend(scrape_comment_tree(reply))
return comments
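# With PRINT_USERS enabled, each returned entry looks like "some_user : comment text"
# (using AUTHOR_SEP); otherwise it is just the raw comment body. Replies are listed
# depth-first, immediately after their parent comment.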
def scrape_submission(reddit, submission_id, blacklist, output_dir, status_message=""):
"""
    Scrapes a single submission and its comments and writes
    its content to a file in `output_dir`.
    :param reddit: the `reddit` instance.
    :param submission_id: the ID of the submission to scrape.
    :param blacklist: a list of lines to ignore.
    :param output_dir: the output directory.
    :param status_message: an additional status message (optional).
"""
# Obtain the submission
submission = reddit.submission(id=submission_id)
output_file = "{0}{1}{2}.{3}".format(output_dir, os.sep, submission_id, "txt")
logging.debug(
'Scraping {0} {1} "{2}"'.format(
submission.id + " |",
status_message + " |" if status_message.strip() else "",
submission.title
if len(submission.title) < 40
else submission.title[:37] + "...",
)
)
# Skip deleted (by the author) or removed (due to rule violation) submissions
if SKIP_DELETED and "[deleted]" in submission.selftext:
return
if SKIP_REMOVED and "[removed]" in submission.selftext:
return
# Build list of contents
if PRINT_USERS:
author = submission.author.name if submission.author else DEFAULT_NO_AUTHOR
submission_list = [
author + AUTHOR_SEP + submission.title,
author + AUTHOR_SEP + submission.selftext,
]
else:
submission_list = [submission.title, submission.selftext]
    submission.comments.replace_more(limit=None)
for comment in submission.comments:
comments = scrape_comment_tree(comment)
if comments:
submission_list.extend(comments)
submission_list = [line.strip() + "\n" for line in submission_list]
# Put one sentence per line
sentences = []
for _submission in submission_list:
if PRINT_USERS:
sub_author = _submission[: _submission.index(":") - 1]
sub_text = _submission[_submission.index(":") + 2 :]
line_sentences = nltk.tokenize.sent_tokenize(remove_markdown(sub_text))
clean_lines = [conflate_spaces(sent) for sent in line_sentences]
clean_lines = [
sub_author + AUTHOR_SEP + " ".join(TOKENIZER(sent))
for sent in clean_lines
]
else:
line_sentences = nltk.tokenize.sent_tokenize(remove_markdown(_submission))
clean_lines = [conflate_spaces(sent) for sent in line_sentences]
clean_lines = [" ".join(TOKENIZER(sent)) for sent in clean_lines]
if len(clean_lines) > 0:
if len(blacklist) == 0:
sentences.extend(clean_lines)
else:
for line in clean_lines:
if line not in blacklist:
sentences.append(line)
# Write to file
with open(output_file, encoding=ENCODING, mode="w") as f:
f.writelines(sentence + "\n" for sentence in sentences)
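# The resulting <output_dir>/<submission_id>.txt holds one tokenized sentence per line,
# e.g. (hypothetical usernames and text, with PRINT_USERS enabled):
#   some_user : This is the submission title
#   another_user : Thanks , that worked for me !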
def get_submission_list(start_timestamp, end_timestamp, args=None):
"""
    Queries the Pushshift API for submissions between two given timestamps. Due to
    limitations of the underlying service, it may not return all the matching
    submissions, so it may be necessary to call this method again. The method requests
    the results in descending order, so in subsequent calls you should only update end_timestamp.
    :param start_timestamp: request results after this date/time.
    :param end_timestamp: request results before this date/time.
    :param args: extra query parameters to pass to the endpoint.
:return: the JSON object returned by the service.
"""
    # Generic parameters: for each submission we want its ID, timestamp, and subreddit;
    # 500 is the maximum allowed limit, sorted by creation time, most recent first
params = "fields=id,created_utc,subreddit&limit=500&sort=desc&sort_type=created_utc"
if args:
for key, value in args.items():
params += "&{0}={1}".format(key, value)
url = "{0}?before={1}&after={2}&{3}".format(
PUSHSHIFT_ENDPOINT, end_timestamp, start_timestamp, params
)
resp = requests.get(url)
return resp.json()
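# For example (illustrative values), get_submission_list(1577836800, 1580515200, args={"subreddit": "python"})
# requests a URL of the form:
#   https://api.pushshift.io/reddit/search/submission?before=1580515200&after=1577836800&fields=id,created_utc,subreddit&limit=500&sort=desc&sort_type=created_utc&subreddit=python
# and the response is expected to look like:
#   {"data": [{"id": "...", "created_utc": 1580500000, "subreddit": "python"}, ...]}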
def scrape_all(reddit, start_timestamp, end_timestamp, output_dir, config, blacklist):
"""
    Scrapes all submissions matching the Pushshift parameters in `config` between the
    two timestamps and writes each one to a file under `output_dir/<subreddit>/<date>/`.
    :param reddit: the `reddit` instance.
    :param start_timestamp: the starting Unix timestamp of the scraping.
    :param end_timestamp: the ending Unix timestamp of the scraping.
    :param output_dir: the output directory.
    :param config: the parameters to pass to the Pushshift API.
    :param blacklist: a list of lines to ignore.
"""
start_date = datetime.datetime.utcfromtimestamp(start_timestamp).strftime(
"%Y-%m-%d %H:%M:%S"
)
end_date = datetime.datetime.utcfromtimestamp(end_timestamp).strftime(
"%Y-%m-%d %H:%M:%S"
)
logging.info("Scraping {0} to {1}...".format(start_date, end_date))
# get_submission_list returns the _most recent_ 500 submissions
# between the provided timestamps. So, if there are more than 500,
# we need to iterate.
smallest_processed_timestamp = end_timestamp
while smallest_processed_timestamp > start_timestamp:
submissions = None
retries = 0
e = Exception()
while (not submissions) and (retries < MAX_RETRIES):
retries += 1
try:
submissions = get_submission_list(
start_timestamp, smallest_processed_timestamp, args=config
)
except json.decoder.JSONDecodeError as _e:
e = _e
logging.info(
"Failed scraping {0}: tentative {1} of {2}".format(
start_date, retries, MAX_RETRIES
)
)
time.sleep(1)
if not submissions:
logging.warning(
"Error while retrieving {0}: {1}".format(
start_date, str(type(e).__name__)
)
)
return
for submission in submissions["data"]:
try:
sub_date = datetime.datetime.utcfromtimestamp(
submission["created_utc"]
).strftime("%Y-%m-%d")
                sub_output_dir = os.path.join(
                    output_dir, submission["subreddit"], sub_date
                )
os.makedirs(sub_output_dir, exist_ok=True)
scrape_submission(
reddit, submission["id"], blacklist, sub_output_dir, sub_date
)
except Exception as e:
logging.warning(
"{0}: Failed scraping submission {1} due to {2} {3}".format(
start_date,
submission["id"],
type(e).__name__,
": " + str(e) if str(e) else "",
)
)
if len(submissions["data"]) == 0:
break
        smallest_processed_timestamp = submissions["data"][-1]["created_utc"]
logging.info(
"Scraped from {0} to {1}.".format(
datetime.datetime.utcfromtimestamp(
smallest_processed_timestamp
).strftime("%Y-%m-%d %H:%M:%S"),
datetime.datetime.utcfromtimestamp(
submissions["data"][0]["created_utc"]
).strftime("%Y-%m-%d %H:%M:%S"),
)
)
logging.info("Scraped {0} to {1}.".format(start_date, end_date))
def scrape_subreddit(reddit, sub_id, start_date, end_date, output_dir, blacklist):
"""
    Scrapes the specified subreddit between the two dates and writes each submission
    to a file in an eponymously named directory under `output_dir`.
    :param reddit: the `reddit` instance.
    :param sub_id: the subreddit to scrape.
    :param start_date: the starting date of the scraping, in YYYY-MM-DD format.
    :param end_date: the end date of the scraping, in YYYY-MM-DD format.
    :param output_dir: the output directory.
    :param blacklist: a list of lines to ignore.
"""
logging.info("Scraping {0}...".format(sub_id))
# Create the subreddit's directory
output_dir = "{0}{1}{2}".format(output_dir, os.sep, sub_id)
os.makedirs(output_dir, exist_ok=True)
p1 = dateutil.parser.parse(start_date)
p2 = dateutil.parser.parse(end_date)
d1 = datetime.datetime(p1.year, p1.month, p1.day)
d2 = datetime.datetime(p2.year, p2.month, p2.day, 23, 59)
start_timestamp = int(d1.timestamp())
end_timestamp = int(d2.timestamp())
# get_submission_list returns the _most recent_ 500 submissions
# between the provided timestamps. So, if there are more than 500,
# we need to iterate.
smallest_processed_timestamp = end_timestamp
while smallest_processed_timestamp > start_timestamp:
submissions = None
retries = 0
e = Exception()
while (not submissions) and (retries < MAX_RETRIES):
retries += 1
try:
submissions = get_submission_list(
start_timestamp,
smallest_processed_timestamp,
args={"subreddit": sub_id},
)
except json.decoder.JSONDecodeError as _e:
e = _e
logging.info(
"Failed scraping {0}: tentative {1} of {2}".format(
sub_id, retries, MAX_RETRIES
)
)
time.sleep(1)
if not submissions:
logging.warning(
"Error while retrieving {0}: {1}".format(sub_id, str(type(e).__name__))
)
return
for submission in submissions["data"]:
try:
sub_date = datetime.datetime.utcfromtimestamp(
submission["created_utc"]
).strftime("%Y-%m-%d")
scrape_submission(
reddit, submission["id"], blacklist, output_dir, sub_date
)
except Exception as e:
logging.warning(
"r/{0}: Failed scraping submission {1} due to {2} {3}".format(
sub_id,
submission["id"],
type(e).__name__,
": " + str(e) if str(e) else "",
)
)
if len(submissions["data"]) == 0:
break
        smallest_processed_timestamp = submissions["data"][-1]["created_utc"]
logging.info(
"Scraped {0} from {1} to {2}.".format(
sub_id,
datetime.datetime.utcfromtimestamp(
smallest_processed_timestamp
).strftime("%Y-%m-%d %H:%M:%S"),
datetime.datetime.utcfromtimestamp(
submissions["data"][0]["created_utc"]
).strftime("%Y-%m-%d %H:%M:%S"),
)
)
logging.info("Finished scraping {0}.".format(sub_id))
def load_list_from_file(file):
"""
    Loads a list of entries (subreddit names or post IDs) from a file, one per line,
    skipping lines that start with '#'. If the input file does not exist, prints a
    message and quits the script.
    :param file: the input file.
    :return: the list of entries.
"""
if os.path.isfile(file):
with open(file, encoding="utf8") as f:
lines = f.readlines()
lines = [s.strip() for s in lines]
lines = [sub for sub in lines if not sub.startswith("#")]
logging.debug("First 5 entries: {0}".format(str(lines[:5])))
return lines
else:
print("The file you specified does not exist or it is inaccessible.")
print("Exiting...")
exit(1)
def load_blacklist(file):
"""
    Loads the blacklist, i.e. the list of lines to ignore, and returns it. If the
    input file does not exist, prints a message and quits the script.
    :param file: the input file.
    :return: a list of lines to skip.
"""
if os.path.isfile(file):
with open(file, encoding="utf8") as f:
return [l.strip() for l in f.readlines()]
else:
print("The blacklist file you specified does not exist or it is inaccessible.")
print("Exiting...")
exit(1)
def load_config(file):
"""
    Loads the configuration file containing the parameters to pass to Pushshift and
    returns them. If the input file does not exist, prints a message and quits the script.
    :param file: the input file.
    :return: a dictionary of configuration parameters for Pushshift.
"""
config = {}
if os.path.isfile(file):
with open(file, encoding="utf8") as f:
            for line in f.readlines():
                line = line.strip()
                if line and not line.startswith("#"):
                    config_entry = line.split(CONFIG_FIELD_SEPARATOR)
                    assert (
                        len(config_entry) == 2
                    ), "Invalid configuration: each line should contain two tab-separated entries"
                    config[config_entry[0]] = config_entry[1]
return config
else:
print("The config file you specified does not exist or it is inaccessible.")
print("Exiting...")
exit(1)
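# Illustrative Pushshift config file (one tab-separated key/value pair per line;
# lines starting with '#' are ignored). Each pair becomes a query parameter, e.g.:
#   subreddit <TAB> AskReddit
#   q         <TAB> cats
# ("q" is Pushshift's full-text search parameter; the keys here are only examples.)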
def check_output_directory(output_dir):
"""
Checks if the output directory exists and is writable. If not, prints a message
and quits the script.
:param output_dir: the string representing the output path.
"""
if not os.access(output_dir, os.W_OK):
print(
"Error: {0} is not a valid directory or it is not writable.".format(
output_dir
)
)
print("Exiting...")
exit(1)
def do_reddit_login():
"""
    Reads the configuration and initialises the `reddit` object.
    If something goes wrong, prints a message and quits the script.
:return: the `reddit` instance.
"""
reddit = praw.Reddit(
client_id=config.CLIENT_ID,
client_secret=config.CLIENT_SECRET,
user_agent=config.USER_AGENT,
)
# Check if we're actually logged in
try:
for _ in reddit.subreddit("test").top("week"):
break
except (prawcore.exceptions.OAuthException, prawcore.exceptions.ResponseException):
print(
"Login failed. Please double check your username, password, and application tokens."
)
exit(1)
return reddit
def make_splits(start_date, end_date, workers):
"""
    Splits the date range into `workers` equal intervals so the download of the
    entire Reddit corpus can be parallelised.
    :param start_date: the start date of the scrape.
    :param end_date: the end date of the scrape.
    :param workers: the number of splits.
    :return: the splits, as a list of tuples like `[(start_timestamp, end_timestamp), ...]`.
"""
p1 = dateutil.parser.parse(start_date)
p2 = dateutil.parser.parse(end_date)
d1 = datetime.datetime(p1.year, p1.month, p1.day)
d2 = datetime.datetime(p2.year, p2.month, p2.day, 23, 59)
delta = (d2 - d1) / workers
start_split = d1
end_split = d1 + delta
splits = [(int(start_split.timestamp()), int(end_split.timestamp()))]
while end_split < d2:
start_split = end_split
end_split = start_split + delta
splits.append((int(start_split.timestamp()), int(end_split.timestamp())))
return splits
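# Example (illustrative): make_splits("2020-01-01", "2020-01-04", 3) cuts the range into
# three equal spans and returns [(t0, t1), (t1, t2), (t2, t3)] as Unix timestamps
# (computed in the local timezone, since naive datetimes are used).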
def process_all(args):
"""
Utility function for parallel processing.
:param args: a tuple containing the args for `scrape_all`.
"""
    # Unpack arguments
    (reddit, start_timestamp, end_timestamp, output_folder, config, blacklist) = args
    scrape_all(reddit, start_timestamp, end_timestamp, output_folder, config, blacklist)
def process_subs(args):
"""
Utility function for parallel processing.
:param args: a tuple containing the args for `scrape_subreddit`.
"""
    # Unpack arguments
    (reddit, sub, start_date, end_date, output_folder, blacklist) = args
    scrape_subreddit(reddit, sub, start_date, end_date, output_folder, blacklist)
def process_posts(args):
"""
Utility function for parallel processing.
:param args: a tuple containing the args for `scrape_submission`.
"""
# Unpack arguments
(reddit, post, blacklist, output_folder) = args
scrape_submission(reddit, post, blacklist, output_folder)
def main():
import textwrap
# Parse command-line arguments
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,
description=textwrap.dedent(
"""\
Scrapes subreddits and puts their content in a plain text file.
Use with --posts to download posts, --subs to download
subreddits, and --config to make custom Pushshift API calls.
"""
),
)
mode_group = parser.add_mutually_exclusive_group(required=True)
mode_group.add_argument(
"--posts",
dest="posts_file",
type=str,
default="",
help="A file containing the list of posts to download, one per line.",
)
mode_group.add_argument(
"--subs",
dest="subs_file",
type=str,
# required=False,
default="",
help="A file containing the list of subreddits to download, one per line.",
)
mode_group.add_argument(
"--config",
dest="config_file",
type=str,
# required=False,
default="",
help="A file containing the arguments for the Pushshift APIs. See config.default.txt for a sample config file.",
)
parser.add_argument(
"--start",
dest="start_date",
type=str,
# required=True,
help="The date to start parsing from, in YYYY-MM-DD format",
)
parser.add_argument(
"--end",
dest="end_date",
type=str,
# required=True,
help="The final date of the parsing, in YYYY-MM-DD format",
)
parser.add_argument(
"--output",
dest="output_folder",
type=str,
required=True,
help="The output folder",
)
parser.add_argument(
"--blacklist",
dest="blacklist_file",
type=str,
required=False,
default="",
help="A file containing the lines to skip.",
)
parser.add_argument(
"--workers",
dest="num_workers",
type=int,
required=False,
default=1,
help="Number of parallel workers",
)
if len(sys.argv[1:]) == 0:
parser.print_help()
parser.exit()
args = parser.parse_args()
if args.config_file or args.subs_file:
if not (args.start_date and args.end_date):
parser.error(
"Start date and end date are required in --config or --subs mode."
)
pattern = re.compile("^[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]$")
if not (pattern.match(args.start_date) and pattern.match(args.end_date)):
parser.error("Invalid date format.")
check_output_directory(args.output_folder)
try:
reddit = do_reddit_login()
except ImportError:
parser.error("Failed to load configuration. Did you create reddit_config.py?")
return
        # parser.error() exits, so this return is never reached; it only silences
        # 'variable might not be initialized' warnings below.
if args.subs_file:
subs = load_list_from_file(args.subs_file)
blacklist = load_blacklist(args.blacklist_file) if args.blacklist_file else []
if args.num_workers > 1:
with Pool(args.num_workers) as p:
p.map(
process_subs,
[
(
reddit,
sub,
args.start_date,
args.end_date,
args.output_folder,
blacklist,
)
for sub in subs
],
)
else:
for sub in subs:
process_subs(
(
reddit,
sub,
args.start_date,
args.end_date,
args.output_folder,
blacklist,
)
)
elif args.posts_file:
posts = load_list_from_file(args.posts_file)
blacklist = load_blacklist(args.blacklist_file) if args.blacklist_file else []
if args.num_workers > 1:
with Pool(args.num_workers) as p:
p.map(
process_posts,
[(reddit, post, blacklist, args.output_folder) for post in posts],
)
else:
for post in posts:
process_posts((reddit, post, blacklist, args.output_folder))
else:
blacklist = load_blacklist(args.blacklist_file) if args.blacklist_file else []
config = load_config(args.config_file) if args.config_file else {}
if args.num_workers > 1:
with Pool(args.num_workers) as p:
p.map(
process_all,
[
(
reddit,
start_split,
end_split,
args.output_folder,
config,
blacklist,
)
for (start_split, end_split) in make_splits(
args.start_date, args.end_date, args.num_workers
)
],
)
else:
start_ts, end_ts = make_splits(args.start_date, args.end_date, 1)[0]
process_all(
(reddit, start_ts, end_ts, args.output_folder, config, blacklist)
)
print("Done!")
exit(0)
if __name__ == "__main__":
main()
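# Example invocations (illustrative; the input file names are hypothetical):
#   python reddit-scraper.py --subs subreddits.txt --start 2020-01-01 --end 2020-01-31 --output ./data --workers 4
#   python reddit-scraper.py --posts post_ids.txt --output ./data
#   python reddit-scraper.py --config config.default.txt --start 2020-01-01 --end 2020-01-07 --output ./data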