# storyscraper.py
"""
This module is responsible for scraping stories from a predetermined multireddit.
It imports and filters the json story data and saves the stories in a specified file.
"""
import requests
import json
import os
from unidecode import unidecode
import re
# number of stories to get
n = 20


def scrape_stories() -> dict:
    """
    Scrapes the top stories of the week from a predetermined multireddit.
    """
    link = f"https://www.reddit.com/r/AmItheAsshole+MaliciousCompliance+nosleep+tifu/top.json?limit={n}&t=week"
    # Reddit tends to throttle the default python-requests User-Agent, so send a
    # descriptive one and fail loudly on HTTP errors.
    headers = {'User-Agent': 'storyscraper (personal script)'}
    response = requests.get(link, headers=headers, timeout=30)
    response.raise_for_status()
    return response.json()


def filter_stories(data: dict) -> list[dict]:
    """
    Filters the stories out of the raw Reddit listing JSON.
    :param data: listing JSON as returned by scrape_stories()
    :return: list of dicts with cleaned 'title' and 'text' fields
    """
    # TODO: Remove stories that are too short (len)
    # TODO: Remove links from stories (replace entire word with https://, and change it to the word 'link')
    stories = []
    # Reddit listings nest each post under data -> children -> [i] -> data.
    for post in data['data']['children']:
        title = filter_text(post['data']['title'])
        text = filter_text(post['data']['selftext'])
        stories.append({'title': title, 'text': text})
    return stories


def filter_text(text: str) -> str:
    """
    Filters out any unwanted text and characters from a story.
    :param text: raw title or selftext from Reddit
    :return: cleaned, ASCII-only text
    """
    text = text.replace('&', 'and')
    text = text.replace('\u2026', '...')  # horizontal ellipsis character
    # Soften profanity; substitute the longer word first so 'fucking' is not
    # already turned into 'forking' by the shorter pattern before its own rule runs.
    text = re.sub('fucking', 'flocking', text, flags=re.IGNORECASE)
    text = re.sub('fuck', 'fork', text, flags=re.IGNORECASE)
    text = re.sub('shit', 'poop', text, flags=re.IGNORECASE)
    text = re.sub('bitch', 'birch', text, flags=re.IGNORECASE)
    # Strip punctuation and control characters that would confuse later processing.
    translation_table = str.maketrans({
        ';': '',
        '/': '',
        '\\': '',
        '\n': '',
        '"': '',
        ':': '',
        "'": '',
    })
    text = text.translate(translation_table)
    # Normalize Unicode characters to ASCII
    text = unidecode(text)
    # Remove any remaining non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text
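
# Illustrative example (hypothetical input, not from the module itself):
#   filter_text('Well… "that\'s fine" & done')  ->  'Well... thats fine and done'
# ('&' becomes 'and', the ellipsis becomes '...', quotes and apostrophes are stripped).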


def save_stories(stories: list[dict], file_path: str):
    """
    Saves the stories to a specified file as JSON.
    :param stories: list of story dicts from filter_stories()
    :param file_path: destination path for the JSON file
    """
    # Make sure the target directory exists; opening with 'w' already truncates
    # any existing file, so no explicit delete is needed.
    directory = os.path.dirname(file_path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(file_path, 'w') as file:
        json.dump(stories, file)


def gather_n_stories(file_path: str = 'temp/stories.json') -> list[dict]:
    """
    Main entry point: scrapes, filters, and saves the top stories.
    :param file_path: where to write the JSON file
    :return: the filtered stories
    """
    data = scrape_stories()
    stories = filter_stories(data)
    save_stories(stories, file_path)
    return stories
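

# Minimal usage sketch (illustrative, not part of the original module): run the
# file directly to scrape, filter, and save the week's top stories.
if __name__ == '__main__':
    saved_stories = gather_n_stories()
    print(f"Saved {len(saved_stories)} stories to temp/stories.json")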