-
Notifications
You must be signed in to change notification settings - Fork 1
/
reddit-sentiment-analysis.py
170 lines (142 loc) · 6.99 KB
/
reddit-sentiment-analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
'''*****************************************************************************
Purpose: To analyze the sentiment of Reddit comments about stock tickers.
This program uses VADER's SentimentIntensityAnalyzer to calculate each ticker's compound sentiment value.
You can change multiple parameters to suit your needs. See below under "set program parameters."
Implementation:
Sets are used for 'x in s' membership tests: a set's "x in s" lookup is O(1), compared to a list's O(n).
Limitations:
The results depend mainly on the defined parameters in the current implementation:
it completely ignores heavily downvoted comments, and there can be times when
the most-mentioned ticker is heavily downvoted — you can change that via the upvotes variable.
Author: github:asad70
-------------------------------------------------------------------
****************************************************************************'''
import praw
from data import *
import time
import pandas as pd
import matplotlib.pyplot as plt
import squarify
from nltk.sentiment.vader import SentimentIntensityAnalyzer
start_time = time.time()  # wall-clock start; used to report total runtime at the end
# Reddit API client (praw).
# NOTE(review): client_id and client_secret are hardcoded and committed here —
# these credentials are exposed and should be revoked, then loaded from
# environment variables or a config file instead of the source.
reddit = praw.Reddit(user_agent="Comment Extraction",
    client_id="ZM9jcd0nyXvtlA",
    client_secret="2WjTo27fw6c98-x0Nb5oTICNB-6D0g",
    username="",
    password="")
'''############################################################################'''
# set the program parameters
subs = ['wallstreetbets' ] # sub-reddit to search
post_flairs = {'Daily Discussion', 'Weekend Discussion', 'Discussion'} # posts flairs to search || None flair is automatically considered
goodAuth = {'AutoModerator'} # authors whom comments are allowed more than once
uniqueCmt = True # allow one comment per author per symbol
ignoreAuthP = {'example'} # authors to ignore for posts
ignoreAuthC = {'example'} # authors to ignore for comment
upvoteRatio = 0.70 # upvote ratio for post to be considered, 0.70 = 70%
ups = 20 # define # of upvotes, post is considered if upvotes exceed this #
limit = 500 # define the limit, comments 'replace more' limit
upvotes = 2 # define # of upvotes, comment is considered if upvotes exceed this #
picks = 10 # define # of picks here, prints as "Top ## picks are:"
picks_ayz = 5 # define # of picks for sentiment analysis
'''############################################################################'''
# Running totals and per-ticker accumulators for the scrape:
#   posts      - number of submissions that passed the filters
#   count      - total counted ticker mentions
#   c_analyzed - total comments examined
#   tickers    - ticker -> mention count
#   titles     - titles of analyzed submissions
#   a_comments - ticker -> list of comment bodies mentioning it
posts, count, c_analyzed, tickers, titles, a_comments = 0, 0, 0, {}, [], {}
cmt_auth = {}  # ticker -> list of authors already counted for that ticker

for sub in subs:
    subreddit = reddit.subreddit(sub)
    hot_python = subreddit.hot()  # sorting posts by hot

    # Extracting comments and ticker symbols from the subreddit.
    for submission in hot_python:
        flair = submission.link_flair_text
        # Bug fix: deleted/suspended post authors come back as None; the
        # original crashed on `.name` here. Skip such posts instead.
        if submission.author is None:
            continue
        author = submission.author.name

        # Checking: post upvote ratio, # of upvotes, post flair, and author.
        if submission.upvote_ratio >= upvoteRatio and submission.ups > ups and (flair in post_flairs or flair is None) and author not in ignoreAuthP:
            submission.comment_sort = 'new'
            comments = submission.comments
            titles.append(submission.title)
            posts += 1
            try:
                submission.comments.replace_more(limit=limit)
                for comment in comments:
                    # Bug fix: the original wrapped `comment.author.name` in a
                    # bare `except: pass`, which left `auth` unbound on the first
                    # deleted-author comment and stale (mis-attributed to the
                    # previous author) afterwards. Resolve it explicitly.
                    auth = comment.author.name if comment.author is not None else None
                    c_analyzed += 1

                    # Checking: comment upvotes and author.
                    if comment.score > upvotes and auth not in ignoreAuthC:
                        split = comment.body.split(" ")
                        for word in split:
                            word = word.replace("$", "")
                            # Candidate ticker: all upper-case, length <= 5,
                            # not in the excluded-words blacklist, and present
                            # in the known US-symbols set (from data.py).
                            if word.isupper() and len(word) <= 5 and word not in blacklist and word in us:
                                # Enforce one counted comment per author per
                                # symbol, except for whitelisted authors.
                                if uniqueCmt and auth not in goodAuth:
                                    # dict.get replaces the original's
                                    # try/except-KeyError probing.
                                    if auth in cmt_auth.get(word, ()):
                                        break  # author already counted for this symbol; skip the rest of the comment
                                # Counting tickers.
                                if word in tickers:
                                    tickers[word] += 1
                                    a_comments[word].append(comment.body)
                                    cmt_auth[word].append(auth)
                                    count += 1
                                else:
                                    tickers[word] = 1
                                    cmt_auth[word] = [auth]
                                    a_comments[word] = [comment.body]
                                    count += 1
            except Exception as e:
                # Best-effort per post: log the error and keep scraping.
                print(e)
# Sort tickers by mention count, most-mentioned first.
symbols = dict(sorted(tickers.items(), key=lambda item: item[1], reverse=True))
top_picks = list(symbols.keys())[0:picks]
# Bug fix: the original assigned the elapsed seconds to a variable named
# `time`, shadowing the `time` module for the rest of the script.
elapsed = time.time() - start_time

# Print the top picks.
print("It took {t:.2f} seconds to analyze {c} comments in {p} posts in {s} subreddits.\n".format(t=elapsed, c=c_analyzed, p=posts, s=len(subs)))
print("Posts analyzed saved in titles")
#for i in titles: print(i) # prints the title of the posts analyzed
print(f"\n{picks} most mentioned picks: ")
times = []  # mention counts in rank order (treemap sizes)
top = []    # "TICKER: count" labels (treemap labels)
for i in top_picks:
    print(f"{i}: {symbols[i]}")
    times.append(symbols[i])
    top.append(f"{i}: {symbols[i]}")
# Applying sentiment analysis with VADER.
#   scores: ticker -> polarity dict, summed over comments then averaged
#   s:      ticker -> {comment body: per-comment polarity dict}
scores, s = {}, {}
vader = SentimentIntensityAnalyzer()
# Adding custom words from data.py so domain slang scores sensibly.
vader.lexicon.update(new_words)

picks_sentiment = list(symbols.keys())[0:picks_ayz]
for symbol in picks_sentiment:
    stock_comments = a_comments[symbol]
    for cmnt in stock_comments:
        score = vader.polarity_scores(cmnt)
        # Keep the raw per-comment score. NOTE: comments with identical bodies
        # overwrite each other here, since `s` is keyed by comment text.
        if symbol in s:
            s[symbol][cmnt] = score
        else:
            s[symbol] = {cmnt: score}
        if symbol in scores:
            for key in score:
                scores[symbol][key] += score[key]
        else:
            # Bug fix: the original stored `score` itself, aliasing the same
            # dict held in s[symbol] — subsequent accumulation and string
            # formatting then corrupted that per-comment record. Store an
            # independent copy instead.
            scores[symbol] = score.copy()
    # Calculating the average: divide the summed polarity by the mention
    # count (a_comments and tickers grow in lockstep, so the counts match),
    # then format to 3 decimal places.
    for key in score:
        scores[symbol][key] = scores[symbol][key] / symbols[symbol]
        scores[symbol][key] = "{pol:.3f}".format(pol=scores[symbol][key])
# Printing sentiment analysis as a table, tickers as rows.
print(f"\nSentiment analysis of top {picks_ayz} picks:")
df = pd.DataFrame(scores)
# NOTE(review): these row labels assume the polarity dict key order is
# ('neg', 'neu', 'pos', 'compound') — verify against vader's polarity_scores.
df.index = ['Bearish', 'Neutral', 'Bullish', 'Total/Compound']
df = df.T
print(df)
# Data visualization: treemap of the most mentioned picks
# (sizes/labels built in the top-picks loop above).
squarify.plot(sizes=times, label=top, alpha=.7 )
plt.axis('off')
plt.title(f"{picks} most mentioned picks")
plt.show()
# Sentiment analysis bar chart; scores were formatted as strings earlier,
# so cast back to float before plotting.
df = df.astype(float)
colors = ['red', 'springgreen', 'forestgreen', 'coral']
df.plot(kind = 'bar', color=colors, title=f"Sentiment analysis of top {picks_ayz} picks:")
plt.show()