import re
import os.path
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
################################################################
#
# TwitterUser class for storing profile information about a user
# and the user's tweets.
#
################################################################
class TwitterUser:
    def __init__(self, idd, created, collected, followings,
                 followers, tweets_num, length_name, length_description):
        self._user_id = idd
        self._created_at = created
        self._collected_at = collected
        self._numb_followings = followings
        self._numb_followers = followers
        self._numb_tweets = tweets_num
        self._name_length = length_name
        self._description_length = length_description
        self._tweets = []
        self._tfidf = []
        self._ratio_follower_following = 0.0
        self._count_http = 0
        self._count_at = 0

    @property
    def user_id(self):
        return self._user_id

    @user_id.setter
    def user_id(self, value):
        self._user_id = value

    @property
    def created_at(self):
        return self._created_at

    @created_at.setter
    def created_at(self, value):
        self._created_at = value

    @property
    def collected_at(self):
        return self._collected_at

    @collected_at.setter
    def collected_at(self, value):
        self._collected_at = value

    @property
    def numb_followings(self):
        return self._numb_followings

    @numb_followings.setter
    def numb_followings(self, value):
        self._numb_followings = value

    @property
    def numb_followers(self):
        return self._numb_followers

    @numb_followers.setter
    def numb_followers(self, value):
        self._numb_followers = value

    @property
    def numb_tweets(self):
        return self._numb_tweets

    @numb_tweets.setter
    def numb_tweets(self, value):
        self._numb_tweets = value

    @property
    def name_length(self):
        return self._name_length

    @name_length.setter
    def name_length(self, value):
        self._name_length = value

    @property
    def description_length(self):
        return self._description_length

    @description_length.setter
    def description_length(self, value):
        self._description_length = value

    @property
    def tweets(self):
        return self._tweets

    @tweets.setter
    def tweets(self, value):
        self._tweets = value

    @property
    def ratio_follower_following(self):
        return self._ratio_follower_following

    @ratio_follower_following.setter
    def ratio_follower_following(self, value):
        self._ratio_follower_following = value

    @property
    def count_http(self):
        return self._count_http

    @count_http.setter
    def count_http(self, value):
        self._count_http = value

    @property
    def count_at(self):
        return self._count_at

    @count_at.setter
    def count_at(self, value):
        self._count_at = value

    @property
    def tfidf(self):
        return self._tfidf

    @tfidf.setter
    def tfidf(self, value):
        self._tfidf = value
#################################################################
#
# Imports user and tweet data from tab-separated text files,
# stores it in TwitterUser objects, and returns a list of
# TwitterUser objects.
#
#################################################################
def import_user_data(user_file, tweet_file):
    twitter_users = []
    with open(user_file) as f1:
        for user_info in f1.readlines():
            # Split each profile line once instead of re-splitting per field.
            fields = re.split(r'\t+', user_info)
            twitter_user_ob = TwitterUser(fields[0], fields[1], fields[2],
                                          float(fields[3]), float(fields[4]),
                                          float(fields[5]), float(fields[6]),
                                          float(fields[7].rstrip()))
            twitter_users.append(twitter_user_ob)
    with open(tweet_file, encoding='utf-8') as f2:
        for line_tweets_info in f2.readlines():
            fields = re.split(r'\t+', line_tweets_info)
            for twitter_user in twitter_users:
                if twitter_user.user_id == fields[0]:
                    twitter_user.tweets.append(fields[2])
    return twitter_users
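
# The parsing above implies a tab-separated layout (inferred from the field
# indices used, not from a documented spec):
#   user file:  user_id  created_at  collected_at  followings  followers
#               tweets  name_length  description_length
#   tweet file: user_id  tweet_id  tweet_text  ...
#
# A minimal usage sketch (file names are placeholders):
#   users = import_user_data("Training_data/spammers.txt",
#                            "Training_data/spammers_tweets.txt")
#   print(users[0].user_id, len(users[0].tweets))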
#################################################################
#
# Calculates various features derived from the existing fields of
# each TwitterUser object and adds the calculated features back
# to the object.
#
#################################################################
def calculate_features(twitter_users):
    for user in twitter_users:
        try:
            # Mean pairwise cosine similarity between a user's tweets:
            # near-duplicate tweets (typical of spammers) push this toward 1.
            tfidf = TfidfVectorizer(min_df=1).fit_transform(user.tweets)
            pairwise_similarity = tfidf * tfidf.T
            user.tfidf = csr_matrix.mean(pairwise_similarity).item()
        except Exception:
            # Users with no tweets (or an empty vocabulary) get a neutral 0.0.
            user.tfidf = 0.0
        if user.numb_followings > 0:
            user.ratio_follower_following = user.numb_followers / user.numb_followings
        else:
            user.ratio_follower_following = 0
        at_count = 0
        http_count = 0
        for tweet in user.tweets:
            at_count += tweet.count("@")
            http_count += tweet.count("http")
        user.count_at = at_count
        user.count_http = http_count
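
# A minimal sketch (illustration only, never called by the pipeline) of why
# the mean pairwise similarity separates repetitive accounts from varied
# ones. The example tweets below are made up.
def _demo_tfidf_similarity():
    repetitive = ["win a free prize now", "win a free prize now",
                  "win a free prize now"]
    varied = ["off to the gym", "great game last night", "reading a new book"]
    for tweets in (repetitive, varied):
        tfidf = TfidfVectorizer(min_df=1).fit_transform(tweets)
        score = csr_matrix.mean(tfidf * tfidf.T).item()
        print(score)  # 1.0 for the repetitive set, much lower for the varied set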
####################################################################
#
# Converts the features into a numpy array / matrix and normalizes
# each column by its maximum.
#
####################################################################
def build_feature_matrix(twitter_users):
    features_matrix = []
    for user in twitter_users:
        features_matrix.append([user.name_length, user.description_length,
                                user.count_http, user.count_at,
                                user.ratio_follower_following, user.tfidf])
    features_matrix_np = np.array(features_matrix)
    features_matrix_normalized = features_matrix_np / features_matrix_np.max(axis=0)
    return features_matrix_normalized
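
# Max normalization scales each column into [0, 1] by its own maximum, e.g.:
#   np.array([[2., 10.], [4., 5.]]) / np.array([[2., 10.], [4., 5.]]).max(axis=0)
#   -> [[0.5, 1.0], [1.0, 0.5]]
# Caveat (an observation, not something the original guards against): if an
# entire column is 0, max(axis=0) is 0 there and the division yields NaN; a
# denominator like np.where(col_max == 0, 1, col_max) would avoid that.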
####################################################################
#
# Importing files, creating training & testing features and labels
#
####################################################################
predictUsers = []

def appendToUsers(predictThis):
    for i in predictThis:
        predictUsers.append(i.user_id)

def reformat_new_lines(filename):
    # One-off repair for cached pickle files whose bytes were damaged by a
    # text-mode round trip: strips '\r' and writes a copy with a '_new' suffix.
    print(filename)
    with open(filename, "rt", encoding='utf-8') as fh:
        content = fh.read().replace('\r', '')
    print(content[:3])
    with open(filename + '_new', "wt", encoding='utf-8') as fh:
        fh.write(content)
    print('OK!')
DATA_FILES = ["training_labels.dat", "training_features_matrix.dat",
              "testing_labels.dat", "testing_features_matrix.dat"]
if all(os.path.isfile(f) for f in DATA_FILES):
    # If the cached pickles were damaged by a text-mode round trip, repair
    # them first and point the loads below at the '_new' copies:
    # for filename in DATA_FILES:
    #     reformat_new_lines(filename)
    with open("training_labels.dat", "rb") as infile:
        training_labels = pickle.load(infile, encoding='bytes')
    with open("training_features_matrix.dat", "rb") as infile:
        training_features = pickle.load(infile, encoding='bytes')
    with open("testing_labels.dat", "rb") as infile:
        testing_labels = pickle.load(infile, encoding='bytes')
    with open("testing_features_matrix.dat", "rb") as infile:
        testing_features = pickle.load(infile, encoding='bytes')
    testing_spammers = import_user_data("Testing_data/spammers1.txt", "Testing_data/spammers_tweets.txt")
    testing_legit = import_user_data("Testing_data/legitimate_users1.txt", "Testing_data/legitimate_users_tweets.txt")
    # Append in the same order the cached feature matrix was built:
    # spammers first, then legitimate users.
    appendToUsers(testing_spammers)
    appendToUsers(testing_legit)
else:
    training_spammers = import_user_data("Training_data/spammers.txt", "Training_data/spammers_tweets.txt")
    calculate_features(training_spammers)
    training_spammers_feature_matrix = build_feature_matrix(training_spammers)
    training_legit = import_user_data("Training_data/legitimate_users.txt", "Training_data/legitimate_users_tweets.txt")
    calculate_features(training_legit)
    training_legit_feature_matrix = build_feature_matrix(training_legit)
    training_labels = [0] * len(training_spammers_feature_matrix) + [1] * len(training_legit_feature_matrix)  # 0 = spammer, 1 = legitimate
    with open('training_labels.dat', 'wb') as outfile:  # pickle requires binary mode
        pickle.dump(training_labels, outfile)
    training_features = np.concatenate((training_spammers_feature_matrix, training_legit_feature_matrix), axis=0)
    with open('training_features_matrix.dat', 'wb') as outfile:
        pickle.dump(training_features, outfile)
    testing_spammers = import_user_data("Testing_data/spammers1.txt", "Testing_data/spammers_tweets.txt")
    calculate_features(testing_spammers)
    testing_spammers_feature_matrix = build_feature_matrix(testing_spammers)
    appendToUsers(testing_spammers)
    testing_legit = import_user_data("Testing_data/legitimate_users1.txt", "Testing_data/legitimate_users_tweets.txt")
    calculate_features(testing_legit)
    testing_legit_feature_matrix = build_feature_matrix(testing_legit)
    appendToUsers(testing_legit)
    testing_labels = [0] * len(testing_spammers_feature_matrix) + [1] * len(testing_legit_feature_matrix)
    with open('testing_labels.dat', 'wb') as outfile:
        pickle.dump(testing_labels, outfile)
    testing_features = np.concatenate((testing_spammers_feature_matrix, testing_legit_feature_matrix), axis=0)
    with open('testing_features_matrix.dat', 'wb') as outfile:
        pickle.dump(testing_features, outfile)
####################################################################
#
# Configuring plot appearance and labels
#
####################################################################
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.BuPu):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(2)
    plt.xticks(tick_marks, ["Spammer", "Non-Spammer"], rotation=45)
    plt.yticks(tick_marks, ["Spammer", "Non-Spammer"])
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
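
# Usage sketch with a made-up 2x2 matrix (rows = true class, cols = predicted):
#   plot_confusion_matrix(np.array([[50, 3], [4, 60]]), "Example matrix")
#   plt.show()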
####################################################################
#
# Building, testing and evaluating various Machine Learning models.
#
####################################################################
def select_classifier(algo, label):
    model = algo
    # print(training_features)  # debug output
    # print(training_labels)    # debug output
    model.fit(training_features, np.array(training_labels))
    expected = testing_labels
    predicted = model.predict(testing_features)
    for i, predict in enumerate(predicted):
        if predict == 0:
            print(predictUsers[i], "is a spammer")
        else:
            print(predictUsers[i], "is not a spammer")
    print("----------------------------------------------------")
    print("|              Classification Report               |")
    print("----------------------------------------------------")
    print(metrics.classification_report(expected, predicted))
    print("")
    print("----------------------------------------------------")
    print("|                 Confusion Matrix                 |")
    print("----------------------------------------------------")
    cm = metrics.confusion_matrix(expected, predicted)
    print(cm)
    print("")
    cm_list = cm.tolist()
    list_total = float(sum(sum(x) for x in cm_list))
    print("----------------------------------------------------")
    print("|          False Positives and Negatives           |")
    print("----------------------------------------------------")
    # Reported as fractions of all test samples, with "spammer" (label 0)
    # treated as the positive class: cm_list[1][0] counts legitimate users
    # predicted as spammers, cm_list[0][1] counts spammers predicted as
    # legitimate.
    print("False Positive: ", cm_list[1][0] / list_total)
    print("")
    print("False Negative: ", cm_list[0][1] / list_total)
    print("")
    plt.figure()
    plot_confusion_matrix(cm, label)
    plt.show()
# Naive Bayes
select_classifier(GaussianNB(), "Naive Bayes Classifier")
# SVM
#select_classifier(svm.SVC(), "SVM Classifier")
# AdaBoost
#select_classifier(AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=90), "AdaBoost Classifier")
# Random Forest
#select_classifier(RandomForestClassifier(n_estimators=150), "Random Forest Classifier")