-
Notifications
You must be signed in to change notification settings - Fork 69
/
Copy path05_1_machine_learning_implementation_for_attack_files .py
188 lines (140 loc) · 8.6 KB
/
05_1_machine_learning_implementation_for_attack_files .py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
## The "attacks" folder, holding one CSV file per attack type, is required for the operation of the program.
## NOTE(review): this header previously named "all_data.csv", but the code below only reads files from the "attacks" folder - confirm.
## The "attacks" folder must be located in the same directory as the program.
## The purpose of this program is to apply machine learning algorithms to the dataset and observe the performance of the algorithms.
## The algorithms used are: Naive Bayes, QDA, Random Forest, ID3, AdaBoost, MLP, Nearest Neighbors.
## The output displayed by the program includes: file name, machine learning algorithm name, accuracy, precision, recall, F1-score and time.
## The program also creates a CSV file containing the results and a folder containing the graphs.
## Some parts of the code used for calculation and graphing are taken from the following site:
## http://scikit-learn.org
from sklearn import metrics
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import average_precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
import matplotlib.pyplot as plt
import numpy as np
#%matplotlib inline
import os
import pandas as pd
import csv
import time
import warnings
import math
# NOTE(review): this silences every warning raised during the run, which also
# hides library warnings that may point at real problems; kept as in the
# original script.
warnings.filterwarnings("ignore")

# CSV file in which the score of every single repetition is saved.
result = "./results/results_1.csv"

# Names of the files in the "attacks" folder. The listing is sorted because
# os.listdir() returns entries in arbitrary order, and it is limited to CSV
# files so that stray entries in the folder are not treated as attack files.
csv_files = sorted(
    name for name in os.listdir("attacks") if name.lower().endswith(".csv")
)

# Prefix that is put in front of a file name to read it. Built with
# os.path.join so the separator matches the operating system instead of being
# hard-coded to the Windows backslash; the trailing "" keeps the final
# separator so that path + file_name still works.
path = os.path.join(".", "attacks", "")

# Number of times each algorithm is trained and evaluated per attack file.
repetition = 10
def folder(f_name):
    """Create the directory *f_name*, including missing parents, if needed.

    A failure is reported on standard output instead of being raised, so the
    caller continues even when the directory could not be created.

    Args:
        f_name: Path of the directory to create.

    Returns:
        None.
    """
    try:
        # exist_ok=True replaces the separate os.path.exists() check, which
        # left a gap between checking and creating. It also means a path that
        # is blocked by a regular file is reported instead of being skipped.
        os.makedirs(f_name, exist_ok=True)
    except OSError:
        print("The folder could not be created!")
# Create the output folders one after the other. The loop variable keeps its
# last value, so folder_name still refers to "./results/result_graph_1/" once
# the loop has finished.
for folder_name in ("./results/", "./results/result_graph_1/"):
    folder(folder_name)
# Machine learning algorithms to evaluate, keyed by the name under which they
# are reported. The insertion order of the dictionary is the order in which
# the algorithms are run.
ml_list = {
    "Naive Bayes": GaussianNB(),
    "QDA": QDA(),
    "Random Forest": RandomForestClassifier(
        n_estimators=10,
        max_depth=5,
        max_features=1,
    ),
    "ID3": DecisionTreeClassifier(criterion="entropy", max_depth=5),
    "AdaBoost": AdaBoostClassifier(),
    "MLP": MLPClassifier(max_iter=500, hidden_layer_sizes=(13, 13, 13)),
    "Nearest Neighbors": KNeighborsClassifier(n_neighbors=3),
}
# Columns to read for each attack type, keyed by attack name.
# The four feature names per attack are the first 4 of the features created by
# the file "04_1_feature_selection_for_attack_files.py"; the "Label" column is
# the same for every attack, so it is appended once here instead of being
# repeated in every entry.
features = {
    attack: selected + ["Label"]
    for attack, selected in {
        "Bot": [
            "Bwd Packet Length Mean", "Flow IAT Max",
            "Flow Duration", "Flow IAT Min",
        ],
        "DDoS": [
            "Bwd Packet Length Std", "Total Backward Packets",
            "Fwd IAT Total", "Flow Duration",
        ],
        "DoS GoldenEye": [
            "Flow IAT Max", "Bwd Packet Length Std",
            "Flow IAT Min", "Total Backward Packets",
        ],
        "DoS Hulk": [
            "Bwd Packet Length Std", "Fwd Packet Length Std",
            "Fwd Packet Length Max", "Flow IAT Min",
        ],
        "DoS Slowhttptest": [
            "Flow IAT Mean", "Fwd Packet Length Min",
            "Bwd Packet Length Mean", "Total Length of Bwd Packets",
        ],
        "DoS slowloris": [
            "Flow IAT Mean", "Total Length of Bwd Packets",
            "Bwd Packet Length Mean", "Total Fwd Packets",
        ],
        "FTP-Patator": [
            "Fwd Packet Length Max", "Fwd Packet Length Std",
            "Fwd Packet Length Mean", "Bwd Packet Length Std",
        ],
        "Heartbleed": [
            "Total Backward Packets", "Fwd Packet Length Max",
            "Flow IAT Min", "Bwd Packet Length Max",
        ],
        "Infiltration": [
            "Fwd Packet Length Max", "Fwd Packet Length Mean",
            "Flow Duration", "Total Length of Fwd Packets",
        ],
        "PortScan": [
            "Flow Bytes/s", "Total Length of Fwd Packets",
            "Fwd IAT Total", "Flow Duration",
        ],
        "SSH-Patator": [
            "Fwd Packet Length Max", "Flow Duration",
            "Flow IAT Max", "Total Length of Fwd Packets",
        ],
        "Web Attack": [
            "Bwd Packet Length Std", "Total Length of Fwd Packets",
            "Flow Bytes/s", "Flow IAT Max",
        ],
    }.items()
}
# Main experiment: for every attack file, train and evaluate every algorithm
# of ml_list `repetition` times, print the mean scores, append the score of
# each repetition to the results CSV and save a box plot of the F1 scores.
seconds = time.time()  # time stamp for the total processing time

# Start a fresh results file that contains only the header row; the score
# rows are appended further down, once per algorithm.
with open(result, "w", newline="", encoding="utf-8") as f:
    wrt = csv.writer(f)
    wrt.writerow(["File", "ML algorithm", "accuracy", "Precision", "Recall", "F1-score", "Time"])

for j in csv_files:  # operations are repeated for all attack files
    # splitext instead of slicing off the last four characters, so the name
    # is right whatever the length of the extension.
    attack_name = os.path.splitext(j)[0]
    if attack_name not in features:
        # Without a feature selection the file cannot be read; report it and
        # carry on instead of stopping the whole run with a KeyError.
        print("No features are defined for '%s', the file is skipped." % j)
        continue

    print('%-17s %-17s %-15s %-15s %-15s %-15s %-15s' % ("File", "ML algorithm", "accuracy", "Precision", "Recall", "F1-score", "Time"))  # print output header
    a = []  # one list of F1 scores per algorithm, used for the box plots

    feature_list = list(features[attack_name])  # copy: "Label" is removed below
    df = pd.read_csv(os.path.join(path, j), usecols=feature_list)  # read an attack file
    df = df.fillna(0)

    # Label = y, data = X. The "BENIGN" label becomes 1, every other label
    # (the attack) becomes 0.
    y = (df["Label"] == "BENIGN").astype(int)
    feature_list.remove("Label")
    X = df[feature_list]

    for ii, clf in ml_list.items():  # operations are repeated for all algorithms
        precision = []
        recall = []
        f1 = []
        accuracy = []
        t_time = []
        for i in range(repetition):
            second = time.time()  # time stamp for the processing time of this repetition

            # 80% train / 20% test split. The repetition index is the seed, so
            # every repetition gets a different split that is still
            # reproducible between runs. The previous constant seed gave all
            # repetitions the very same split.
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.20, random_state=i)

            clf.fit(X_train, y_train)
            predict = clf.predict(X_test)

            f1.append(float(f1_score(y_test, predict, average='macro')))
            precision.append(float(precision_score(y_test, predict, average='macro')))
            recall.append(float(recall_score(y_test, predict, average='macro')))
            # Accuracy is computed from the predictions already made; calling
            # clf.score() would predict the whole test set a second time
            # inside the timed section.
            accuracy.append(float(metrics.accuracy_score(y_test, predict)))
            t_time.append(float(time.time() - second))

        # the mean of the repetitions is printed on the screen
        print('%-17s %-17s %-15s %-15s %-15s %-15s %-15s' % (
            attack_name, ii,
            str(round(np.mean(accuracy), 2)), str(round(np.mean(precision), 2)),
            str(round(np.mean(recall), 2)), str(round(np.mean(f1), 2)),
            str(round(np.mean(t_time), 4))))

        # one row per repetition: file name, algorithm name, accuracy,
        # precision, recall, F1 score and time
        with open(result, "a", newline="", encoding="utf-8") as f:
            wrt = csv.writer(f)
            for scores in zip(accuracy, precision, recall, f1, t_time):
                wrt.writerow([attack_name, ii] + list(scores))
        a.append(f1)

    # Box plots of the F1 scores, one subplot per algorithm on a 2x4 grid. The
    # titles come from ml_list itself, so they cannot get out of step with
    # the algorithms that were actually run. Algorithms beyond the eight
    # available subplots would not be drawn.
    fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(12, 6), sharey=True)
    for ax, ml_name, scores in zip(axes.flat, ml_list, a):
        ax.boxplot(scores)
        ax.set_title(str(attack_name) + " - " + str(ml_name), fontsize=7)
        ax.set_ylabel("F measure")

    # "papertype" is no longer passed: current Matplotlib releases reject it.
    plt.savefig(folder_name + attack_name + ".pdf", bbox_inches='tight', orientation='portrait', format='pdf')
    plt.show()
    plt.close(fig)  # release the figure so that figures do not pile up over the files
    print("\n------------------------------------------------------------------------------------------------------\n\n")

print("mission accomplished!")
print("Total operation time: = ", time.time() - seconds, "seconds")