label_malicious_irp_logs.py
__author__ = "Md. Ahsan Ayub"
__license__ = "GPL"
__credits__ = ["Ayub, Md. Ahsan", "Martindale, Nathan", "Smith, Steven",
"Siraj, Ambareen"]
__maintainer__ = "Md. Ahsan Ayub"
__email__ = "[email protected]"
__status__ = "Prototype"
# Importing the libraries
import glob
import os

import pandas as pd


# Utility function to find the Process IDs and Process Names that are flagged
# (i.e., rows whose doc_files_flag is set)
def find_malicious_logs(dataset):
    # Keep only the flagged rows, then collect their identifiers
    dataset = dataset.drop(dataset[(dataset['doc_files_flag'] != 1)].index)
    process_ids = dataset['process_id'].tolist()
    process_names = dataset['process_name'].tolist()
    return process_ids, process_names
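
# Illustrative sketch of what find_malicious_logs returns, assuming a toy
# aggregated dataset (column names follow the code above; values are made up):
#
#   process_id  process_name  doc_files_flag
#   1337        cryptor.exe   1
#   4242        explorer.exe  0
#
# -> ([1337], ['cryptor.exe'])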


# Utility function to label the records: family_id carries the ransomware
# hash, and class is 1 for logs from the flagged processes, 0 otherwise
def set_malicious_logs_labels(dataset, process_ids, process_names, ransomware_hash):
    dataset['family_id'] = ransomware_hash
    # .copy() keeps the label assignments off a view (avoids SettingWithCopyWarning)
    malicious_copy_dataset = dataset[(dataset['process_id'].isin(process_ids) & dataset['process_name'].isin(process_names))].copy()
    malicious_copy_dataset['class'] = 1
    benign_copy_dataset = dataset[~(dataset['process_id'].isin(process_ids) & dataset['process_name'].isin(process_names))].copy()
    benign_copy_dataset['class'] = 0
    frames = [malicious_copy_dataset, benign_copy_dataset]
    return pd.concat(frames)
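
# Note (assumed toy values): with process_ids = [42] and process_names =
# ['cryptor.exe'], any row whose process_id is in the id list AND whose
# process_name is in the name list gets class = 1. The two isin() checks are
# independent, so a row pairing id 42 with a *different* flagged name would
# also match.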


def generate_time_chunks(dataset, start_time, end_time, interval, ransomware_hash):
    chunk_size = round((end_time - start_time) / interval)
    # Create the output directories: Time_Interval_Dataset/<hash>/<n>_mins
    file_path = "/Time_Interval_Dataset/" + str(ransomware_hash)
    try:
        if not os.path.isdir(os.getcwd() + file_path):
            os.mkdir(os.getcwd() + file_path)
            print(str(os.getcwd() + file_path) + " is created.")
        file_path = file_path + "/" + str(round(interval / 60)) + "_mins"
        if not os.path.isdir(os.getcwd() + file_path):
            os.mkdir(os.getcwd() + file_path)
            print(str(os.getcwd() + file_path) + " is created.")
    except OSError:
        print("Creation of the directory %s failed" % file_path)
        return

    file_name = str(ransomware_hash) + "_" + str(round(interval / 60)) + "_mins"
    str_log = ""
    start_time_index = start_time
    for i in range(chunk_size):
        # The last chunk absorbs any remainder up to (and including) end_time
        if i != chunk_size - 1:
            end_time_index = start_time_index + interval
        else:
            end_time_index = end_time + 1
        temp_dataset_copy = dataset[((dataset.pre_operation_time >= start_time_index) & (dataset.pre_operation_time < end_time_index))]
        # Dump the chunk as a gzip-compressed pickle
        temp_dataset_copy.to_pickle(str(os.getcwd()) + str(file_path) + "/" + str(file_name) + "_" + str(i+1) + ".pkl.gz", compression='gzip')
        str_log = str_log + str(i+1) + "\t" + str(start_time_index) + "\t" + str(end_time_index) + "\t" + str(temp_dataset_copy.shape) + "\n"
        start_time_index = start_time + ((i + 1) * interval)
    print(str_log)
    # Keep a plain-text summary of the chunk boundaries next to the pickles
    with open(str(os.getcwd()) + str(file_path) + "/" + str(file_name) + ".txt", "w") as text_file:
        print(str_log, file=text_file)
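
# Illustrative sketch of the summary file this writes (assumed toy numbers):
# with start_time = 0, end_time = 5400 and interval = 1800, chunk_size is 3
# and the log would read, tab-separated:
#
#   1   0      1800   (rows, cols)
#   2   1800   3600   (rows, cols)
#   3   3600   5401   (rows, cols)
#
# i.e. chunk index, window start, window end, and each chunk's DataFrame shape.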


if __name__ == '__main__':
    pwd = os.getcwd()
    os.chdir('./Dataset/ransomware-irp-logs/')

    # Storing the file names for all the aggregated and processed datasets
    all_filenames_aggregated = sorted([i for i in glob.glob('*_aggregated*')])
    all_filenames_processed = sorted([i for i in glob.glob('*_processed.*')])
    file_name_aggregated = all_filenames_aggregated[0]
    file_name_processed = all_filenames_processed[0]
    ransomware_hash = file_name_aggregated.split('_')[0]

    """all_filenames_labeled = [i for i in glob.glob('*labeled*')]
    all_filenames_labeled = [filename.split('_')[0] for filename in all_filenames_labeled]
    if ransomware_hash in all_filenames_labeled:
        exit(1)"""

    # The datasets may be zip- or gzip-compressed, and the processed file may
    # be a CSV or a pickle; try the combinations in turn
    try:
        aggregated_dataset = pd.read_csv(file_name_aggregated, compression='zip', header=0, sep=',', quotechar='"')
        try:
            processed_dataset = pd.read_csv(file_name_processed, compression='zip', header=0, sep=',', quotechar='"')
        except Exception:
            processed_dataset = pd.read_pickle(file_name_processed, compression='zip')
    except Exception:
        aggregated_dataset = pd.read_csv(file_name_aggregated, compression='gzip', header=0, sep=',', quotechar='"')
        try:
            processed_dataset = pd.read_csv(file_name_processed, compression='gzip', header=0, sep=',', quotechar='"')
        except Exception:
            processed_dataset = pd.read_pickle(file_name_processed, compression='gzip')

    # Drop the stray 'Unnamed: 0' index column carried over in the CSV
    aggregated_dataset = aggregated_dataset.drop(['Unnamed: 0'], axis=1)

    process_ids, process_names = find_malicious_logs(aggregated_dataset)
    labeled_processed_data = set_malicious_logs_labels(processed_dataset, process_ids, process_names, ransomware_hash)
    del aggregated_dataset, processed_dataset, process_ids, process_names

    # Dump the processed and labeled dataset
    labeled_processed_data.to_pickle(str(ransomware_hash) + "_processed_labeled.pkl.gz", compression='gzip')
    print("Labeled dataset generated")

    # Partition the data into time chunks; pre_operation_time values are in
    # seconds, and a trace spans roughly an hour and a half
    start_time = labeled_processed_data.pre_operation_time.min()
    end_time = labeled_processed_data.pre_operation_time.max()
    generate_time_chunks(labeled_processed_data, start_time, end_time, (5 * 60), ransomware_hash)    # 5 mins
    generate_time_chunks(labeled_processed_data, start_time, end_time, (10 * 60), ransomware_hash)   # 10 mins
    generate_time_chunks(labeled_processed_data, start_time, end_time, (20 * 60), ransomware_hash)   # 20 mins

    del labeled_processed_data
    os.chdir(pwd)
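
# A minimal sketch of consuming one emitted chunk (hypothetical hash and
# chunk index; adjust the path to whatever generate_time_chunks printed):
#
#   import pandas as pd
#   chunk = pd.read_pickle(
#       "Time_Interval_Dataset/<ransomware_hash>/5_mins/<ransomware_hash>_5_mins_1.pkl.gz",
#       compression='gzip')
#   print(chunk['class'].value_counts())   # malicious (1) vs. benign (0) rows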