flow.py
"""
Processes and aggregates network flow data from a CSV file generated by the parse.py script.
This script is intended to be run after parse.py, which produces a CSV file with detailed network traffic data
The script filters network traffic for TCP, TLSv1.2, and UDP, and groups packets into flows based on connection details, calculates flow statistics, and extracts initial hostnames. The aggregated data is then saved to a new CSV file.
Usage:
python flow.py <input_csv_file> <output_csv_file>
Example:
python flow.py output_from_parse.csv aggregated_flows.csv
Note:
The script is platform-independent but requires Python and pandas to be installed. Make sure file paths and permissions are correctly configured according to your operating system's requirements.
"""
import sys

import pandas as pd
import tldextract
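
# Expected input columns (as referenced by the code below): frame.time_epoch,
# _ws.col.Protocol, ip.src, ip.dst, tcp.srcport, tcp.dstport, udp.srcport,
# udp.dstport, frame.len, src_hostname, dst_hostname. These names are taken from
# this script; the exact set written by parse.py may differ.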


def extract_main_domain(hostname):
    """Return the registered domain (main domain plus public suffix) of a hostname."""
    try:
        extracted = tldextract.extract(hostname)  # Splits into subdomain, domain, suffix
        return f"{extracted.domain}.{extracted.suffix}"
    except Exception:
        return None
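
# Illustrative examples (assuming tldextract's public-suffix list):
#   extract_main_domain("dns.google.com")    -> "google.com"
#   extract_main_domain("cdn.example.co.uk") -> "example.co.uk"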


def process_pcap_data(input_csv, output_csv):
    df = pd.read_csv(input_csv)
    df['frame.time_epoch'] = pd.to_datetime(df['frame.time_epoch'], unit='s')  # Epoch seconds -> timestamps

    # Filter out non-TCP/TLSv1.2/UDP protocols for accurate flow information
    df = df[df['_ws.col.Protocol'].isin(['TCP', 'TLSv1.2', 'UDP'])]

    # Columns that identify a flow: endpoints, TCP or UDP ports, and protocol
    flow_key = ['ip.src', 'ip.dst', 'tcp.srcport', 'tcp.dstport',
                'udp.srcport', 'udp.dstport', '_ws.col.Protocol']

    # Sort by flow key and timestamp for accurate inter-arrival times
    df = df.sort_values(by=flow_key + ['frame.time_epoch'])

    # Inter-arrival time between consecutive packets within each flow;
    # dropna=False keeps flows whose unused port columns are empty
    # (TCP packets have no UDP ports and vice versa)
    df['inter_arrival_time'] = df.groupby(flow_key, dropna=False)['frame.time_epoch'].diff().dt.total_seconds()

    # Apply domain extraction to the hostname columns
    df['src_main_domain'] = df['src_hostname'].apply(extract_main_domain)
    df['dst_main_domain'] = df['dst_hostname'].apply(extract_main_domain)

    # Group after the derived columns exist, then aggregate data for each flow
    grouped = df.groupby(flow_key, dropna=False)
    flows = grouped.agg(
        start_ts=('frame.time_epoch', 'min'),
        end_ts=('frame.time_epoch', 'max'),
        byte_count=('frame.len', 'sum'),
        packet_count=('frame.len', 'size'),
        avg_inter_arrival_time=('inter_arrival_time', 'mean'),  # Average inter-arrival time per flow
        src_hostname=('src_hostname', 'first'),
        dst_hostname=('dst_hostname', 'first'),
        src_main_domain=('src_main_domain', 'first'),
        dst_main_domain=('dst_main_domain', 'first')
    )
    flows.reset_index(inplace=True)

    # Specify column order for the output CSV
    columns_order = ['start_ts', 'end_ts', 'ip.src', 'ip.dst', 'tcp.srcport', 'tcp.dstport',
                     'udp.srcport', 'udp.dstport', '_ws.col.Protocol', 'byte_count', 'packet_count',
                     'avg_inter_arrival_time', 'src_hostname', 'dst_hostname',
                     'src_main_domain', 'dst_main_domain']
    flows = flows[columns_order]
    flows.to_csv(output_csv, index=False)
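
# Illustrative call (file names are only examples, taken from the usage string above):
#   process_pcap_data("output_from_parse.csv", "aggregated_flows.csv")
# writes one row per flow with its start/end timestamps, byte and packet counts,
# mean inter-arrival time, first observed hostnames, and their main domains.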


def main():
    if len(sys.argv) != 3:
        print("Usage: python flow.py <input_csv_file> <output_csv_file>")
        sys.exit(1)

    input_csv_file = sys.argv[1]
    output_csv_file = sys.argv[2]
    process_pcap_data(input_csv_file, output_csv_file)


if __name__ == "__main__":
    main()