import contextlib
import html
import io
import json
import re
import time
from datetime import timedelta
from os.path import dirname

from dateutil.parser import parse
from drain3 import TemplateMiner
from drain3.file_persistence import FilePersistence
from drain3.template_miner_config import TemplateMinerConfig
from rich import print
from rich.console import Console
from rich.style import Style
from rich.table import Table
from rich.tree import Tree
from termcolor import colored

from logger_config import logger

class LogParser:
    def __init__(self):
        # Compiled regular expressions for the supported log levels
        self.log_type_patterns = {
            'ERROR': re.compile(r'\bERROR\b'),
            'WARN': re.compile(r'\bWARN\b'),
            'DEBUG': re.compile(r'\bDEBUG\b'),
            'INFO': re.compile(r'\bINFO\b'),
        }
        config = TemplateMinerConfig()
        # TemplateMinerConfig.load() mutates the config in place and returns None,
        # so keep the config object itself rather than the return value
        config.load(f"{dirname(__file__)}/drain3.ini")
        self.drain3_config = config
        self.persistence = FilePersistence("./models/drain3_state.bin")
        self.template_miner = TemplateMiner(self.persistence, self.drain3_config)
        self.batch_size = 100000  # process and parse logs in batches of this many lines
        # Pattern for collapsing runs of whitespace characters
        self.whitespace_pattern = re.compile(r'\t+|\r+|\n+|\r\n+|\s\s+|\r\n|\t')
        # Common timestamp patterns in various log files, expected at the start of the line
        self.timestamp_patterns = [
            r'^[\[\(]?\d{2}.\d{2}.\d{2} \d{2}:\d{2}:\d{2}.\d{1,9}[\]\)]?',  # Dotted short date with fractional seconds (e.g. 24.05.25 11:47:32.910019467)
            r'^[\[\(]?\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}[\]\)]?',  # Format with hyphens
            r'^[\[\(]?\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}[\]\)]?',  # Format with slashes
            r'^[\[\(]?\d{2}/\d{2}/\d{4}:\d{2}:\d{2}:\d{2}[\]\)]?',  # Format with colon between date and time
            r'^[\[\(]?\w{3} \d{2} \d{2}:\d{2}:\d{2}[\]\)]?',  # Three-letter month abbreviation
            r'^[\[\(]?\w{3} \d{1,2}, \d{4} \d{1,2}:\d{1,2}:\d{1,2}[\]\)]?',  # Format with comma
            r'^[\[\(]?\d{4}\.\d{2}\.\d{2} \d{2}:\d{2}:\d{2},\d{2}[\]\)]?',  # Format with dots and comma
            r'^[\[\(]?\d{2}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}[\]\)]?',  # Short date with slashes
            r'^[\[\(]?\d{2}-\d{2}-\d{4} \d{2}:\d{2}:\d{2}[\]\)]?',  # Day-first date with hyphens
            r'^[\[\(]?\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}:\d{2}[\]\)]?',  # Day-first date with dots
            r'^[\[\(]?\d{4}\d{2}\d{2}T\d{2}:\d{2}:\d{2}[\]\)]?',  # ISO 8601 without separators
            r'^[\[\(]?\d{8} \d{2}:\d{2}:\d{2}[\]\)]?',  # Date without separators
            r'^[\[\(]?[A-Z]\w{2,8} \d{1,2} \d{4} \d{2}:\d{2}:\d{2}[\]\)]?',  # Full month name
            r'^[\[\(]?\d{2} \w{3} \d{4} \d{2}:\d{2}:\d{2}[\]\)]?',  # Two-digit day with three-letter month
            r'^[\[\(]?\d{2}:\d{2}:\d{2} \d{2}/\d{2}/\d{4}[\]\)]?',  # Time first, date with slashes
            r'^[\[\(]?[A-Za-z]{3,4} \d{2} \d{2}:\d{2}:\d{2} \d{4}[\]\)]?',  # Syslog format
            r'^[\[\(]?[A-Za-z]{3,4}, \d{2} \w{3} \d{4} \d{2}:\d{2}:\d{2}[\]\)]?',  # HTTP log format (RFC 1123)
            r'^[\[\(]?\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z?[\]\)]?',  # ISO 8601 with dashes
            r'^[\[\(]?\d{2}\w{3}\d{2} \d{2}:\d{2}:\d{2}[\]\)]?',  # Apache log with month abbreviation
            r'^[\[\(]?\d{2}/\d{2}/\d{4}:\d{2}:\d{2}:\d{2}[\]\)]?',  # Apache Combined Log Format
            r'^[\[\(]?\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{2,3}[\]\)]?',  # Java Util Logging
            r'^[\[\(]?[A-Za-z]{3,4} \d{2}, \d{4} \d{2}:\d{2}:\d{2} \w{3}[\]\)]?',  # Syslog with timezone
            r'^[\[\(]?\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3,6}[\]\)]?',  # ISO 8601 with microseconds
            r'^[\[\(]?\d{4}/\d{2}/\d{2}-\d{2}:\d{2}:\d{2}[\]\)]?',  # Date with slashes and hyphen
            r'^[\[\(]?\d{4}\.\d{2}\.\d{2} \d{2}:\d{2}:\d{2}[\]\)]?',  # Date with dots
            r'^[\[\(]?[A-Za-z]{3,4} \d{2} \w{3} \d{4} \d{2}:\d{2}:\d{2}\.\d{3} \w{3}[\]\)]?',  # Full syslog with milliseconds
            r'^[\[\(]?W\d{2}\w{3}\d{2} \d{2}:\d{2}:\d{2} \d{4}[\]\)]?',  # Week number with abbreviation
            r'^[\[\(]?[A-Za-z]{3,4} \w{3} \d{2} \d{2}:\d{2}:\d{2}\.\d{6} \d{4}[\]\)]?',  # Full syslog with microseconds
            r'^[\[\(]?[A-Za-z]{3,4} \d{2} \w{3} \d{4} \d{2}:\d{2}:\d{2} \w{3}\.\d{3,6}[\]\)]?',  # Timezone with microseconds
            r'^[\[\(]?[A-Za-z]{3,4}, \d{2} \w{3} \d{4} \d{2}:\d{2}:\d{2} \w{3}\.\d{3,6}[\]\)]?',  # HTTP log format with milliseconds
            r'^[\[\(]?\d{4}\d{2}\d{2} \d{2}:\d{2}:\d{2}\.\d{3}[\]\)]?',  # Date without separators, with milliseconds
            r'^[\[\(]?[A-Za-z]{3,4} \w{3} \d{2} \d{2}:\d{2}:\d{2}\.\d{3} \d{4} \w{3}[\]\)]?',  # Full syslog with timezone and milliseconds
        ]
        # Compile the combined pattern once; extract_timestamp() reuses it
        self.timestamp_pattern = re.compile('|'.join(self.timestamp_patterns))
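
    # Illustration (hypothetical line, not from a real log): a line such as
    # "2024-05-25 11:47:32 Server started" matches the hyphenated pattern above,
    # and dateutil's parse("2024-05-25 11:47:32") converts the matched prefix
    # into a datetime object.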
    def sanitize_text(self, text):
        # Strip square brackets so Rich does not interpret them as markup
        sanitized_text = text.replace('[', '').replace(']', '')
        return sanitized_text

    def display_tree_from_print_tree(self, print_tree_output):
        # Create a Tree instance
        tree = Tree("Root")
        # Parse the captured print_tree output and populate the tree;
        # each tab of indentation means one level deeper in the hierarchy
        lines = print_tree_output.strip().split("\n")
        current_node = tree
        level_stack = []
        for line in lines:
            level = line.count("\t") + 1
            text = self.sanitize_text(line.lstrip("\t"))  # Sanitize the text
            if level > len(level_stack):
                level_stack.append(current_node)
            while level < len(level_stack):
                level_stack.pop()
            new_node = level_stack[-1].add(text)
            current_node = new_node
        # Print the tree using Rich console
        console = Console()
        console.print(tree)

    def escape_tree_text(self, text):
        # Replace angle brackets with HTML entities
        return text.replace("<", "&lt;").replace(">", "&gt;")

    def escape_markup(self, text):
        # Escape '&' first so the entities inserted below are not re-escaped
        text = text.replace('&', '&amp;')
        text = text.replace('<', '&lt;')
        text = text.replace('>', '&gt;')
        # Backslash-escape square brackets so Rich does not treat them as markup
        text = text.replace('[', r'\[')
        text = text.replace(']', r'\]')
        return text

    def display_table_from_clusters(self, clusters):
        table = Table(title="Clusters")
        table.add_column("Cluster ID", justify="right")
        table.add_column("Size", justify="right")
        table.add_column("Description")
        cyan_style = Style(color="cyan")
        white_style = Style(color="white")
        alternating_style = [cyan_style, white_style]
        for index, cluster in enumerate(clusters):
            cluster_id = str(cluster.cluster_id)
            size = str(cluster.size)
            # escape_markup handles angle brackets and Rich markup characters
            description = self.escape_markup(str(cluster.log_template_tokens))
            style = alternating_style[index % len(alternating_style)]
            table.add_row(cluster_id, size, description, style=style)
        console = Console()
        console.print(table)
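
    # Minimal usage sketch for the table view (hypothetical file name; assumes
    # parse_log_lines has already populated the drain3 clusters):
    #
    #   parser = LogParser()
    #   parser.parse_log_lines("app.log", open("app.log").readlines())
    #   clusters = sorted(parser.template_miner.drain.clusters,
    #                     key=lambda it: it.size, reverse=True)
    #   parser.display_table_from_clusters(clusters)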
    def parse_log_lines(self, filepath, lines):
        # Performance stats
        line_count = 0
        start_time = time.time()
        batch_start_time = start_time
        batch_size = self.batch_size
        structured_logs = []
        logger.info(f"[{colored(filepath, 'yellow')}] --> Line count before condensing & deduplicating: {len(lines)}")
        condensed_lines = self.condense_lines(filepath, lines)
        logger.info(f"[{colored(filepath, 'yellow')}] --> Line count after condensing & deduplicating: {len(condensed_lines)}")
        for line in condensed_lines:
            line = line.strip()
            result = self.template_miner.add_log_message(line)
            params = self.template_miner.extract_parameters(result['template_mined'], line)
            original_line_content = line
            line_count += 1
            # Log throughput once per batch
            if line_count % batch_size == 0:
                time_took = time.time() - batch_start_time
                rate = batch_size / time_took
                logger.info(f"[{colored(filepath, 'yellow')}] --> Processing line: {line_count}, rate {rate:.1f} lines/sec, "
                            f"{len(self.template_miner.drain.clusters)} clusters so far.")
                batch_start_time = time.time()
            if result["change_type"] != "none":
                result_json = json.dumps(result)
                # print(f"Input ({line_count}): {line}")
                # print(f"Result: {result_json}")
            if result["template_mined"] != "none":
                # print(f"Parameters being added: {str(params)}")
                cluster = self.template_miner.match(line)
                if cluster is None:
                    logger.info(f"[{colored(filepath, 'yellow')}] --> No cluster match found for line: {line}")
                else:
                    template = cluster.get_template()
                    parameters = self.template_miner.get_parameter_list(template, line)
                    structured_logs.append({
                        'template': template,
                        'parameters': parameters,
                        'content': original_line_content,
                    })
        time_took = time.time() - start_time
        rate = line_count / time_took
        logger.info(f"[{colored(filepath, 'yellow')}] --> Done mining file in {time_took:.2f} sec. Total of {line_count} lines, rate {rate:.1f} lines/sec, "
                    f"{len(self.template_miner.drain.clusters)} clusters")
        sorted_clusters = sorted(self.template_miner.drain.clusters, key=lambda it: it.size, reverse=True)
        print("\n\n--------------------------------------------------")
        print(f"[{filepath}] --> Clusters:")
        print("--------------------------------------------------\n")
        self.display_table_from_clusters(sorted_clusters)
        print("\n\n--------------------------------------------------")
        print(f"[{filepath}] --> Prefix Tree:")
        print("--------------------------------------------------")
        # Capture the print_tree output so it can be re-rendered with Rich
        captured_output = io.StringIO()
        with contextlib.redirect_stdout(captured_output):
            self.template_miner.drain.print_tree()
        self.display_tree_from_print_tree(captured_output.getvalue())
        print("\n\n")
        # self.template_miner.profiler.report(0)
        return structured_logs
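
    # Shape of each returned entry (illustrative values, not from a real run;
    # drain3 renders template placeholders as <*>):
    #
    #   {
    #       'template': 'Connection from <*> closed',
    #       'parameters': ['10.0.0.1'],
    #       'content': 'Connection from 10.0.0.1 closed',
    #   }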
    def condense_lines(self, filepath, lines):
        output = []
        in_non_timestamp_block = False
        current_log_entry = None
        # Compile the regular expressions
        carriage_return_pattern = re.compile(r'\r\n+')
        whitespace_pattern1 = re.compile(r'\>\s*\n?\s*\<')
        whitespace_pattern2 = re.compile(r'\>\s\s+\<')
        newline_pattern = re.compile(r'\>\n\<')
        # Keep a count of skipped empty lines
        skipped_empty_line_count = sum(1 for line in lines if not line.strip())
        # Collapse runs of carriage returns
        condensed_lines = [carriage_return_pattern.sub('\r\n', line.strip()) for line in lines]
        # Remove empty values or empty strings
        condensed_lines = [line.strip() for line in condensed_lines if line.strip() != '']
        # Apply the remaining whitespace normalizations to every line
        for i in range(len(condensed_lines)):
            condensed_lines[i] = whitespace_pattern1.sub('><', condensed_lines[i].strip())
            condensed_lines[i] = whitespace_pattern2.sub('><', condensed_lines[i].strip())
            condensed_lines[i] = newline_pattern.sub('><', condensed_lines[i].strip())
            condensed_lines[i] = self.whitespace_pattern.sub(' ', condensed_lines[i].strip()).strip()
        for line in condensed_lines:
            line = line.strip()
            # Skip empty lines
            if not line:
                skipped_empty_line_count += 1
                continue
            # Search for lines starting with a timestamp
            has_timestamp = self.try_parse_timestamp(line)
            # Care for malformed XML: treat lines starting with '<' as continuations
            if line.startswith('<'):
                in_non_timestamp_block = True
                has_timestamp = False
            if has_timestamp:
                in_non_timestamp_block = False
                # A new timestamped line closes out the previous entry
                if current_log_entry:
                    output.append(current_log_entry.strip())
                current_log_entry = line
            else:
                if in_non_timestamp_block and current_log_entry:
                    # Continuation line: fold it into the current entry
                    current_log_entry += '\n' + line
                else:
                    if current_log_entry:
                        output.append(current_log_entry.strip())
                    current_log_entry = line
                    in_non_timestamp_block = True
        # Add the last log entry if it exists
        if current_log_entry:
            output.append(current_log_entry.strip())
        # Remove duplicate lines after the condense processing, preserving order
        output = list(dict.fromkeys(output))
        logger.info(f"[{colored(filepath, 'yellow')}] --> Condense lines: {len(output)} valid lines, ({skipped_empty_line_count}) empty lines skipped.")
        return output
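
    # Illustrative condensing example (hypothetical input): timestamped lines
    # stay standalone, while consecutive non-timestamped lines (e.g. a stack
    # trace) are folded into a single multi-line entry:
    #
    #   input:  ["2024-05-25 11:47:32 ERROR boom",
    #            "  at Foo.bar(Foo.java:42)",
    #            "  at Baz.qux(Baz.java:7)"]
    #   output: ["2024-05-25 11:47:32 ERROR boom",
    #            "at Foo.bar(Foo.java:42)\nat Baz.qux(Baz.java:7)"]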
    def unique_structured_logs(self, structured_logs_master):
        # Step 1: Create a dictionary to hold unique content
        unique_content = {}
        # Step 2: Iterate through the structured logs, keeping the first
        # occurrence of each distinct content line
        for line in structured_logs_master:
            original_line_content = line['content']
            template = line['template']
            parameters = line['parameters']
            # Check if the content is already in the dictionary
            if original_line_content not in unique_content:
                unique_content[original_line_content] = {
                    'template': template,
                    'parameters': parameters,
                }
        # Step 3: Convert the dictionary back into structured_logs
        structured_logs = []
        for content, value in unique_content.items():
            structured_logs.append({
                'template': value['template'],
                'parameters': value['parameters'],
                'content': content,
            })
        return structured_logs
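
    # Example (illustrative): entries whose 'content' strings are identical
    # collapse to a single entry; the first template/parameters pair seen for
    # that content wins.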
    def try_parse_timestamp(self, line):
        try:
            return self.extract_timestamp(line) is not None
        except ValueError:
            return False

    def parse_log_line(self, log_line):
        log_type = self.identify_log_type(log_line)
        # log_line may be a structured dict with a 'content' key, or a raw string
        if isinstance(log_line, dict) and log_line.get('content'):
            timestamp = self.extract_timestamp(log_line['content'])
        else:
            timestamp = self.extract_timestamp(log_line)
        if timestamp is not None:
            timestamp_str = timestamp.strftime('%Y-%m-%d %H:%M:%S.%f')
        else:
            timestamp_str = "None"
        return {'type': log_type, 'timestamp': timestamp_str, 'content': log_line}

    def identify_log_type(self, log_line):
        if not isinstance(log_line, str):
            return 'INFO'  # Fall back to INFO for non-string input
        for log_type, pattern in self.log_type_patterns.items():
            if pattern.search(log_line):
                return log_type
        return 'INFO'

    def extract_timestamp(self, log_line):
        # XML-like lines never carry a leading timestamp
        if log_line.startswith('<'):
            return None
        # Use the combined timestamp pattern compiled once in __init__
        match = self.timestamp_pattern.search(log_line)
        if match:
            timestamp_str = match.group(0).strip('[]()')  # Remove enclosing brackets if present
            try:
                timestamp = parse(timestamp_str)
                return timestamp
            except Exception as e:
                logger.info(f"[{colored('Individual Model', 'yellow')}] --> Failed to parse timestamp from log line: {log_line}, check masking patterns. Error: {e}")
        return None
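

# Minimal usage sketch (hypothetical log path; assumes drain3.ini sits next to
# this module and a writable ./models directory exists for drain3 persistence):
if __name__ == "__main__":
    log_parser = LogParser()
    with open("sample.log", "r", errors="replace") as f:
        raw_lines = f.readlines()
    structured = log_parser.parse_log_lines("sample.log", raw_lines)
    structured = log_parser.unique_structured_logs(structured)
    for entry in structured[:5]:
        parsed = log_parser.parse_log_line(entry)
        print(parsed['type'], parsed['timestamp'])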