; drain3.ini

[SNAPSHOT]
; Interval between state snapshots, in minutes (unit taken from the key name).
snapshot_interval_minutes = 10
; Presumably controls whether the saved state is compressed (inferred from the key name; confirm with the consumer).
compress_state = True

[MASKING]
; Masking rules, written as a JSON list inside one multi-line value (continuation lines must stay indented).
; Order matters: the regexes are weighted from top to bottom, so an earlier rule can consume text that a
; later rule would otherwise match. This list is mostly for Java log4j logs.
; A literal percent sign in a pattern must be written doubled, as in the EMAIL rule.
; NOTE(review): the hash rules are anchored with ^ and $, so they only fire when the whole message is a
; hash; confirm that this is intended.
masking = [
          {"regex_pattern": "([a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12})", "mask_with": "UUID"},
          {"regex_pattern": "(^[a-fA-F0-9]{32}$)", "mask_with": "MD5-HASH"},
          {"regex_pattern": "(^[a-fA-F0-9]{64}$)", "mask_with": "SHA256-HASH"},
          {"regex_pattern": "(^[a-fA-F0-9]{56}$)", "mask_with": "SHA3-224-HASH"},
; Disabled: same pattern as SHA256-HASH above, so this rule could never match
; (two 64-hex-digit digests cannot be told apart by length alone).
;          {"regex_pattern": "(^[a-fA-F0-9]{64}$)", "mask_with": "SHA3-256-HASH"},
          {"regex_pattern": "(^[a-fA-F0-9]{96}$)", "mask_with": "SHA3-384-HASH"},
          {"regex_pattern": "(^[a-fA-F0-9]{128}$)", "mask_with": "SHA3-512-HASH"},
          {"regex_pattern": "((\\d{4}[-./]\\d{2}[-./]\\d{2}[ ,]\\d{2}:\\d{2}:\\d{2}(?:[,\\.]\\d{1,3})?))", "mask_with": "TIMESTAMP"},
          {"regex_pattern": "\\b(DEBUG|INFO|WARN|ERROR|FATAL)\\b", "mask_with": "LOG-LEVEL"},
          {"regex_pattern": "(\\[(?:.*?-)?([a-zA-Z]+(?:-[0-9]{1,10})?)\\])", "mask_with": "THREAD-ID"},
          {"regex_pattern": "((\\[[a-zA-Z0-9-]*\\]))", "mask_with": "THREAD-ID"},
          {"regex_pattern": "(\\b(?:com|org|net|edu|gov|mil|java|io|int|arpa|biz|info|name|pro|aero|coop|museum|asia|cat|jobs|sun|javax)\\.[a-zA-Z0-9_\\-\\.]+[a-zA-Z0-9_]\\b)", "mask_with": "CLASS-NAME"},
          {"regex_pattern": "\\b[a-zA-Z0-9_]+\\.java:\\d+\\b", "mask_with": "FILE-NAME-LINE-NUMBER"},
; NOTE(review): "CHARACHTER-SET" is misspelled, but the label is emitted at runtime, so it is left unchanged.
          {"regex_pattern": "((ISO-\\d{4}(-\\d{1})?)|UTF-8)", "mask_with": "CHARACHTER-SET"},
          {"regex_pattern": "(<\\?xml.*?>.*)", "mask_with": "XML-CONTENT"},
; The case-insensitive flag sits at the very start of the pattern: Python 3.11+ rejects a global
; inline flag placed anywhere else.
          {"regex_pattern": "(?i)((?<=xpath\\s)\\/[a-zA-Z0-9\\/\\_\\-]+)", "mask_with": "XML-XPATH"},
          {"regex_pattern": "\\{(?:[^{}]|\\{[^{}]*\\})*\\}", "mask_with": "JSON-CONTENT"},
;          {"regex_pattern": "^(?:[^,]+,)+[^,]+$", "mask_with": "CSV-CONTENT"},
          {"regex_pattern": "^\\s*\\w+\\s*:\\s*.*$", "mask_with": "YAML-CONTENT"},
          {"regex_pattern": "\\b(SELECT|INSERT|UPDATE|DELETE)\\b.*", "mask_with": "SQL-QUERY"},
          {"regex_pattern": "^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?$", "mask_with": "BASE64-CONTENT"},
; Case-insensitive flag moved to the start of the pattern for the same Python 3.11+ reason as XML-XPATH.
          {"regex_pattern": "(?i)(<(?:a|abbr|acronym|address|area|article|aside|audio|b|base|bdi|bdo|big|blockquote|body|br|button|canvas|caption|cite|code|col|colgroup|data|datalist|dd|del|details|dfn|dialog|div|dl|dt|em|embed|fieldset|figcaption|figure|font|footer|form|h1|h2|h3|h4|h5|h6|head|header|hr|html|i|iframe|img|input|ins|kbd|label|legend|li|link|main|map|mark|meta|meter|nav|noscript|object|ol|optgroup|option|output|p|param|picture|pre|progress|q|rp|rt|ruby|s|samp|script|section|select|small|source|span|strong|style|sub|summary|sup|svg|table|tbody|td|textarea|tfoot|th|thead|time|title|tr|track|u|ul|var|video|wbr)[\\s/>])", "mask_with": "HTML-CONTENT"},
          {"regex_pattern": "([0-9a-fA-F]{48})", "mask_with": "MQ-MESSAGE-OR-CORRELATIONID"},
          {"regex_pattern": "([0-9a-fA-F]+\\*{10}[0-9a-fA-F]+|[0-9a-fA-F]{48})", "mask_with": "MASKED-MQ-MESSAGE-OR-CORRELATIONID"},
          {"regex_pattern": "((?<=correlationId=)NONE)", "mask_with": "MQ-MESSAGE-OR-CORRELATIONID-NONE"},
          {"regex_pattern": "(MQMT_[A-Z]+)", "mask_with": "MQ-MESSAGE-TYPE"},
          {"regex_pattern": "(MQPER_[A-Z]+)", "mask_with": "MQ-MESSAGE-PERSISTENCE"},
; NOTE(review): the lookahead expects a raw or star-masked correlation id, but both forms have already
; been replaced by the two MQ id rules above, so this rule probably never matches; consider moving it up.
          {"regex_pattern": "(?<=replyToQueueName=)[A-Z]+(?:\\.[A-Z]+)+(?=.*[0-9a-fA-F]+\\*{10}[0-9a-fA-F]+|[0-9a-fA-F]{48})", "mask_with": "MQ-MESSAGE-QUEUE"},
          {"regex_pattern": "((ins|inb)-\\w+-[0-9]{14}-[0-9a-z]+)", "mask_with": "ALERT-ID"},
; Disabled: strict subset of the rule above (ins only, same label), so it could never match.
;          {"regex_pattern": "(ins-\\w+-[0-9]{14}-[0-9a-z]+)", "mask_with": "ALERT-ID"},
          {"regex_pattern": "((ins|inb)-\\w+-[0-9]{2}-[0-9]{14}-[0-9a-z]+)", "mask_with": "ALERT-ID"},
; Disabled: identical pattern to the rule above, so this label was never produced.
; NOTE(review): if AL-ALERT-ID was the intended label for this format, enable this rule and drop the one above.
;          {"regex_pattern": "((ins|inb)-\\w+-[0-9]{2}-[0-9]{14}-[0-9a-z]+)", "mask_with": "AL-ALERT-ID"},
          {"regex_pattern": "((inb|ins)-\\w+-\\d\\d-\\d+-\\d+)", "mask_with": "AL-ALERT-ID"},
; NOTE(review): ids whose name part has no hyphen (for example ins-citi-01-<14 digits>-x) are already
; labelled ALERT-ID by the generic rule above; move this rule up if they should be CITI-ALERT-ID.
          {"regex_pattern": "((inb|ins)-citi.*-[0-9]{2}-[0-9]{14}-[0-9a-z]+)", "mask_with": "CITI-ALERT-ID"},
          {"regex_pattern": "(<request>.*</request>)", "mask_with": "REQUEST-CONTENT"},
          {"regex_pattern": "(<\\?request.*?>.*)", "mask_with": "REQUEST-CONTENT"},
          {"regex_pattern": "(<response>.*</response>)", "mask_with": "RESPONSE-CONTENT"},
          {"regex_pattern": "(<\\?response.*?>.*)", "mask_with": "RESPONSE-CONTENT"},
          {"regex_pattern": "(https?://[a-zA-Z0-9.-]+(?::\\d+)?(?:/[a-zA-Z0-9_\\-./]*)?)", "mask_with": "URL"},
          {"regex_pattern": "\\b(?:\\d{4}[- ]?){3}\\d{4}\\b", "mask_with": "CREDIT-CARD-NUMBER"},
          {"regex_pattern": "(\\d{6}\\*{6}\\d{4})", "mask_with": "MASKED-CREDIT-CARD"},
          {"regex_pattern": "\\b(?:[0-9]{1,3}\\.){3}[0-9]{1,3}\\b", "mask_with": "IPV4-ADDRESS"},
          {"regex_pattern": "\\b(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}\\b", "mask_with": "IPV6-ADDRESS"},
          {"regex_pattern": "\\b(?:[A-Fa-f0-9]{2}[:-]){5}(?:[A-Fa-f0-9]{2})\\b", "mask_with": "MAC-ADDRESS"},
          {"regex_pattern": "\\b(?:978-|979-)?\\d{1,5}-\\d{1,7}-\\d{1,7}-\\d{1,7}-\\d{1}\\b", "mask_with": "ISBN-NUMBER"},
          {"regex_pattern": "\\b\\d{3}-\\d{2}-\\d{4}\\b", "mask_with": "US-SSN"},
          {"regex_pattern": "[a-zA-Z]:\\\\(?:[^\\\\\\n\\t\\0]+\\\\)*[^\\\\\\n\\t\\0]*", "mask_with": "FILE-PATH"},
          {"regex_pattern": "([a-zA-Z0-9._%%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,})", "mask_with": "EMAIL"},
          {"regex_pattern": "\\b\\d{4}-\\d{2}-\\d{2}\\b", "mask_with": "DATE-YYYY-MM-DD"},
          {"regex_pattern": "\\b\\d{2}:\\d{2}:\\d{2}\\b", "mask_with": "TIME-HH-MM-SS"},
; The backslashes used to be escaped twice, so the decoded regex looked for literal backslashes and the
; letter d and could not match a phone number. As written it requires a parenthesised area code,
; for example +1 (555) 123-4567.
          {"regex_pattern": "(\\+?\\d{1,3}?[-.\\s]?\\(\\d{1,4}?\\)?[-.\\s]?\\d{1,4}[-.\\s]?\\d{1,4}[-.\\s]?\\d{1,4})", "mask_with": "PHONENUMBER"},
          {"regex_pattern": "(key=\\d+)", "mask_with": "DB-KEY-ID"},
          {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([0-9a-f]{6,} ?){3,}((?=[^A-Za-z0-9])|$)", "mask_with": "SEQ"},
          {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([0-9A-F]{4} ?){4,}((?=[^A-Za-z0-9])|$)", "mask_with": "SEQ"},
          {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(0x[a-f0-9A-F]+)((?=[^A-Za-z0-9])|$)", "mask_with": "HEX"},
          {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(([0-9a-f]{2,}:){3,}([0-9a-f]{2,}))((?=[^A-Za-z0-9])|$)", "mask_with": "ID"},
          {"regex_pattern": "\\b\\d+\\b", "mask_with": "NUM"}
          ;{"regex_pattern":"((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "mask_with": "NUM"}
          ]
; Text placed before and after each mask label, so a masked UUID is written as <:UUID:>.
mask_prefix = <:
mask_suffix = :>

[DRAIN]
; engine is an optional parameter; "Drain" is used when it is not specified.
; engine has two options: 'Drain' and 'JaccardDrain'.
; engine = Drain
; NOTE(review): the descriptions below are inferred from the key names; confirm them against the
; documentation of the installed Drain3 version.
; Presumably the similarity threshold used when matching a message to an existing template.
sim_th = 0.4
; Presumably the depth of the parse tree.
depth = 4
; Presumably the maximum number of children per tree node.
max_children = 100
; Presumably the maximum number of clusters kept.
max_clusters = 1024
; List of extra delimiter strings; here only the underscore.
extra_delimiters = ["_"]

[PROFILING]
; Profiling switch; currently off.
enabled = False
; Presumably the profiling report period in seconds (inferred from the key name; confirm with the consumer).
report_sec = 120

[GENERAL]
; Interval for polling logs, in minutes.
POLLING_INTERVAL_MINUTES = 1
; Directory where the individual models are stored.
MODELS_DIRECTORY = models
; Directory where numpy files for the individual and ensemble models are stored.
; NOTE(review): this path ends with a slash while MODELS_DIRECTORY does not; confirm how the consumer joins paths.
NUMPY_DIRECTORY = numpy/

[ENSEMBLE_MODEL]
; Path to the ensemble model.
; NOTE(review): the .pkl extension suggests a pickle file; if so, load it only from a trusted location.
MODEL_PATH = models/ensemble_model.pkl
; The proportion of outliers in the ENSEMBLE model. It is used to define the threshold for outlier scores.
; Increasing this value results in more anomalies being detected in the ENSEMBLE model.
MODEL_CONTAMINATION = 0.1
; Threshold used in the combined ENSEMBLE model: an anomaly is found when the score is less than the
; threshold, and the lower the score, the more likely it is to be an anomaly. Anomaly scores are often
; negative numbers.
; NOTE(review): the original note also calls this the percentage of anomalies expected in the data (0.00);
; confirm which interpretation the consumer uses.
ANOMALIES_THRESHOLD = 0.00
; The number of individual models to consider when doing calculations on the ENSEMBLE model for anomaly detection.
MAX_NUM_MODELS_TO_CONSIDER = 20
; The maximum number of features to consider when extracting features from the log lines
; (a higher number means more memory usage but potentially less sensitivity to anomalies).
MAX_FEATURES = 1000
; The similarity threshold used for grouping like anomalies together (0.90 means 90 percent similar).
SIMILARITY_THRESHOLD = .90

[INDIVIDUAL_MODELS]
; The proportion of outliers in the individual models. It is used to define the threshold for outlier scores.
; Increasing this value results in more anomalies being detected (per log).
MODEL_CONTAMINATION = 0.1
; The maximum number of features to consider when extracting features from the log lines
; (higher numbers mean more memory usage, which could affect performance).
MAX_FEATURES = 3000
; Threshold used in the individual (per log) models: an anomaly is found when the score is less than the
; threshold, and the lower the score, the more likely it is to be an anomaly. Anomaly scores are often
; negative numbers.
; NOTE(review): the original note also calls this the percentage of anomalies expected in the data;
; confirm which interpretation the consumer uses.
ANOMALIES_THRESHOLD = -0.04
; The similarity threshold used for grouping like anomalies together (0.85 means 85 percent similar).
SIMILARITY_THRESHOLD = 0.85