Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions src/cache/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,48 @@


def get_digested(candidate_path: str) -> str:
    """Hashes the candidate path with SHA-256.

    Produces a hex digest of the path (FIPS secure hash algorithm sha256),
    used as a filesystem-safe cache key.
    :param candidate_path: path string to digest
    :return: hex-encoded SHA-256 digest of the UTF-8 encoded path
    """
    digest = hashlib.sha256()
    digest.update(candidate_path.encode('utf-8'))
    return digest.hexdigest()


def load_from_cache(path: str, prefix: str = ''):
    """Reads a pickled object back from the cache.

    The cache file name is the SHA-256 digest of *path* with a '.pickle'
    suffix, optionally placed under *prefix*. Returns None when *path* is
    None.
    :param path: logical path used as the cache key
    :param prefix: directory/file-name prefix for the cache file
    :return: the unpickled object, or None when path is None
    """
    if path is None:  # TODO: what if the file is not there?
        return None
    cache_file = prefix + get_digested(path) + '.pickle'
    with open(cache_file, 'rb') as handle:
        return pickle.load(handle)


def dump_to_cache(path: str, obj, prefix: str = ''):
    """Writes an object to the cache as a pickle file.

    The cache file name is the SHA-256 digest of *path* with a '.pickle'
    suffix, optionally placed under *prefix*. No-op when *path* is None.
    :param path: logical path used as the cache key
    :param obj: picklable object to store
    :param prefix: directory/file-name prefix for the cache file
    """
    if path is None:
        return
    cache_file = prefix + get_digested(path) + '.pickle'
    with open(cache_file, "wb") as handle:
        pickle.dump(obj, handle)


def put_loaded_logs(split: Split, train_df, test_df, additional_columns):
"""uploads the loaded logs

uploads the training and test DataFrames loaded
:param split:
:param train_df:
:param test_df:
:param additional_columns:
"""
[dump_to_cache(path, data, 'cache/loaded_log_cache/') for (path, data) in [
(split.train_log.name, train_df),
(split.test_log.name, test_df),
Expand All @@ -40,6 +66,13 @@ def put_loaded_logs(split: Split, train_df, test_df, additional_columns):


def put_labelled_logs(job: Job, train_df, test_df):
"""uploads the labelled logs

uploads the training and test DataFrames labelled using the given job configuration
:param job:
:param train_df:
:param test_df:
"""
train_df_path = "encoding{}-label{}-splitTR{}".format(job.encoding.id, job.labelling.id, job.split.train_log.name)
test_df_path = "encoding{}-label{}-splitTE{}".format(job.encoding.id, job.labelling.id, job.split.train_log.name)
[dump_to_cache(path, data, 'cache/labeled_log_cache/') for (path, data) in [
Expand All @@ -54,6 +87,12 @@ def put_labelled_logs(job: Job, train_df, test_df):


def get_loaded_logs(split: Split) -> (DataFrame, DataFrame, DataFrame):
"""returns the loaded logs

returns the training and test DataFrames loaded using
:param split:
:return:
"""
logger.info('\t\tFound pre-loaded Dataset in cache, loading..')
cache = LoadedLog.objects.filter(split=split)[0]
return (
Expand All @@ -64,6 +103,12 @@ def get_loaded_logs(split: Split) -> (DataFrame, DataFrame, DataFrame):


def get_labelled_logs(job: Job) -> (DataFrame, DataFrame):
"""returns the labelled logs

returns the training and test DataFrames labelled using the given job configuration
:param job:
:return:
"""
logger.info('\t\tFound pre-labeled Dataset in cache, loading..')
cache = LabelledLog.objects.filter(split=job.split,
encoding=job.encoding,
Expand Down
18 changes: 18 additions & 0 deletions src/clustering/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,11 @@ def cluster_data(self, input_df: DataFrame) -> dict:
}

def _choose_clusterer(self, clustering: src.clustering.models.Clustering):
"""chooses the clustering method

chooses the clustering method using the given clustering
:param clustering:
"""
self.config.pop('clustering_method', None)
if clustering.clustering_method == ClusteringMethods.KMEANS.value:
self.clusterer = KMeans(**self.config)
Expand All @@ -79,6 +84,12 @@ def _choose_clusterer(self, clustering: src.clustering.models.Clustering):

@classmethod
def load_model(cls, job: Job):
"""returns the clustering method from a model

returns the clustering method using the given job configuration
:param job:
:return:
"""
if job.clustering.clustering_method == ClusteringMethods.KMEANS.value:
clusterer = joblib.load(job.clustering.model_path)
elif job.clustering.clustering_method == ClusteringMethods.NO_CLUSTER.value:
Expand All @@ -89,6 +100,13 @@ def load_model(cls, job: Job):


def init_clusterer(clustering: Clustering, train_data: DataFrame):
    """Creates and fits a new clusterer.

    Builds a Clustering wrapper from the given configuration and fits it on
    the training DataFrame with the non-feature columns removed.
    :param clustering: clustering configuration model
    :param train_data: training DataFrame; must contain 'trace_id' and
        'label' columns, which are excluded from fitting
    :return: the fitted clusterer
    """
    clusterer = Clustering(clustering)
    # drop the identifier/target columns so only feature columns are
    # clustered; axis passed by keyword (positional axis is deprecated
    # in pandas and removed in 2.0)
    clusterer.fit(train_data.drop(['trace_id', 'label'], axis=1))
    return clusterer
5 changes: 3 additions & 2 deletions src/core/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def run_by_type(training_df: DataFrame, test_df: DataFrame, job: Job) -> (dict,
def runtime_calculate(job: Job) -> dict:
"""calculate the prediction for traces in the uncompleted logs

:param job: job idctionary
:param job: job dictionary
:return: runtime results
"""

Expand All @@ -112,7 +112,7 @@ def runtime_calculate(job: Job) -> dict:
return results


def replay_prediction_calculate(job: Job, log) -> (dict, dict):
def replay_prediction_calculate(job: Job, log: EventLog) -> (dict, dict):
"""calculate the prediction for the log coming from replayers

:param job: job dictionary
Expand Down Expand Up @@ -146,6 +146,7 @@ def get_run(job: Job) -> str:
def _label_task(input_dataframe: DataFrame) -> dict:
"""calculates the distribution of labels in the data frame

:param input_dataframe:
:return: Dict of string and int {'label1': label1_count, 'label2': label2_count}

"""
Expand Down
54 changes: 53 additions & 1 deletion src/encoding/boolean_frequency.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,26 @@


def boolean(log: EventLog, event_names: list, label: Labelling, encoding: Encoding) -> DataFrame:
    """Encodes the log with boolean encoding.

    Each trace becomes a row of flags indicating whether each event name
    occurred; delegates to the shared boolean/frequency encoder.
    :param log: event log to encode
    :param event_names: event names that become the feature columns
    :param label: labelling configuration
    :param encoding: encoding configuration
    :return: encoded DataFrame
    """
    encoded = _encode_boolean_frequency(log, event_names, label, encoding)
    return encoded


def frequency(log: EventLog, event_names: list, label: Labelling, encoding: Encoding) -> DataFrame:
    """Encodes the log with frequency encoding.

    Each trace becomes a row of occurrence counts for each event name;
    delegates to the shared boolean/frequency encoder.
    :param log: event log to encode
    :param event_names: event names that become the feature columns
    :param label: labelling configuration
    :param encoding: encoding configuration
    :return: encoded DataFrame
    """
    encoded = _encode_boolean_frequency(log, event_names, label, encoding)
    return encoded


Expand All @@ -21,6 +37,10 @@ def _encode_boolean_frequency(log: EventLog, event_names: list, labelling: Label
"""Encodes the log by boolean or frequency

trace_id, event_nr, event_names, label stuff
:param log:
:param event_names:
:param label:
:param encoding:
:return pandas DataFrame
"""
columns = _create_columns(event_names, encoding, labelling)
Expand All @@ -46,7 +66,12 @@ def _encode_boolean_frequency(log: EventLog, event_names: list, labelling: Label


def _create_event_happened(event_names: list, encoding: Encoding) -> list:
    """Builds the initial per-event placeholder list.

    One placeholder per event name: False for boolean encoding, 0 for
    frequency encoding.
    :param event_names: event names, one placeholder each
    :param encoding: encoding configuration selecting the placeholder type
    :return: list of initial placeholder values
    """
    is_boolean = encoding.value_encoding == ValueEncodings.BOOLEAN.value
    initial = False if is_boolean else 0
    return [initial for _ in event_names]
Expand All @@ -57,6 +82,11 @@ def _update_event_happened(event, event_names: list, event_happened: list, encod

For boolean set happened to True.
For frequency updates happened count.
:param event
:param event_names:
:param event_happened:
:param encoding:
:return:
"""
event_name = event['concept:name']
if event_name in event_names:
Expand All @@ -68,13 +98,35 @@ def _update_event_happened(event, event_names: list, event_happened: list, encod


def _create_columns(event_names: list, encoding: Encoding, labelling: Labelling) -> list:
    """Builds the DataFrame column list for the encoded log.

    Starts with 'trace_id', appends one column per event name, then adds
    the label-related columns for the given encoding/labelling.
    :param event_names: event names used as feature column names
    :param encoding: encoding configuration
    :param labelling: labelling configuration
    :return: full list of column names
    """
    # plain list concatenation replaces the previous
    # list(np.append(...).tolist()) round-trip, which needlessly coerced
    # every entry through a numpy string array
    columns = ["trace_id"] + list(event_names)
    return compute_label_columns(columns, encoding, labelling)


def _trace_to_row(trace: Trace, encoding: Encoding, event_index: int, labelling: Labelling = None, executed_events=None,
resources_used=None, new_traces=None, event_names=None, atr_classifier=None):
"""Transforms trace into a list comprehension

:param trace:
:param encoding:
:param labelling:
:param event_index:
:param data_fun:
:param columns_len:
:param atr_classifier:
:param executed_events:
:param resources_used:
:param new_traces:
:param additional_columns
:return:
"""
# starts with all False, changes to event
event_happened = _create_event_happened(event_names, encoding)
trace_row = []
Expand Down
44 changes: 42 additions & 2 deletions src/encoding/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,16 @@


def encode_label_logs(training_log: EventLog, test_log: EventLog, job: Job, additional_columns=None, encode=True):
"""returns the encoded label logs

returns the training and test DataFrames with encoded column 'label' using the given job configuration
:param training_log:
:param test_log:
:param job:
:param additional_columns:
:param encode:
:return:
"""
logger.info('\tDataset not found in cache, building..')
training_log, cols = _eventlog_to_dataframe(training_log, job.encoding, job.labelling, additional_columns=additional_columns, cols=None)
test_log, _ = _eventlog_to_dataframe(test_log, job.encoding, job.labelling, additional_columns=additional_columns, cols=cols)
Expand Down Expand Up @@ -61,6 +71,15 @@ def encode_label_logs(training_log: EventLog, test_log: EventLog, job: Job, addi


def _eventlog_to_dataframe(log: EventLog, encoding: Encoding, labelling: Labelling, additional_columns=None, cols=None):
"""converting eventlog into a DataFrame

:param training_log:
:param test_log:
:param job:
:param additional_columns:
:param encode:
:return:
"""
if encoding.prefix_length < 1:
raise ValueError("Prefix length must be greater than 1")
if encoding.value_encoding == ValueEncodings.SIMPLE_INDEX.value:
Expand Down Expand Up @@ -88,7 +107,16 @@ def _eventlog_to_dataframe(log: EventLog, encoding: Encoding, labelling: Labelli
return run_df, cols


def _data_encoder_encoder(job: Job, training_log, test_log) -> Encoder:
def _data_encoder_encoder(job: Job, training_log: EventLog, test_log: EventLog) -> Encoder:
"""uses data_encoder to encoder DataFrame

:param training_log:
:param test_log:
:param job:
:param additional_columns:
:param encode:
:return:
"""
if job.type != JobTypes.LABELLING.value and \
job.encoding.value_encoding != ValueEncodings.BOOLEAN.value and \
job.predictive_model.predictive_model != PredictiveModels.TIME_SERIES_PREDICTION.value:
Expand All @@ -107,12 +135,24 @@ def _data_encoder_encoder(job: Job, training_log, test_log) -> Encoder:
return encoder


def data_encoder_decoder(job: Job, training_log: EventLog, test_log: EventLog) -> None:
    """Uses the job's data encoder to decode the DataFrames in place.

    Retrieves the encoder matching the job configuration and decodes both
    the training and the test DataFrames back to their original values.
    :param job: job configuration used to pick the encoder
    :param training_log: encoded training DataFrame, decoded in place
    :param test_log: encoded test DataFrame, decoded in place
    """
    encoder = retrieve_proper_encoder(job)
    # two plain statements instead of the previous throwaway tuple
    # expression that was only used to sequence the two calls
    encoder.decode(training_log, job.encoding)
    encoder.decode(test_log, job.encoding)


def retrieve_proper_encoder(job: Job) -> Encoder:
"""find the proper encoder

:param job:
:return:
"""
if job.incremental_train is not None:
return retrieve_proper_encoder(job.incremental_train)
else:
Expand Down
Loading