
Commit

Merge remote-tracking branch 'upstream/hotfixes' into release
fit-alessandro-berti committed Jan 15, 2024
2 parents 71bfb8c + 2225808 commit 7917258
Showing 35 changed files with 74 additions and 66 deletions.
2 changes: 1 addition & 1 deletion pm4py/algo/conformance/alignments/dfg/variants/classic.py
@@ -139,7 +139,7 @@ def apply_log(log, dfg, sa, ea, parameters=None):

if pandas_utils.check_is_pandas_dataframe(log):
case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
- traces = log.groupby(case_id_key)[activity_key].agg(list).to_numpy().tolist(); traces = [tuple(x) for x in traces]
+ traces = [tuple(x) for x in log.groupby(case_id_key)[activity_key].agg(list).to_dict().values()]
else:
log = log_converter.apply(log, variant=log_converter.Variants.TO_EVENT_LOG, parameters=parameters)
traces = [tuple(x[activity_key] for x in trace) for trace in log]
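
For illustration, a minimal sketch (not part of the diff, toy data) showing that the old and new trace-extraction expressions used throughout this commit are equivalent on plain pandas; the rewrite presumably just avoids the intermediate NumPy round-trip:

import pandas as pd

log = pd.DataFrame({
    "case:concept:name": ["c1", "c1", "c2"],
    "concept:name": ["A", "B", "A"],
})
case_id_key, activity_key = "case:concept:name", "concept:name"

# old form: Series of lists -> NumPy object array -> list of lists -> tuples
old = log.groupby(case_id_key)[activity_key].agg(list).to_numpy().tolist()
old = [tuple(x) for x in old]

# new form: Series of lists -> dict {case id: list of activities} -> tuples over the values
new = [tuple(x) for x in log.groupby(case_id_key)[activity_key].agg(list).to_dict().values()]

assert old == new  # [('A', 'B'), ('A',)]
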
2 changes: 1 addition & 1 deletion pm4py/algo/conformance/alignments/petri_net/algorithm.py
@@ -318,7 +318,7 @@ def __get_variants_structure(log, parameters):

if pandas_utils.check_is_pandas_dataframe(log):
case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
- traces = log.groupby(case_id_key)[activity_key].agg(list).to_numpy().tolist(); traces = [tuple(x) for x in traces]
+ traces = [tuple(x) for x in log.groupby(case_id_key)[activity_key].agg(list).to_dict().values()]
for idx, trace in enumerate(traces):
if trace not in variants_idxs:
variants_idxs[trace] = [idx]
@@ -323,7 +323,7 @@ def apply_multiprocessing(obj: Union[EventLog, Trace, pd.DataFrame], pt: Process
if pandas_utils.check_is_pandas_dataframe(obj):
case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters,
constants.CASE_CONCEPT_NAME)
- traces = obj.groupby(case_id_key)[activity_key].agg(list).to_numpy().tolist(); traces = [tuple(x) for x in traces]
+ traces = [tuple(x) for x in obj.groupby(case_id_key)[activity_key].agg(list).to_dict().values()]
else:
obj = log_converter.apply(obj, variant=log_converter.Variants.TO_EVENT_LOG, parameters=parameters)
traces = [tuple(x[activity_key] for x in case) for case in obj]
@@ -385,7 +385,7 @@ def apply(obj: Union[EventLog, Trace, pd.DataFrame], pt: ProcessTree, parameters
ret = []
if pandas_utils.check_is_pandas_dataframe(obj):
case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
- traces = obj.groupby(case_id_key)[activity_key].agg(list).to_numpy().tolist(); traces = [tuple(x) for x in traces]
+ traces = [tuple(x) for x in obj.groupby(case_id_key)[activity_key].agg(list).to_dict().values()]
else:
obj = log_converter.apply(obj, variant=log_converter.Variants.TO_EVENT_LOG, parameters=parameters)
traces = [tuple(x[activity_key] for x in case) for case in obj]
2 changes: 1 addition & 1 deletion pm4py/algo/conformance/log_skeleton/variants/classic.py
@@ -95,7 +95,7 @@ def apply_log(log: Union[EventLog, pd.DataFrame], model: Dict[str, Any], paramet

if pandas_utils.check_is_pandas_dataframe(log):
case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
- traces = log.groupby(case_id_key)[activity_key].agg(list).to_numpy().tolist(); traces = [tuple(x) for x in traces]
+ traces = [tuple(x) for x in log.groupby(case_id_key)[activity_key].agg(list).to_dict().values()]
else:
traces = [tuple(y[activity_key] for y in x) for x in log]
grouped_traces = {}
@@ -987,7 +987,7 @@ def apply_log(log, net, initial_marking, final_marking, enable_pltr_fitness=Fals
trans_map[t.label] = t

if pandas_utils.check_is_pandas_dataframe(log):
- traces = log.groupby(case_id_key)[activity_key].agg(list).to_numpy().tolist(); traces = [tuple(x) for x in traces]
+ traces = [tuple(x) for x in log.groupby(case_id_key)[activity_key].agg(list).to_dict().values()]
else:
traces = [tuple(x[activity_key] for x in trace) for trace in log]

6 changes: 3 additions & 3 deletions pm4py/algo/discovery/batches/variants/pandas.py
@@ -20,7 +20,7 @@
import pandas as pd

from pm4py.algo.discovery.batches.utils import detection
- from pm4py.util import exec_utils, constants, xes_constants
+ from pm4py.util import exec_utils, constants, xes_constants, pandas_utils
import numpy as np


@@ -98,10 +98,10 @@ def apply(log: pd.DataFrame, parameters: Optional[Dict[Union[str, Parameters], A
# here, we want them to have the second granularity, so we divide by 10**9
# for example 1001000000 nanoseconds (value stored in the column)
# is equivalent to 1,001 seconds.
- log[timestamp_key] = log[timestamp_key].values.astype(np.int64) / 10**9
+ log[timestamp_key] = pandas_utils.convert_to_seconds(log[timestamp_key])
if start_timestamp_key != timestamp_key:
# see the aforementioned explanation.
- log[start_timestamp_key] = log[start_timestamp_key].values.astype(np.int64) / 10**9
+ log[start_timestamp_key] = pandas_utils.convert_to_seconds(log[start_timestamp_key])

actres_grouping0 = log.groupby([activity_key, resource_key]).agg(list).to_dict()
start_timestamps = actres_grouping0[start_timestamp_key]
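
The new pandas_utils.convert_to_seconds call replaces the inline nanosecond division. A hypothetical sketch of what such a helper could look like (the actual implementation in pm4py.util.pandas_utils may differ):

import numpy as np
import pandas as pd

def convert_to_seconds(column: pd.Series) -> pd.Series:
    # datetime64[ns] values are stored as nanoseconds since the epoch;
    # dividing by 10**9 yields floating-point seconds, as the old inline code did
    return pd.Series(column.values.astype(np.int64) / 10**9, index=column.index)
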
@@ -68,7 +68,7 @@ def apply(log: Union[EventLog, EventStream, pd.DataFrame], parameters: Optional[
# keep only the two columns before conversion
log = log[list(set([activity_key, timestamp_key, start_timestamp_key]))]
log = log.sort_values([timestamp_key, start_timestamp_key])
- activities_counter = dict(log[activity_key].value_counts())
+ activities_counter = log[activity_key].value_counts().to_dict()
activities = sorted(list(activities_counter.keys()))
else:
log = converter.apply(log, variant=converter.Variants.TO_EVENT_STREAM, parameters={"deepcopy": False, "include_case_attributes": False})
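
The dict(...) wrapper around value_counts() is replaced by the explicit .to_dict() accessor in several files of this commit; on plain pandas the two are interchangeable, as this small illustration (hypothetical data) shows:

import pandas as pd

s = pd.Series(["A", "B", "A", "A"])
assert dict(s.value_counts()) == s.value_counts().to_dict()  # {'A': 3, 'B': 1}
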
2 changes: 1 addition & 1 deletion pm4py/algo/discovery/declare/variants/classic.py
@@ -112,7 +112,7 @@ def __col_to_dict_rule(col_name: Union[Tuple[str, str], Tuple[str, str, str]]) -
if len(col_name) == 2:
return col_name[0], col_name[1]
else:
- if col_name[2] is None or pd.isna(col_name[2]):
+ if col_name[2] is None or pd.isna(col_name[2]) or not col_name[2]:
return col_name[0], col_name[1]

return col_name[0], (col_name[1], col_name[2])
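
A small worked example of the added guard (hypothetical column name): pd.isna("") is False, so an empty string in the third position previously fell through to the binary-rule branch, whereas the extra "or not col_name[2]" check now treats it like a missing value:

import pandas as pd

col_name = ("responded_existence", "A", "")
old_missing = col_name[2] is None or pd.isna(col_name[2])                       # False
new_missing = col_name[2] is None or pd.isna(col_name[2]) or not col_name[2]    # True
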
6 changes: 3 additions & 3 deletions pm4py/algo/discovery/dfg/adapters/pandas/df_statistics.py
@@ -126,8 +126,8 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_
df_successive_rows[constants.DEFAULT_FLOW_TIME] = df_successive_rows.apply(
lambda x: soj_time_business_hours_diff(x[timestamp_key], x[start_timestamp_key + '_2'], business_hours_slot, workcalendar), axis=1)
else:
- df_successive_rows[constants.DEFAULT_FLOW_TIME] = (
-     df_successive_rows[start_timestamp_key + '_2'] - df_successive_rows[timestamp_key]).dt.total_seconds()
+ difference = df_successive_rows[start_timestamp_key + '_2'] - df_successive_rows[timestamp_key]
+ df_successive_rows[constants.DEFAULT_FLOW_TIME] = pandas_utils.get_total_seconds(difference)
# groups couple of attributes (directly follows relation, we can measure the frequency and the performance)
directly_follows_grouping = df_successive_rows.groupby([activity_key, target_activity_key + '_2'])[
constants.DEFAULT_FLOW_TIME]
@@ -246,7 +246,7 @@ def get_partial_order_dataframe(df, start_timestamp_key=None, timestamp_key="tim
df[constants.DEFAULT_FLOW_TIME] = df.apply(
lambda x: soj_time_business_hours_diff(x[timestamp_key], x[start_timestamp_key + '_2'], business_hours_slot, workcalendar), axis=1)
else:
- df[constants.DEFAULT_FLOW_TIME] = (df[start_timestamp_key + "_2"] - df[timestamp_key]).dt.total_seconds()
+ df[constants.DEFAULT_FLOW_TIME] = pandas_utils.get_total_seconds(df[start_timestamp_key + "_2"] - df[timestamp_key])

if keep_first_following:
df = df.groupby(constants.DEFAULT_INDEX_KEY).first().reset_index()
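
Several files in this commit route timedelta-to-seconds conversion through pandas_utils.get_total_seconds instead of calling .dt.total_seconds() inline. A hypothetical sketch of such a helper (the real pm4py.util.pandas_utils implementation may differ, for example to support other DataFrame backends):

import pandas as pd

def get_total_seconds(difference: pd.Series) -> pd.Series:
    # 'difference' is a Series of timedeltas; return the elapsed time as float seconds
    return difference.dt.total_seconds()
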
2 changes: 1 addition & 1 deletion pm4py/algo/discovery/log_skeleton/variants/classic.py
@@ -275,7 +275,7 @@ def apply(log: Union[EventLog, pd.DataFrame], parameters: Optional[Dict[Union[st
elif pandas_utils.check_is_pandas_dataframe(log):
case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
all_activs = log[activity_key].value_counts().to_dict()
- logs_traces = Counter([tuple(x) for x in log.groupby(case_id_key)[activity_key].agg(list).to_numpy().tolist()])
+ logs_traces = Counter([tuple(x) for x in log.groupby(case_id_key)[activity_key].agg(list).to_dict().values()])

ret = {}
ret[Outputs.EQUIVALENCE.value] = equivalence(logs_traces, all_activs, noise_threshold=noise_threshold)
@@ -116,6 +116,6 @@ def apply(left_df: pd.DataFrame, right_df: pd.DataFrame, case_relations: pd.Data

md = pandas_utils.concat([df1, df2])
md = md.sort_values([index_key+left_suffix, index_key+right_suffix])
- md[timestamp_diff] = (md[target_timestamp] - md[source_timestamp]).dt.total_seconds()
+ md[timestamp_diff] = pandas_utils.get_total_seconds(md[target_timestamp] - md[source_timestamp])

return md
@@ -59,7 +59,7 @@ def apply(log: Union[EventLog, EventStream, pd.DataFrame],

if pandas_utils.check_is_pandas_dataframe(log):
case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME)
- control_flow_log = list(log.groupby(case_id_key)[activity_key].agg(list))
+ control_flow_log = [tuple(x) for x in log.groupby(case_id_key)[activity_key].agg(list).to_dict().values()]
else:
log = log_conversion.apply(log, parameters, log_conversion.TO_EVENT_LOG)
control_flow_log = log_util.log.project_traces(log, activity_key)
2 changes: 1 addition & 1 deletion pm4py/algo/evaluation/precision/utils.py
@@ -118,7 +118,7 @@ def get_log_prefixes(log, activity_key=xes_util.DEFAULT_NAME_KEY, case_id_key=co
prefix_count = Counter()

if pandas_utils.check_is_pandas_dataframe(log):
- traces = log.groupby(case_id_key)[activity_key].agg(list).to_numpy().tolist()
+ traces = [tuple(x) for x in log.groupby(case_id_key)[activity_key].agg(list).to_dict().values()]
else:
traces = [tuple(x[activity_key] for x in trace) for trace in log]

@@ -289,7 +289,7 @@ def filter_df_keeping_spno_activities(df: pd.DataFrame, activity_key: str = "con
df
Filtered dataframe
"""
- activity_values_dict = dict(df[activity_key].value_counts())
+ activity_values_dict = df[activity_key].value_counts().to_dict()
activity_values_ordered_list = []
for act in activity_values_dict:
activity_values_ordered_list.append([act, activity_values_dict[act]])
4 changes: 2 additions & 2 deletions pm4py/algo/filtering/pandas/cases/case_filter.py
@@ -50,7 +50,7 @@ def filter_on_ncases(df: pd.DataFrame, case_id_glue: str = constants.CASE_CONCEP
df
Filtered dataframe
"""
- cases_values_dict = dict(df[case_id_glue].value_counts())
+ cases_values_dict = df[case_id_glue].value_counts().to_dict()
cases_to_keep = []
for case in cases_values_dict:
cases_to_keep.append(case)
@@ -124,7 +124,7 @@ def filter_on_case_performance(df: pd.DataFrame, case_id_glue: str = constants.C
lambda x: soj_time_business_hours_diff(x[timestamp_key], x[timestamp_key + "_2"], business_hours_slots), axis=1)
else:
stacked_df['caseDuration'] = stacked_df[timestamp_key + "_2"] - stacked_df[timestamp_key]
- stacked_df['caseDuration'] = stacked_df['caseDuration'].dt.total_seconds()
+ stacked_df['caseDuration'] = pandas_utils.get_total_seconds(stacked_df['caseDuration'])
stacked_df = stacked_df[stacked_df['caseDuration'] <= max_case_performance]
stacked_df = stacked_df[stacked_df['caseDuration'] >= min_case_performance]
i1 = df.set_index(case_id_glue).index
3 changes: 1 addition & 2 deletions pm4py/algo/filtering/pandas/ltl/ltl_checker.py
@@ -96,8 +96,7 @@ def eventually_follows(df0: pd.DataFrame, attribute_values: List[str], parameter

if enable_timestamp:
for i in range(1, len(df_a)):
df_join["@@difftimestamp%d" % (i - 1)] = (
df_join[timestamp_key + "_%d" % i] - df_join[timestamp_key + '_%d' % (i-1)]).dt.total_seconds()
df_join["@@difftimestamp%d" % (i - 1)] = pandas_utils.get_total_seconds(df_join[timestamp_key + "_%d" % i] - df_join[timestamp_key + '_%d' % (i-1)])

if timestamp_diff_boundaries:
df_join = df_join[df_join["@@difftimestamp%d" % (i-1)] >= timestamp_diff_boundaries[i-1][0]]
2 changes: 1 addition & 1 deletion pm4py/algo/filtering/pandas/paths/paths_filter.py
@@ -131,7 +131,7 @@ def apply_performance(df: pd.DataFrame, provided_path: Tuple[str, str], paramete
stacked_df = pandas_utils.concat([filt_df, filt_dif_shifted], axis=1)
stacked_df["@@path"] = stacked_df[attribute_key] + DEFAULT_VARIANT_SEP + stacked_df[attribute_key + "_2"]
stacked_df = stacked_df[stacked_df["@@path"] == provided_path]
stacked_df["@@timedelta"] = (stacked_df[timestamp_key + "_2"] - stacked_df[timestamp_key]).dt.total_seconds()
stacked_df["@@timedelta"] = pandas_utils.get_total_seconds(stacked_df[timestamp_key + "_2"] - stacked_df[timestamp_key])
stacked_df = stacked_df[stacked_df["@@timedelta"] >= min_performance]
stacked_df = stacked_df[stacked_df["@@timedelta"] <= max_performance]
i1 = df.set_index(case_id_glue).index
@@ -16,7 +16,7 @@
'''
from enum import Enum
from pm4py.util import exec_utils
- from pm4py.util import xes_constants, constants
+ from pm4py.util import xes_constants, constants, pandas_utils
import pandas as pd
from typing import Dict, Optional, Any, Tuple
from pm4py.util.business_hours import soj_time_business_hours_diff
@@ -99,8 +99,7 @@ def build_network_analysis_from_link_analysis(merged_df: pd.DataFrame, parameter
business_hours_slots), axis=1)

else:
- merged_df[timestamp_diff_column] = (
-     merged_df[timestamp_column + "_in"] - merged_df[timestamp_column + "_out"]).dt.total_seconds()
+ merged_df[timestamp_diff_column] = pandas_utils.get_total_seconds(merged_df[timestamp_column + "_in"] - merged_df[timestamp_column + "_out"])

edges0 = merged_df.dropna(subset=[node_column_source + "_out", node_column_target + "_in", edge_column + edge_reference], how="any").groupby([node_column_source + "_out", node_column_target + "_in", edge_column + edge_reference])[
timestamp_diff_column].agg(list).to_dict()
@@ -506,7 +506,7 @@ def average_duration_activity(df: pd.DataFrame, t1: Union[datetime, str], t2: Un
df = df[df[timestamp_key] >= t1]
df = df[df[timestamp_key] < t2]

- return float((df[timestamp_key] - df[start_timestamp_key]).dt.total_seconds().mean())
+ return float(pandas_utils.get_total_seconds(df[timestamp_key] - df[start_timestamp_key]).mean())


def average_case_duration(df: pd.DataFrame, t1: Union[datetime, str], t2: Union[datetime, str], r: str,
2 changes: 1 addition & 1 deletion pm4py/algo/organizational_mining/roles/variants/pandas.py
@@ -54,6 +54,6 @@ def apply(df: pd.DataFrame, parameters: Optional[Dict[Union[str, Parameters], An

resource_key = exec_utils.get_param_value(Parameters.RESOURCE_KEY, parameters, xes.DEFAULT_RESOURCE_KEY)
activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes.DEFAULT_NAME_KEY)
- activity_resource_couples = Counter(dict(df.groupby([resource_key, activity_key]).size()))
+ activity_resource_couples = Counter(df.groupby([resource_key, activity_key]).size().to_dict())

return algorithm.apply(activity_resource_couples, parameters=parameters)
@@ -56,9 +56,9 @@ def apply(log: pd.DataFrame, parameters: Optional[Dict[Union[str, Parameters], A
resource_key = exec_utils.get_param_value(Parameters.RESOURCE_KEY, parameters, xes.DEFAULT_RESOURCE_KEY)
activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes.DEFAULT_NAME_KEY)

- activities = dict(log[activity_key].value_counts())
- resources = dict(log[resource_key].value_counts())
- activity_resource_couples = dict(log.groupby([resource_key, activity_key]).size())
+ activities = log[activity_key].value_counts().to_dict()
+ resources = log[resource_key].value_counts().to_dict()
+ activity_resource_couples = log.groupby([resource_key, activity_key]).size().to_dict()
activities_keys = sorted(list(activities.keys()))
resources_keys = sorted(list(resources.keys()))
rsc_act_matrix = np.zeros((len(resources_keys), len(activities_keys)))
@@ -90,7 +90,7 @@ def apply(log: Union[EventLog, EventStream, pd.DataFrame], parameters: Optional[
log = pandas_utils.insert_case_arrival_finish_rate(log, case_id_column=case_id_column, timestamp_column=timestamp_column, arrival_rate_column=arrival_rate, finish_rate_column=finish_rate)
log = pandas_utils.insert_case_service_waiting_time(log, case_id_column=case_id_column, timestamp_column=timestamp_column, diff_start_end_column=diff_start_end, service_time_column=service_time, sojourn_time_column=sojourn_time, waiting_time_column=waiting_time)

- grouped_log = log.groupby(pd.Grouper(key=start_timestamp_column, freq=grouper_freq))
+ grouped_log = log.groupby(pandas_utils.get_grouper(key=start_timestamp_column, freq=grouper_freq))

final_values = []

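
pd.Grouper is likewise wrapped behind pandas_utils.get_grouper. A hypothetical sketch of such a wrapper and of its use on the changed line (assumption: the helper simply forwards to pandas; the real implementation may differ):

import pandas as pd

def get_grouper(key=None, freq=None):
    # centralizes construction of the time-based grouper so callers need not import pandas directly
    return pd.Grouper(key=key, freq=freq)

# usage mirroring the changed line above:
# grouped_log = log.groupby(get_grouper(key=start_timestamp_column, freq=grouper_freq))
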
5 changes: 2 additions & 3 deletions pm4py/filtering.py
@@ -315,11 +315,11 @@ def filter_eventually_follows_relation(log: Union[EventLog, pd.DataFrame], relat
if retain:
cases = set()
else:
- cases = set(log[case_id_key])
+ cases = set(log[case_id_key].to_numpy().tolist())
for path in relations:
filt_log = ltl_checker.eventually_follows(log, path,
parameters=parameters)
- this_traces = set(filt_log[case_id_key])
+ this_traces = set(filt_log[case_id_key].to_numpy().tolist())
if retain:
cases = cases.union(this_traces)
else:
@@ -1005,7 +1005,6 @@ def filter_trace_segments(log: Union[EventLog, pd.DataFrame], admitted_traces: L
filtered_log = pm4py.filter_trace_segments(log, [["...", "check ticket", "decide", "reinitiate request", "..."]])
print(filtered_log)
"""
- if type(log) not in [pd.DataFrame, EventLog, EventStream]: raise Exception("the method can be applied only to a traditional event log!")
__event_log_deprecation_warning(log)

parameters = get_properties(log, activity_key=activity_key, timestamp_key=timestamp_key, case_id_key=case_id_key)
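
In the filter_eventually_follows_relation hunk above, the case-id sets are now built from an explicit NumPy-array-to-list conversion rather than by iterating the Series directly; on plain pandas both forms give the same set, as this toy illustration (hypothetical data) shows:

import pandas as pd

log = pd.DataFrame({"case:concept:name": ["c1", "c1", "c2"]})
assert set(log["case:concept:name"]) == set(log["case:concept:name"].to_numpy().tolist())  # {'c1', 'c2'}
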
7 changes: 4 additions & 3 deletions pm4py/objects/log/util/dataframe_utils.py
@@ -185,9 +185,10 @@ def convert_timestamp_columns_in_df(df, timest_format=None, timest_columns=None)
try:
df[col] = pandas_utils.dataframe_column_string_to_datetime(df[col], format=timest_format, exact=False, utc=True)
except:
- # traceback.print_exc()
- # print("exception converting column: "+str(col))
- pass
+ try:
+     df[col] = pandas_utils.dataframe_column_string_to_datetime(df[col], format=timest_format)
+ except:
+     pass

for col in df.columns:
if "date" in str(df[col].dtype) or "time" in str(df[col].dtype):
2 changes: 1 addition & 1 deletion pm4py/objects/log/util/pandas_numpy_variants.py
@@ -81,7 +81,7 @@ def apply(dataframe: pd.DataFrame, parameters=None) -> Tuple[Dict[Collection[str
if importlib.util.find_spec("cudf"):
case_variant = dataframe.groupby(case_id_key)[activity_key].agg(list).to_dict()
case_variant = {x: tuple(y) for x, y in case_variant.items()}
- variants_counter = Counter(case_variant.items())
+ variants_counter = Counter(case_variant.values())
else:
variants_counter = Counter()
cases = dataframe[case_id_key].to_numpy()
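
This change is a behavioral fix rather than a refactor: counting variants must aggregate over the variant tuples, not over (case id, variant) pairs, which are unique per case. A minimal illustration with hypothetical data:

from collections import Counter

case_variant = {"c1": ("A", "B"), "c2": ("A", "B"), "c3": ("A",)}

wrong = Counter(case_variant.items())   # every (case, variant) pair occurs once -> all counts are 1
right = Counter(case_variant.values())  # Counter({('A', 'B'): 2, ('A',): 1})
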
2 changes: 1 addition & 1 deletion pm4py/ocel.py
@@ -161,7 +161,7 @@ def ocel_objects_summary(ocel: OCEL) -> pd.DataFrame:
objects_summary = act_comb.join(lif_start_tim)
objects_summary = objects_summary.join(lif_end_tim)
objects_summary = objects_summary.reset_index()
objects_summary["lifecycle_duration"] = (objects_summary["lifecycle_end"] - objects_summary["lifecycle_start"]).dt.total_seconds()
objects_summary["lifecycle_duration"] = pandas_utils.get_total_seconds(objects_summary["lifecycle_end"] - objects_summary["lifecycle_start"])
ev_rel_obj = ocel.relations.groupby(ocel.event_id_column)[ocel.object_id_column].agg(list).to_dict()
objects_ids = pandas_utils.format_unique(ocel.objects[ocel.object_id_column].unique())
graph = {o: set() for o in objects_ids}