Skip to content

Commit

Permalink
Re-structure and document code
Browse files Browse the repository at this point in the history
  • Loading branch information
zazass8 committed Oct 9, 2024
1 parent 38fd54d commit d061a9f
Show file tree
Hide file tree
Showing 4 changed files with 189 additions and 142 deletions.
201 changes: 119 additions & 82 deletions mlxtend/frequent_patterns/association_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,17 @@
"leverage",
"conviction",
"zhangs_metric",
# "jaccard",
# "certainty",
# "kulczynski"
"jaccard",
"certainty",
"kulczynski"
]


def association_rules(
df: pd.DataFrame,
df_or: pd.DataFrame,
num_itemsets: int,
disabled: np.ndarray,
null_values=False,
metric="confidence",
min_threshold=0.8,
support_only=False,
Expand All @@ -48,6 +48,15 @@ def association_rules(
pandas DataFrame of frequent itemsets
with columns ['support', 'itemsets']
df_or : pandas DataFrame
DataFrame with original input data
num_itemsets : int
Number of transactions in original input data
null_values : bool (default: False)
Set to True if the original input data contains null values (NaNs)
metric : string (default: 'confidence')
Metric to evaluate if a rule is of interest.
**Automatically set to 'support' if `support_only=True`.**
Expand Down Expand Up @@ -115,7 +124,7 @@ def association_rules(
columns 'support' and 'itemsets'"
)

def kulczynski_helper(sAC, sA, sC):
def kulczynski_helper(sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_):
conf_AC = sAC / sA
conf_CA = sAC / sC
kulczynski = (conf_AC + conf_CA) / 2
Expand All @@ -128,31 +137,31 @@ def conviction_helper(conf, sC):
conf = conf[np.newaxis]
sC = sC[np.newaxis]
conviction[:] = np.inf
conviction[conf < 1.0] = (1.0 - sC[conf < 1.0]) / (
1.0 - conf[conf < 1.0]
)
conviction[conf < 1.0] = (1.0 - sC[conf < 1.0]) / (1.0 - conf[conf < 1.0])

return conviction

def zhangs_metric_helper(sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_):
denominator = np.maximum(sAC * (1 - sA), sA * (sC - sAC))
numerator = metric_dict["leverage"](sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_)
numerator = metric_dict["leverage"](
sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_
)

with np.errstate(divide="ignore", invalid="ignore"):
# ignoring the divide by 0 warning since it is addressed in the below np.where
zhangs_metric = np.where(denominator == 0, 0, numerator / denominator)

return zhangs_metric

def jaccard_metric_helper(sAC, sA, sC):
numerator = metric_dict["support"](sAC, sA, sC)
def jaccard_metric_helper(sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_):
numerator = metric_dict["support"](sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_)
denominator = sA + sC - numerator

jaccard_metric = numerator / denominator
return jaccard_metric

def certainty_metric_helper(sAC, sA, sC):
certainty_num = metric_dict["confidence"](sAC, sA, sC) - sC
def certainty_metric_helper(sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_):
certainty_num = metric_dict["confidence"](sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_) - sC
certainty_denom = 1 - sC

cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)
Expand All @@ -163,15 +172,34 @@ def certainty_metric_helper(sAC, sA, sC):
"antecedent support": lambda _, sA, ___, ____, _____, ______, _______, ________: sA,
"consequent support": lambda _, __, sC, ____, _____, ______, _______, ________: sC,
"support": lambda sAC, _, __, ___, ____, _____, ______, _______: sAC,
"confidence": lambda sAC, sA, _, disAC, disA, __, dis_int, ___: (sAC*(num_itemsets - disAC)) / (sA*(num_itemsets - disA) - dis_int),
"lift": lambda sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_: metric_dict["confidence"](sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_) / (sC*(num_itemsets - disC) - dis_int_),
"representativity": lambda _, __, disAC, ____, ___, ______, _______, ________ : (num_itemsets-disAC)/num_itemsets,
"leverage": lambda sAC, sA, sC, _, __, ____, _____, ______: metric_dict["support"](sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_) - sA * sC,
"conviction": lambda sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_: conviction_helper(metric_dict["confidence"](sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_), sC),
"zhangs_metric": lambda sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_: zhangs_metric_helper(sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_),
# "jaccard": lambda sAC, sA, sC: jaccard_metric_helper(sAC, sA, sC),
# "certainty": lambda sAC, sA, sC: certainty_metric_helper(sAC, sA, sC),
# "kulczynski": lambda sAC, sA, sC: kulczynski_helper(sAC, sA, sC),
"confidence": lambda sAC, sA, _, disAC, disA, __, dis_int, ___: (
sAC * (num_itemsets - disAC)
)
/ (sA * (num_itemsets - disA) - dis_int),
"lift": lambda sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_: metric_dict[
"confidence"
](sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_)
/ sC,
"representativity": lambda _, __, ___, disAC, ____, ______, _______, ________: (
num_itemsets - disAC
)
/ num_itemsets,
"leverage": lambda sAC, sA, sC, _, __, ____, _____, ______: metric_dict[
"support"
](sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_)
- sA * sC,
"conviction": lambda sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_: conviction_helper(
metric_dict["confidence"](
sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_
),
sC,
),
"zhangs_metric": lambda sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_: zhangs_metric_helper(
sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_
),
"jaccard": lambda sAC, sA, sC, _, __, ____, _____, ______: jaccard_metric_helper(sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_),
"certainty": lambda sAC, sA, sC, _, __, ____, _____, ______: certainty_metric_helper(sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_),
"kulczynski": lambda sAC, sA, sC, _, __, ____, _____, ______: kulczynski_helper(sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_),
}

# check for metric compliance
Expand All @@ -194,9 +222,14 @@ def certainty_metric_helper(sAC, sA, sC):
rule_consequents = []
rule_supports = []

# assign columns from original df to be the same on the disabled.
disabled = pd.DataFrame(disabled)
disabled.columns = df_or.columns
# Start `disabled` as a copy of the original data; when null_values is set, replace it with a mask (1 where a value is NaN, NaN elsewhere) that keeps the original column labels.
disabled = df_or.copy()
if null_values:
disabled = np.where(pd.isna(disabled), 1, np.nan) + np.where(
(disabled == 0) | (disabled == 1), np.nan, 0
)
disabled = pd.DataFrame(disabled)
disabled.columns = df_or.columns

# iterate over all frequent itemsets
for k in frequent_items_dict.keys():
Expand All @@ -219,57 +252,56 @@ def certainty_metric_helper(sAC, sA, sC):
sA = frequent_items_dict[antecedent]
sC = frequent_items_dict[consequent]

an=list(antecedent)
con=list(consequent)
an.extend(con)

dec=disabled.loc[:,an]
_dec=disabled.loc[:,list(antecedent)]
__dec=disabled.loc[:,list(consequent)]
dec_=df_or.loc[:,list(antecedent)]
dec__=df_or.loc[:,list(consequent)]

disAC=0
disA=0
disC=0
dis_int=0
dis_int_=0
for i in range(len(dec.index)):
v=list(dec.iloc[i,:])
x=list(_dec.iloc[i,:])
y=list(__dec.iloc[i,:])
z=list(dec_.iloc[i,:])
w=list(dec__.iloc[i,:])
# if (1 in x) and all(x=='?' for x in y): ##
if 1 in set(v):
disAC+=1
if 1 in set(x):
disA+=1
if 1 in y:
disC+=1

# for i in range(len(_dec.index)):
# x=list(_dec.iloc[i,:])
# y=list(__dec.iloc[i,:])
# if 1 in set(x):
# disA+=1
# if 1 in y:
# disC+=1

# for i in range(len(__dec.index)):
# x=list(__dec.iloc[i,:])
# y=list(_dec.iloc[i,:])
# z=list(dec_.iloc[i,:])
# w=list(dec__.iloc[i,:])
# if (1 in x) and (True in np.isnan(y)) and (0 not in z):

# if (1 in x) and ((0 not in z) and ('?' not in z)):
# if (1 in x) and ((0 not in z) and (1 in z)):
if (1 in y) and all(j==1 for j in z):
dis_int+=1
# if (1 in y) and ((0 not in w) and ('?' not in w)):
if (1 in x) and all(j==1 for j in w):
dis_int_+=1
# if the input dataframe is complete
if not null_values:
disAC, disA, disC, dis_int, dis_int_ = 0, 0, 0, 0, 0
num_itemsets = 1

else:
an = list(antecedent)
con = list(consequent)
an.extend(con)

# select data of antecedent, consequent and combined from disabled
dec = disabled.loc[:, an]
_dec = disabled.loc[:, list(antecedent)]
__dec = disabled.loc[:, list(consequent)]

# select data of antecedent and consequent from original
dec_ = df_or.loc[:, list(antecedent)]
dec__ = df_or.loc[:, list(consequent)]

# disabled counts
disAC, disA, disC, dis_int, dis_int_ = 0, 0, 0, 0, 0
for i in range(len(dec.index)):
# select the i-th iset from the disabled dataset
item_comb = list(dec.iloc[i, :])
item_dis_an = list(_dec.iloc[i, :])
item_dis_con = list(__dec.iloc[i, :])

# select the i-th iset from the original dataset
item_or_an = list(dec_.iloc[i, :])
item_or_con = list(dec__.iloc[i, :])

# check and keep count if there is a null value in combined, antecedent, consequent
if 1 in set(item_comb):
disAC += 1
if 1 in set(item_dis_an):
disA += 1
if 1 in item_dis_con:
disC += 1

# check and keep count if there is a null value in consequent AND all items are present in antecedent
if (1 in item_dis_con) and all(
j == 1 for j in item_or_an
):
dis_int += 1

# check and keep count if there is a null value in antecedent AND all items are present in consequent
if (1 in item_dis_an) and all(
j == 1 for j in item_or_con
):
dis_int_ += 1

except KeyError as e:
s = (
Expand All @@ -283,12 +315,15 @@ def certainty_metric_helper(sAC, sA, sC):
raise KeyError(s)
# check for the threshold

score = metric_dict[metric](sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_)
# print(score)
score = metric_dict[metric](
sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_
)
if score >= min_threshold:
rule_antecedents.append(antecedent)
rule_consequents.append(consequent)
rule_supports.append([sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_])
rule_supports.append(
[sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_]
)

# check if frequent rule was generated
if not rule_supports:
Expand Down Expand Up @@ -316,9 +351,11 @@ def certainty_metric_helper(sAC, sA, sC):
disA = rule_supports[4]
disC = rule_supports[5]
dis_int = rule_supports[6]
dis_int_= rule_supports[7]
dis_int_ = rule_supports[7]

for m in return_metrics:
df_res[m] = metric_dict[m](sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_)
df_res[m] = metric_dict[m](
sAC, sA, sC, disAC, disA, disC, dis_int, dis_int_
)

return df_res
Loading

0 comments on commit d061a9f

Please sign in to comment.