From 60a878159bc110237bb3946cbc6ac1bba2dccbc3 Mon Sep 17 00:00:00 2001 From: Blaz Mramor Date: Tue, 12 Mar 2024 15:07:01 +0100 Subject: [PATCH 01/18] LR with priors initial implementation --- outrank/algorithms/importance_estimator.py | 33 ++++++++++++++-------- outrank/core_ranking.py | 6 +++- outrank/core_utils.py | 5 +++- outrank/task_selftest.py | 2 +- 4 files changed, 31 insertions(+), 15 deletions(-) diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py index a228c4a..0b605ea 100644 --- a/outrank/algorithms/importance_estimator.py +++ b/outrank/algorithms/importance_estimator.py @@ -11,7 +11,7 @@ import pandas as pd from scipy.stats import pearsonr from sklearn.feature_selection import mutual_info_classif -from sklearn.linear_model import LogisticRegression +from sklearn.linear_model import LogisticRegression, SGDClassifier from sklearn.metrics import adjusted_mutual_info_score from sklearn.model_selection import cross_val_score from sklearn.preprocessing import OneHotEncoder @@ -38,11 +38,11 @@ def sklearn_MI(vector_first: Any, vector_second: Any) -> float: def sklearn_surrogate( - vector_first: Any, vector_second: Any, surrogate_model: str, + vector_first: Any, vector_second: Any, X: Any, surrogate_model: str ) -> float: - if surrogate_model == 'surrogate-LR': + if 'surrogate-LR' in surrogate_model: clf = LogisticRegression(max_iter=100000) - elif surrogate_model == 'surrogate-SVM': + elif 'surrogate-SVM' in surrogate_model: clf = SVC(gamma='auto', probability=True) transf = OneHotEncoder() @@ -57,17 +57,22 @@ def sklearn_surrogate( unique_values, counts = np.unique(vector_second, return_counts=True) # Establish min support for this type of ranking. - if counts[0] < len(unique_values) * (2**5): - estimate_feature_importance = 0 + # if counts[0] < len(unique_values) * (2**5): + # estimate_feature_importance = 0 - else: + if X.shape[0] == 0 and X.shape[1] == 0: vector_first = transf.fit_transform(vector_first.reshape(-1, 1)) estimate_feature_importance_list = cross_val_score( clf, vector_first, vector_second, scoring='neg_log_loss', cv=4, ) - - estimate_feature_importance = 1 + \ - np.median(estimate_feature_importance_list) + else: + X = np.concatenate((X,vector_first.reshape(-1, 1)), axis=1) + X = transf.fit_transform(X) + estimate_feature_importance_list = cross_val_score( + clf, X, vector_second, scoring='neg_log_loss', cv=4, + ) + estimate_feature_importance = 1 + \ + np.median(estimate_feature_importance_list) return estimate_feature_importance @@ -97,7 +102,7 @@ def sklearn_mi_adj(vector_first, vector_second): return estimate_feature_importance -def get_importances_estimate_pairwise(combination, args, tmp_df): +def get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df): """A method for parallel importances estimation. 
As interaction scoring is independent, individual scores can be computed in parallel.""" feature_one = combination[0] @@ -122,8 +127,12 @@ def get_importances_estimate_pairwise(combination, args, tmp_df): estimate_feature_importance = sklearn_MI(vector_first, vector_second) elif 'surrogate-' in args.heuristic: + X = np.array(float) + if ('-prior' in args.heuristic) and (len(reference_model_features) > 0): + X = tmp_df[reference_model_features].values + estimate_feature_importance = sklearn_surrogate( - vector_first, vector_second, args.heuristic, + vector_first, vector_second, X, args.heuristic ) elif 'MI-numba' in args.heuristic: diff --git a/outrank/core_ranking.py b/outrank/core_ranking.py index 39843f7..4806128 100644 --- a/outrank/core_ranking.py +++ b/outrank/core_ranking.py @@ -130,9 +130,13 @@ def mixed_rank_graph( # Map the scoring calls to the worker pool pbar.set_description('Allocating thread pool') + reference_model_features = {} + if 'prior' in args.heuristic: + reference_model_features = [(" AND ").join(item.split(",")) for item in extract_features_from_reference_JSON(args.reference_model_JSON, full_feature_space = True)] + # starmap is an alternative that is slower unfortunately (but nicer) def get_grounded_importances_estimate(combination: tuple[str]) -> Any: - return get_importances_estimate_pairwise(combination, args, tmp_df=tmp_df) + return get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df=tmp_df) start_enc_timer = timer() with cpu_pool as p: diff --git a/outrank/core_utils.py b/outrank/core_utils.py index 0136d42..0845b59 100644 --- a/outrank/core_utils.py +++ b/outrank/core_utils.py @@ -393,7 +393,7 @@ def parse_csv_raw(data_path) -> DatasetInformationStorage: ) -def extract_features_from_reference_JSON(json_path: str, combined_features_only = False) -> set[Any]: +def extract_features_from_reference_JSON(json_path: str, combined_features_only = False, full_feature_space = False) -> set[Any]: """Given a model's JSON, extract unique features""" with open(json_path) as jp: @@ -401,6 +401,9 @@ def extract_features_from_reference_JSON(json_path: str, combined_features_only unique_features = set() feature_space = content['desc'].get('features', []) + if full_feature_space: + return set(feature_space) + fields_space = content['desc'].get('fields', []) joint_space = feature_space + fields_space diff --git a/outrank/task_selftest.py b/outrank/task_selftest.py index 9b5ff6f..2cb188e 100644 --- a/outrank/task_selftest.py +++ b/outrank/task_selftest.py @@ -22,7 +22,7 @@ def conduct_self_test(): 'outrank --task data_generator --num_synthetic_rows 100000', shell=True, ) subprocess.run( - 'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --combination_number_upper_bound 60;', + 'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --combination_number_upper_bound 60 --heuristic surrogate-LR-prior --reference_model_JSON tests/test_ref_model.json;', shell=True, ) From 7630de496eb4120e5195c1a8aaae5845c8e257dc Mon Sep 17 00:00:00 2001 From: Blaz Mramor Date: Wed, 13 Mar 2024 09:10:07 +0100 Subject: [PATCH 02/18] add sgd --- outrank/algorithms/importance_estimator.py | 2 ++ outrank/task_selftest.py | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py index 0b605ea..02777ce 100644 --- a/outrank/algorithms/importance_estimator.py +++ b/outrank/algorithms/importance_estimator.py @@ -44,6 +44,8 @@ 
def sklearn_surrogate( clf = LogisticRegression(max_iter=100000) elif 'surrogate-SVM' in surrogate_model: clf = SVC(gamma='auto', probability=True) + elif 'surrogate-SGD' in surrogate_model: + clf = SGDClassifier(max_iter=100000, loss='log_loss') transf = OneHotEncoder() diff --git a/outrank/task_selftest.py b/outrank/task_selftest.py index 2cb188e..e27819b 100644 --- a/outrank/task_selftest.py +++ b/outrank/task_selftest.py @@ -22,7 +22,7 @@ def conduct_self_test(): 'outrank --task data_generator --num_synthetic_rows 100000', shell=True, ) subprocess.run( - 'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --combination_number_upper_bound 60 --heuristic surrogate-LR-prior --reference_model_JSON tests/test_ref_model.json;', + 'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --combination_number_upper_bound 60 --heuristic surrogate-SGD-prior --reference_model_JSON tests/test_ref_model.json;', shell=True, ) @@ -40,3 +40,7 @@ def conduct_self_test(): shutil.rmtree(path) logger.info('All tests passed, OutRank seems in shape \N{winking face}') + +if __name__ == '__main__': + conduct_self_test() + From 0e6e204aea63467369e3e0b987e7b946b590922a Mon Sep 17 00:00:00 2001 From: Blaz Mramor Date: Fri, 15 Mar 2024 11:45:40 +0100 Subject: [PATCH 03/18] adding reference model json for tests --- tests/test_ref_model.json | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 tests/test_ref_model.json diff --git a/tests/test_ref_model.json b/tests/test_ref_model.json new file mode 100644 index 0000000..6c36715 --- /dev/null +++ b/tests/test_ref_model.json @@ -0,0 +1,5 @@ +{ + "desc": { + "features": ["f0","f1","f0,f1"] + } +} \ No newline at end of file From 289f8eb9cfa4a112fbefaf3be9edebafc59531c8 Mon Sep 17 00:00:00 2001 From: bmramor Date: Tue, 19 Mar 2024 10:36:34 +0000 Subject: [PATCH 04/18] cleaning up --- examples/run_ranking_prior.sh | 21 ++++++++++++ outrank/algorithms/importance_estimator.py | 39 +++++++++++++--------- outrank/core_ranking.py | 3 +- outrank/core_utils.py | 7 ++++ outrank/task_selftest.py | 2 +- 5 files changed, 55 insertions(+), 17 deletions(-) create mode 100644 examples/run_ranking_prior.sh diff --git a/examples/run_ranking_prior.sh b/examples/run_ranking_prior.sh new file mode 100644 index 0000000..15a8642 --- /dev/null +++ b/examples/run_ranking_prior.sh @@ -0,0 +1,21 @@ +########################################################################################################## +# A very generic OutRank invocation (default). It includes visualizations and other relevant statistics. # +########################################################################################################## + +# This run compares features "one-at-a-time" and summarizes, visualizes the outputs. 
+# hint - if unsure what parameters do, you can always run "outrank --help" + +outrank \ + --task all \ + --data_path $PATH_TO_YOUR_DATA \ + --data_source ob-csv \ + --heuristic surrogate-SGD-prior \ + --target_ranking_only True \ + --interaction_order 1 \ + --combination_number_upper_bound 2048 \ + --num_threads 12 \ + --output_folder ./some_output_folder \ + --subsampling 1 \ + --minibatch_size 10000 \ + --label_column info_click_valid \ + --reference_model_JSON $PATH_TO_YOUR_REFERENCE_MODEL diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py index 02777ce..b9c2e52 100644 --- a/outrank/algorithms/importance_estimator.py +++ b/outrank/algorithms/importance_estimator.py @@ -17,9 +17,14 @@ from sklearn.preprocessing import OneHotEncoder from sklearn.svm import SVC +from outrank.core_utils import is_prior_heuristic + + logger = logging.getLogger('syn-logger') logger.setLevel(logging.DEBUG) +num_folds = 4 + try: from outrank.algorithms.feature_ranking import ranking_mi_numba @@ -40,13 +45,9 @@ def sklearn_MI(vector_first: Any, vector_second: Any) -> float: def sklearn_surrogate( vector_first: Any, vector_second: Any, X: Any, surrogate_model: str ) -> float: - if 'surrogate-LR' in surrogate_model: - clf = LogisticRegression(max_iter=100000) - elif 'surrogate-SVM' in surrogate_model: - clf = SVC(gamma='auto', probability=True) - elif 'surrogate-SGD' in surrogate_model: - clf = SGDClassifier(max_iter=100000, loss='log_loss') - + + clf = initialize_classifier(surrogate_model) + transf = OneHotEncoder() # They do not commute, swap if needed @@ -58,20 +59,16 @@ def sklearn_surrogate( unique_values, counts = np.unique(vector_second, return_counts=True) - # Establish min support for this type of ranking. - # if counts[0] < len(unique_values) * (2**5): - # estimate_feature_importance = 0 - if X.shape[0] == 0 and X.shape[1] == 0: vector_first = transf.fit_transform(vector_first.reshape(-1, 1)) estimate_feature_importance_list = cross_val_score( - clf, vector_first, vector_second, scoring='neg_log_loss', cv=4, + clf, vector_first, vector_second, scoring='neg_log_loss', cv=num_folds, ) else: - X = np.concatenate((X,vector_first.reshape(-1, 1)), axis=1) + X = np.concatenate((X, vector_first.reshape(-1, 1)), axis=1) X = transf.fit_transform(X) estimate_feature_importance_list = cross_val_score( - clf, X, vector_second, scoring='neg_log_loss', cv=4, + clf, X, vector_second, scoring='neg_log_loss', cv=num_folds, ) estimate_feature_importance = 1 + \ np.median(estimate_feature_importance_list) @@ -130,7 +127,7 @@ def get_importances_estimate_pairwise(combination, reference_model_features, arg elif 'surrogate-' in args.heuristic: X = np.array(float) - if ('-prior' in args.heuristic) and (len(reference_model_features) > 0): + if is_prior_heuristic(args) and (len(reference_model_features) > 0): X = tmp_df[reference_model_features].values estimate_feature_importance = sklearn_surrogate( @@ -224,3 +221,15 @@ def get_importances_estimate_nonmyopic(args: Any, tmp_df: pd.DataFrame): # TODO - nonmyopic algorithms - tmp_df \ args.label vs. 
label # TODO - this is to be executed directly on df - no need for parallel kernel(s) pass + + +def initialize_classifier(surrogate_model: string): + if 'surrogate-LR' in surrogate_model: + return LogisticRegression(max_iter=100000) + elif 'surrogate-SVM' in surrogate_model: + return SVC(gamma='auto', probability=True) + elif 'surrogate-SGD' in surrogate_model: + return SGDClassifier(max_iter=100000, loss='log_loss') + else: + logging.warning(f'The chosen surrogate model {surrogate_model} is not supported, falling back to surrogate-SGD') + return SGDClassifier(max_iter=100000, loss='log_loss') diff --git a/outrank/core_ranking.py b/outrank/core_ranking.py index 4806128..1722991 100644 --- a/outrank/core_ranking.py +++ b/outrank/core_ranking.py @@ -32,6 +32,7 @@ from outrank.core_utils import internal_hash from outrank.core_utils import NominalFeatureSummary from outrank.core_utils import NumericFeatureSummary +from outrank.core_utils import is_prior_heuristic from outrank.feature_transformations.ranking_transformers import FeatureTransformerGeneric from outrank.feature_transformations.ranking_transformers import FeatureTransformerNoise @@ -131,7 +132,7 @@ def mixed_rank_graph( pbar.set_description('Allocating thread pool') reference_model_features = {} - if 'prior' in args.heuristic: + if is_prior_heuristic(args): reference_model_features = [(" AND ").join(item.split(",")) for item in extract_features_from_reference_JSON(args.reference_model_JSON, full_feature_space = True)] # starmap is an alternative that is slower unfortunately (but nicer) diff --git a/outrank/core_utils.py b/outrank/core_utils.py index 0845b59..0680008 100644 --- a/outrank/core_utils.py +++ b/outrank/core_utils.py @@ -644,3 +644,10 @@ def summarize_rare_counts( final_df.to_csv( f'{args.output_folder}/feature_sparsity_summary.tsv', index=False, sep='\t', ) + + +def is_prior_heuristic(args: Any): + if "-prior" in args.heuristic and args.reference_model_JSON and args.reference_model_JSON != "": + return True + return False + diff --git a/outrank/task_selftest.py b/outrank/task_selftest.py index e27819b..0e36d48 100644 --- a/outrank/task_selftest.py +++ b/outrank/task_selftest.py @@ -22,7 +22,7 @@ def conduct_self_test(): 'outrank --task data_generator --num_synthetic_rows 100000', shell=True, ) subprocess.run( - 'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --combination_number_upper_bound 60 --heuristic surrogate-SGD-prior --reference_model_JSON tests/test_ref_model.json;', + 'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --combination_number_upper_bound 60;', shell=True, ) From 54dd256f95979a8ed3becbb41725afbdd300397a Mon Sep 17 00:00:00 2001 From: bmramor Date: Tue, 19 Mar 2024 10:45:13 +0000 Subject: [PATCH 05/18] typing bug --- outrank/algorithms/importance_estimator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py index b9c2e52..4cd0fbc 100644 --- a/outrank/algorithms/importance_estimator.py +++ b/outrank/algorithms/importance_estimator.py @@ -223,7 +223,7 @@ def get_importances_estimate_nonmyopic(args: Any, tmp_df: pd.DataFrame): pass -def initialize_classifier(surrogate_model: string): +def initialize_classifier(surrogate_model: str): if 'surrogate-LR' in surrogate_model: return LogisticRegression(max_iter=100000) elif 'surrogate-SVM' in surrogate_model: From c632690531c8132c20f3d1218d49b2a8e2f9bb6e Mon Sep 17 00:00:00 2001 From: bmramor Date: 
Tue, 19 Mar 2024 14:16:44 +0000 Subject: [PATCH 06/18] support for combined features ranking --- examples/run_ranking_prior.sh | 4 ++-- outrank/core_ranking.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/run_ranking_prior.sh b/examples/run_ranking_prior.sh index 15a8642..4421d0a 100644 --- a/examples/run_ranking_prior.sh +++ b/examples/run_ranking_prior.sh @@ -11,11 +11,11 @@ outrank \ --data_source ob-csv \ --heuristic surrogate-SGD-prior \ --target_ranking_only True \ - --interaction_order 1 \ + --interaction_order 2 \ --combination_number_upper_bound 2048 \ --num_threads 12 \ --output_folder ./some_output_folder \ - --subsampling 1 \ + --subsampling 100 \ --minibatch_size 10000 \ --label_column info_click_valid \ --reference_model_JSON $PATH_TO_YOUR_REFERENCE_MODEL diff --git a/outrank/core_ranking.py b/outrank/core_ranking.py index 1722991..60aa90a 100644 --- a/outrank/core_ranking.py +++ b/outrank/core_ranking.py @@ -197,12 +197,14 @@ def compute_combined_features( if args.reference_model_JSON != '': combined_features = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True) full_combination_space = [combination.split(',') for combination in combined_features] + if is_prior_heuristic(args): + full_combination_space = list(set(full_combination_space) | set(itertools.combinations(all_columns, interaction_order))) else: full_combination_space = list( itertools.combinations(all_columns, interaction_order), ) - if args.combination_number_upper_bound and args.reference_model_JSON != '': + if args.combination_number_upper_bound: random.shuffle(full_combination_space) full_combination_space = full_combination_space[ : args.combination_number_upper_bound From 5e5380305e6a0b49446632a5a66f4a2101970179 Mon Sep 17 00:00:00 2001 From: bmramor Date: Wed, 20 Mar 2024 13:28:19 +0000 Subject: [PATCH 07/18] combinations for priors --- outrank/algorithms/importance_estimator.py | 2 +- outrank/core_ranking.py | 39 +++++++++++++++------- outrank/task_selftest.py | 2 +- 3 files changed, 29 insertions(+), 14 deletions(-) diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py index 4cd0fbc..dd28823 100644 --- a/outrank/algorithms/importance_estimator.py +++ b/outrank/algorithms/importance_estimator.py @@ -59,7 +59,7 @@ def sklearn_surrogate( unique_values, counts = np.unique(vector_second, return_counts=True) - if X.shape[0] == 0 and X.shape[1] == 0: + if X.size <= 1: vector_first = transf.fit_transform(vector_first.reshape(-1, 1)) estimate_feature_importance_list = cross_val_score( clf, vector_first, vector_second, scoring='neg_log_loss', cv=num_folds, diff --git a/outrank/core_ranking.py b/outrank/core_ranking.py index 60aa90a..2976df9 100644 --- a/outrank/core_ranking.py +++ b/outrank/core_ranking.py @@ -119,6 +119,10 @@ def mixed_rank_graph( combinations = prior_combinations_sample(combinations, args) random.shuffle(combinations) + reference_model_features = {} + if is_prior_heuristic(args): + reference_model_features = [(" AND ").join(tuple(sorted(item.split(",")))) for item in extract_features_from_reference_JSON(args.reference_model_JSON, full_feature_space = True)] + if args.heuristic == 'Constant': final_constant_imp = [] for c1, c2 in combinations: @@ -131,10 +135,6 @@ def mixed_rank_graph( # Map the scoring calls to the worker pool pbar.set_description('Allocating thread pool') - reference_model_features = {} - if is_prior_heuristic(args): - reference_model_features = [(" AND 
").join(item.split(",")) for item in extract_features_from_reference_JSON(args.reference_model_JSON, full_feature_space = True)] - # starmap is an alternative that is slower unfortunately (but nicer) def get_grounded_importances_estimate(combination: tuple[str]) -> Any: return get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df=tmp_df) @@ -194,21 +194,36 @@ def compute_combined_features( join_string = ' AND_REL ' if is_3mr else ' AND ' interaction_order = 2 if is_3mr else args.interaction_order - if args.reference_model_JSON != '': - combined_features = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True) - full_combination_space = [combination.split(',') for combination in combined_features] - if is_prior_heuristic(args): - full_combination_space = list(set(full_combination_space) | set(itertools.combinations(all_columns, interaction_order))) + model_combinations = [] + if is_prior_heuristic(args): + model_combinations = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True) + model_combinations = [tuple(sorted(combination.split(','))) for combination in model_combinations] + feature_combination_space = [] + if args.interaction_order > 1: + feature_combination_space = list( + itertools.combinations(all_columns, interaction_order), + ) + + full_combination_space = feature_combination_space + [tuple for tuple in model_combinations if tuple not in feature_combination_space] + del feature_combination_space else: - full_combination_space = list( - itertools.combinations(all_columns, interaction_order), - ) + if args.reference_model_JSON != '': + model_combinations = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True) + model_combinations = [tuple(sorted(combination.split(','))) for combination in model_combinations] + full_combination_space = [combination.split(',') for combination in model_combinations] + else: + full_combination_space = list( + itertools.combinations(all_columns, interaction_order), + ) if args.combination_number_upper_bound: random.shuffle(full_combination_space) full_combination_space = full_combination_space[ : args.combination_number_upper_bound ] + if is_prior_heuristic(args): + full_combination_space = full_combination_space + [tuple for tuple in model_combinations if tuple not in full_combination_space] + com_counter = 0 new_feature_hash = {} diff --git a/outrank/task_selftest.py b/outrank/task_selftest.py index 0e36d48..a63abd5 100644 --- a/outrank/task_selftest.py +++ b/outrank/task_selftest.py @@ -22,7 +22,7 @@ def conduct_self_test(): 'outrank --task data_generator --num_synthetic_rows 100000', shell=True, ) subprocess.run( - 'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --combination_number_upper_bound 60;', + 'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --combination_number_upper_bound 60 --heuristic surrogate-SGD-prior --reference_model_JSON tests/test_ref_model.json --interaction_order 2;', shell=True, ) From 92408bb0240d8c09c68152ddd6d3f4c7c9c63fa7 Mon Sep 17 00:00:00 2001 From: bmramor Date: Thu, 21 Mar 2024 09:40:41 +0000 Subject: [PATCH 08/18] remove a bug for non-prior surrogate --- .../algorithms/synthetic_data_generators/generator_naive.py | 3 ++- outrank/core_ranking.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/outrank/algorithms/synthetic_data_generators/generator_naive.py 
b/outrank/algorithms/synthetic_data_generators/generator_naive.py index 23c5f8a..d0606e6 100644 --- a/outrank/algorithms/synthetic_data_generators/generator_naive.py +++ b/outrank/algorithms/synthetic_data_generators/generator_naive.py @@ -13,7 +13,8 @@ def generate_random_matrix(num_features=100, size=20000): target = sample[:, 30] # Some noise - target[target < 20] = 0 + target[target < 40] = 0 + target[target > 39] = 0 return sample, target diff --git a/outrank/core_ranking.py b/outrank/core_ranking.py index 2976df9..c2b70da 100644 --- a/outrank/core_ranking.py +++ b/outrank/core_ranking.py @@ -210,7 +210,7 @@ def compute_combined_features( if args.reference_model_JSON != '': model_combinations = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True) model_combinations = [tuple(sorted(combination.split(','))) for combination in model_combinations] - full_combination_space = [combination.split(',') for combination in model_combinations] + full_combination_space = model_combinations else: full_combination_space = list( itertools.combinations(all_columns, interaction_order), @@ -707,6 +707,7 @@ def estimate_importances_minibatches( logger, local_pbar, ) + print(importances_batch) bounds_storage_batch.append(bounds_storage) memory_storage_batch.append(memory_storage) From 31d3dd575b0e4388e431e7275d43097b36ff4562 Mon Sep 17 00:00:00 2001 From: bmramor Date: Thu, 21 Mar 2024 21:36:38 +0000 Subject: [PATCH 09/18] some more bug handling --- outrank/core_ranking.py | 13 ++++--------- outrank/task_selftest.py | 14 +++++--------- 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/outrank/core_ranking.py b/outrank/core_ranking.py index c2b70da..dc56aaa 100644 --- a/outrank/core_ranking.py +++ b/outrank/core_ranking.py @@ -116,8 +116,8 @@ def mixed_rank_graph( out_time_struct['encoding_columns'] = end_enc_timer - start_enc_timer combinations = get_combinations_from_columns(all_columns, args) - combinations = prior_combinations_sample(combinations, args) - random.shuffle(combinations) + #combinations = prior_combinations_sample(combinations, args) + #random.shuffle(combinations) reference_model_features = {} if is_prior_heuristic(args): @@ -195,17 +195,14 @@ def compute_combined_features( interaction_order = 2 if is_3mr else args.interaction_order model_combinations = [] + full_combination_space = [] if is_prior_heuristic(args): model_combinations = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True) model_combinations = [tuple(sorted(combination.split(','))) for combination in model_combinations] - feature_combination_space = [] if args.interaction_order > 1: - feature_combination_space = list( + full_combination_space = list( itertools.combinations(all_columns, interaction_order), ) - - full_combination_space = feature_combination_space + [tuple for tuple in model_combinations if tuple not in feature_combination_space] - del feature_combination_space else: if args.reference_model_JSON != '': model_combinations = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True) @@ -247,7 +244,6 @@ def compute_combined_features( pbar.set_description('Concatenating into final frame ..') input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1) del tmp_df - return input_dataframe @@ -707,7 +703,6 @@ def estimate_importances_minibatches( logger, local_pbar, ) - print(importances_batch) bounds_storage_batch.append(bounds_storage) memory_storage_batch.append(memory_storage) diff 
--git a/outrank/task_selftest.py b/outrank/task_selftest.py index a63abd5..9335843 100644 --- a/outrank/task_selftest.py +++ b/outrank/task_selftest.py @@ -1,38 +1,31 @@ # helper set of methods that enable anywhere verification of core functions from __future__ import annotations - import logging import os import shutil import subprocess - import pandas as pd - logging.basicConfig( format='%(asctime)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S', ) logger = logging.getLogger('syn-logger') logger.setLevel(logging.DEBUG) - - def conduct_self_test(): # Simulate full flow, ranking only subprocess.run( 'outrank --task data_generator --num_synthetic_rows 100000', shell=True, ) subprocess.run( - 'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --combination_number_upper_bound 60 --heuristic surrogate-SGD-prior --reference_model_JSON tests/test_ref_model.json --interaction_order 2;', + 'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --combination_number_upper_bound 60;', shell=True, ) dfx = pd.read_csv('ranking_outputs/pairwise_ranks.tsv', sep='\t') - logger.info("Verifying output's properties ..") assert dfx.shape[0] == 120 assert dfx.shape[1] == 3 assert dfx['FeatureA'].values.tolist().pop() == 'label-(81; 100)' or dfx['FeatureB'].values.tolist().pop() == 'label-(81; 100)' - to_remove = ['ranking_outputs', 'test_data_synthetic'] for path in to_remove: if os.path.exists(path) and os.path.isdir(path): @@ -41,6 +34,9 @@ def conduct_self_test(): logger.info('All tests passed, OutRank seems in shape \N{winking face}') + shutil.rmtree(path) + + logger.info('All tests passed, OutRank seems in shape \N{winking face}') + if __name__ == '__main__': conduct_self_test() - From 344be92c2227877f382da2b57d21b2831d828def Mon Sep 17 00:00:00 2001 From: bmramor Date: Thu, 21 Mar 2024 21:39:04 +0000 Subject: [PATCH 10/18] formatting --- outrank/task_selftest.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/outrank/task_selftest.py b/outrank/task_selftest.py index 9335843..c0cb54a 100644 --- a/outrank/task_selftest.py +++ b/outrank/task_selftest.py @@ -1,16 +1,21 @@ # helper set of methods that enable anywhere verification of core functions from __future__ import annotations + import logging import os import shutil import subprocess + import pandas as pd + logging.basicConfig( format='%(asctime)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S', ) logger = logging.getLogger('syn-logger') logger.setLevel(logging.DEBUG) + + def conduct_self_test(): # Simulate full flow, ranking only subprocess.run( @@ -22,10 +27,12 @@ def conduct_self_test(): ) dfx = pd.read_csv('ranking_outputs/pairwise_ranks.tsv', sep='\t') + logger.info("Verifying output's properties ..") assert dfx.shape[0] == 120 assert dfx.shape[1] == 3 assert dfx['FeatureA'].values.tolist().pop() == 'label-(81; 100)' or dfx['FeatureB'].values.tolist().pop() == 'label-(81; 100)' + to_remove = ['ranking_outputs', 'test_data_synthetic'] for path in to_remove: if os.path.exists(path) and os.path.isdir(path): From bcd128c622d0e43586be55b3eb8fcda26f5cba3e Mon Sep 17 00:00:00 2001 From: bmramor Date: Thu, 21 Mar 2024 21:39:54 +0000 Subject: [PATCH 11/18] formatting --- outrank/task_selftest.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/outrank/task_selftest.py b/outrank/task_selftest.py index c0cb54a..78b11d1 100644 --- a/outrank/task_selftest.py +++ b/outrank/task_selftest.py @@ -41,9 +41,5 @@ def conduct_self_test(): logger.info('All tests passed, OutRank seems in shape \N{winking face}') - 
shutil.rmtree(path) - - logger.info('All tests passed, OutRank seems in shape \N{winking face}') - if __name__ == '__main__': conduct_self_test() From 9ef5117832c21aeb337bf462c345b82a3b73a355 Mon Sep 17 00:00:00 2001 From: bmramor Date: Fri, 22 Mar 2024 09:48:14 +0000 Subject: [PATCH 12/18] fix tests --- .../synthetic_data_generators/generator_naive.py | 2 +- outrank/core_ranking.py | 1 + outrank/task_selftest.py | 7 +++---- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/outrank/algorithms/synthetic_data_generators/generator_naive.py b/outrank/algorithms/synthetic_data_generators/generator_naive.py index d0606e6..7404360 100644 --- a/outrank/algorithms/synthetic_data_generators/generator_naive.py +++ b/outrank/algorithms/synthetic_data_generators/generator_naive.py @@ -14,7 +14,7 @@ def generate_random_matrix(num_features=100, size=20000): # Some noise target[target < 40] = 0 - target[target > 39] = 0 + target[target > 39] = 1 return sample, target diff --git a/outrank/core_ranking.py b/outrank/core_ranking.py index dc56aaa..6d1e855 100644 --- a/outrank/core_ranking.py +++ b/outrank/core_ranking.py @@ -244,6 +244,7 @@ def compute_combined_features( pbar.set_description('Concatenating into final frame ..') input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1) del tmp_df + return input_dataframe diff --git a/outrank/task_selftest.py b/outrank/task_selftest.py index 78b11d1..744120a 100644 --- a/outrank/task_selftest.py +++ b/outrank/task_selftest.py @@ -1,6 +1,5 @@ # helper set of methods that enable anywhere verification of core functions from __future__ import annotations - import logging import os import shutil @@ -22,16 +21,16 @@ def conduct_self_test(): 'outrank --task data_generator --num_synthetic_rows 100000', shell=True, ) subprocess.run( - 'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --combination_number_upper_bound 60;', + 'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw;', shell=True, ) dfx = pd.read_csv('ranking_outputs/pairwise_ranks.tsv', sep='\t') logger.info("Verifying output's properties ..") - assert dfx.shape[0] == 120 + assert dfx.shape[0] == 201 assert dfx.shape[1] == 3 - assert dfx['FeatureA'].values.tolist().pop() == 'label-(81; 100)' or dfx['FeatureB'].values.tolist().pop() == 'label-(81; 100)' + assert dfx['FeatureA'].values.tolist().pop() == 'label-(2; 100)' or dfx['FeatureB'].values.tolist().pop() == 'label-(2; 100)' to_remove = ['ranking_outputs', 'test_data_synthetic'] for path in to_remove: From 41c097527d948aa4c49dd951129588d39665f3b1 Mon Sep 17 00:00:00 2001 From: Blaz Mramor Date: Thu, 28 Mar 2024 22:02:05 +0100 Subject: [PATCH 13/18] debugging --- outrank/algorithms/importance_estimator.py | 16 ++++++---------- outrank/core_utils.py | 4 ++-- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py index dd28823..e37ba1c 100644 --- a/outrank/algorithms/importance_estimator.py +++ b/outrank/algorithms/importance_estimator.py @@ -57,19 +57,15 @@ def sklearn_surrogate( vector_first = vector_third del vector_third - unique_values, counts = np.unique(vector_second, return_counts=True) - if X.size <= 1: - vector_first = transf.fit_transform(vector_first.reshape(-1, 1)) - estimate_feature_importance_list = cross_val_score( - clf, vector_first, vector_second, scoring='neg_log_loss', cv=num_folds, - ) + X = vector_first.reshape(-1, 1) else: X = np.concatenate((X, vector_first.reshape(-1, 
1)), axis=1) - X = transf.fit_transform(X) - estimate_feature_importance_list = cross_val_score( - clf, X, vector_second, scoring='neg_log_loss', cv=num_folds, - ) + + X = transf.fit_transform(X) + estimate_feature_importance_list = cross_val_score( + clf, X, vector_second, scoring='neg_log_loss', cv=num_folds, + ) estimate_feature_importance = 1 + \ np.median(estimate_feature_importance_list) diff --git a/outrank/core_utils.py b/outrank/core_utils.py index 0680008..55988d6 100644 --- a/outrank/core_utils.py +++ b/outrank/core_utils.py @@ -393,7 +393,7 @@ def parse_csv_raw(data_path) -> DatasetInformationStorage: ) -def extract_features_from_reference_JSON(json_path: str, combined_features_only = False, full_feature_space = False) -> set[Any]: +def extract_features_from_reference_JSON(json_path: str, combined_features_only = False, all_features = False) -> set[Any]: """Given a model's JSON, extract unique features""" with open(json_path) as jp: @@ -401,7 +401,7 @@ def extract_features_from_reference_JSON(json_path: str, combined_features_only unique_features = set() feature_space = content['desc'].get('features', []) - if full_feature_space: + if all_features: return set(feature_space) fields_space = content['desc'].get('fields', []) From bfbe096c0dfa2ec8446533a8b2a9ce5139fdc516 Mon Sep 17 00:00:00 2001 From: Blaz Mramor Date: Thu, 28 Mar 2024 22:03:36 +0100 Subject: [PATCH 14/18] debugging --- outrank/core_ranking.py | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/outrank/core_ranking.py b/outrank/core_ranking.py index 6d1e855..a406c26 100644 --- a/outrank/core_ranking.py +++ b/outrank/core_ranking.py @@ -116,12 +116,21 @@ def mixed_rank_graph( out_time_struct['encoding_columns'] = end_enc_timer - start_enc_timer combinations = get_combinations_from_columns(all_columns, args) - #combinations = prior_combinations_sample(combinations, args) - #random.shuffle(combinations) reference_model_features = {} if is_prior_heuristic(args): - reference_model_features = [(" AND ").join(tuple(sorted(item.split(",")))) for item in extract_features_from_reference_JSON(args.reference_model_JSON, full_feature_space = True)] + reference_model_features = [(" AND ").join(tuple(sorted(item.split(",")))) for item in extract_features_from_reference_JSON(args.reference_model_JSON, all_features=True)] + combinations = [comb for comb in combinations if comb[0] not in reference_model_features and comb[1] not in reference_model_features] + print(combinations) + print("\n\n") + + combinations = prior_combinations_sample(combinations, args) + print(GLOBAL_PRIOR_COMB_COUNTS) + print(combinations) + print("\n\n") + random.shuffle(combinations) + print(combinations) + print("\n\n") if args.heuristic == 'Constant': final_constant_imp = [] @@ -196,30 +205,20 @@ def compute_combined_features( model_combinations = [] full_combination_space = [] - if is_prior_heuristic(args): + + if args.reference_model_JSON != '': model_combinations = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True) model_combinations = [tuple(sorted(combination.split(','))) for combination in model_combinations] - if args.interaction_order > 1: - full_combination_space = list( - itertools.combinations(all_columns, interaction_order), - ) - else: - if args.reference_model_JSON != '': - model_combinations = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True) - model_combinations = 
[tuple(sorted(combination.split(','))) for combination in model_combinations] - full_combination_space = model_combinations - else: + full_combination_space = model_combinations + + if args.interaction_order > 1: full_combination_space = list( itertools.combinations(all_columns, interaction_order), ) + full_combination_space = prior_combinations_sample(full_combination_space, args) - if args.combination_number_upper_bound: - random.shuffle(full_combination_space) - full_combination_space = full_combination_space[ - : args.combination_number_upper_bound - ] - if is_prior_heuristic(args): - full_combination_space = full_combination_space + [tuple for tuple in model_combinations if tuple not in full_combination_space] + if is_prior_heuristic(args): + full_combination_space = full_combination_space + [tuple for tuple in model_combinations if tuple not in full_combination_space] com_counter = 0 From 75d37d2d803ec971fa37a1cd2e0de809745366a4 Mon Sep 17 00:00:00 2001 From: Blaz Mramor Date: Thu, 28 Mar 2024 22:33:37 +0100 Subject: [PATCH 15/18] fix global variable creation --- outrank/core_ranking.py | 26 +++++++++++--------------- outrank/core_utils.py | 4 ++-- outrank/task_summary.py | 7 ++++--- 3 files changed, 17 insertions(+), 20 deletions(-) diff --git a/outrank/core_ranking.py b/outrank/core_ranking.py index a406c26..832a540 100644 --- a/outrank/core_ranking.py +++ b/outrank/core_ranking.py @@ -51,9 +51,10 @@ def prior_combinations_sample(combinations: list[tuple[Any, ...]], args: Any) -> list[tuple[Any, ...]]: """Make sure only relevant subspace of combinations is selected based on prior counts""" - if len(GLOBAL_PRIOR_COMB_COUNTS) == 0: - for combination in combinations: - GLOBAL_PRIOR_COMB_COUNTS[combination] += 1 + missing_combinations = set(set(combinations)).difference(GLOBAL_PRIOR_COMB_COUNTS.keys()) + if len(missing_combinations) > 0: + for combination in missing_combinations: + GLOBAL_PRIOR_COMB_COUNTS[combination] = 0 tmp = combinations[:args.combination_number_upper_bound] else: tmp = list(x[0] for x in sorted(GLOBAL_PRIOR_COMB_COUNTS.items(), key=lambda x:x[1], reverse=False))[:args.combination_number_upper_bound] @@ -121,16 +122,9 @@ def mixed_rank_graph( if is_prior_heuristic(args): reference_model_features = [(" AND ").join(tuple(sorted(item.split(",")))) for item in extract_features_from_reference_JSON(args.reference_model_JSON, all_features=True)] combinations = [comb for comb in combinations if comb[0] not in reference_model_features and comb[1] not in reference_model_features] - print(combinations) - print("\n\n") combinations = prior_combinations_sample(combinations, args) - print(GLOBAL_PRIOR_COMB_COUNTS) - print(combinations) - print("\n\n") random.shuffle(combinations) - print(combinations) - print("\n\n") if args.heuristic == 'Constant': final_constant_imp = [] @@ -206,10 +200,6 @@ def compute_combined_features( model_combinations = [] full_combination_space = [] - if args.reference_model_JSON != '': - model_combinations = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True) - model_combinations = [tuple(sorted(combination.split(','))) for combination in model_combinations] - full_combination_space = model_combinations if args.interaction_order > 1: full_combination_space = list( @@ -217,6 +207,12 @@ def compute_combined_features( ) full_combination_space = prior_combinations_sample(full_combination_space, args) + if args.reference_model_JSON != '': + model_combinations = 
extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True) + model_combinations = [tuple(sorted(combination.split(','))) for combination in model_combinations] + if not is_prior_heuristic(args): + full_combination_space = model_combinations + if is_prior_heuristic(args): full_combination_space = full_combination_space + [tuple for tuple in model_combinations if tuple not in full_combination_space] @@ -243,7 +239,7 @@ def compute_combined_features( pbar.set_description('Concatenating into final frame ..') input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1) del tmp_df - + return input_dataframe diff --git a/outrank/core_utils.py b/outrank/core_utils.py index 55988d6..336cc35 100644 --- a/outrank/core_utils.py +++ b/outrank/core_utils.py @@ -646,8 +646,8 @@ def summarize_rare_counts( ) -def is_prior_heuristic(args: Any): - if "-prior" in args.heuristic and args.reference_model_JSON and args.reference_model_JSON != "": +def is_prior_heuristic(args: Any) -> bool: + if "-prior" in args.heuristic and args.reference_model_JSON: return True return False diff --git a/outrank/task_summary.py b/outrank/task_summary.py index 38475d9..458c9b3 100644 --- a/outrank/task_summary.py +++ b/outrank/task_summary.py @@ -37,9 +37,10 @@ def outrank_task_result_summary(args): min_score = np.min(final_df[f'Score {args.heuristic}'].values) max_score = np.max(final_df[f'Score {args.heuristic}'].values) - final_df[f'Score {args.heuristic}'] = ( - final_df[f'Score {args.heuristic}'] - min_score - ) / (max_score - min_score) + if "MI" in args.heuristic: + final_df[f'Score {args.heuristic}'] = ( + final_df[f'Score {args.heuristic}'] - min_score + ) / (max_score - min_score) logging.info(f'Storing summary files to {args.output_folder}') pd.set_option('display.max_rows', None, 'display.max_columns', None) singles_path = os.path.join(args.output_folder, 'feature_singles.tsv') From cf305ae273632f1881b9c88a0ac713e021219d7c Mon Sep 17 00:00:00 2001 From: Blaz Mramor Date: Sun, 31 Mar 2024 21:04:33 +0200 Subject: [PATCH 16/18] prior combinations fix --- outrank/core_ranking.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/outrank/core_ranking.py b/outrank/core_ranking.py index 832a540..5633ada 100644 --- a/outrank/core_ranking.py +++ b/outrank/core_ranking.py @@ -51,13 +51,15 @@ def prior_combinations_sample(combinations: list[tuple[Any, ...]], args: Any) -> list[tuple[Any, ...]]: """Make sure only relevant subspace of combinations is selected based on prior counts""" + if len(combinations) == 0: + return [] + missing_combinations = set(set(combinations)).difference(GLOBAL_PRIOR_COMB_COUNTS.keys()) if len(missing_combinations) > 0: for combination in missing_combinations: GLOBAL_PRIOR_COMB_COUNTS[combination] = 0 - tmp = combinations[:args.combination_number_upper_bound] - else: - tmp = list(x[0] for x in sorted(GLOBAL_PRIOR_COMB_COUNTS.items(), key=lambda x:x[1], reverse=False))[:args.combination_number_upper_bound] + + tmp = sorted(combinations, key=GLOBAL_PRIOR_COMB_COUNTS.get, reverse=False)[:args.combination_number_upper_bound] for combination in tmp: GLOBAL_PRIOR_COMB_COUNTS[combination] += 1 From 6d650ddb7694d0dfafc06f5c247bc525e84cb0cf Mon Sep 17 00:00:00 2001 From: Blaz Mramor Date: Sun, 31 Mar 2024 21:40:21 +0200 Subject: [PATCH 17/18] remove logger from function --- outrank/core_ranking.py | 5 ++--- tests/ranking_module_test.py | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/outrank/core_ranking.py 
b/outrank/core_ranking.py index 5633ada..30f892c 100644 --- a/outrank/core_ranking.py +++ b/outrank/core_ranking.py @@ -186,7 +186,6 @@ def enrich_with_transformations( def compute_combined_features( input_dataframe: pd.DataFrame, - logger: Any, args: Any, pbar: Any, is_3mr: bool = False, @@ -547,7 +546,7 @@ def compute_batch_ranking( if args.interaction_order > 1 or args.reference_model_JSON: pbar.set_description('Constructing new features') input_dataframe = compute_combined_features( - input_dataframe, logger, args, pbar, + input_dataframe, args, pbar, ) # in case of 3mr we compute the score of combinations against the target @@ -556,7 +555,7 @@ def compute_batch_ranking( 'Constructing features for computing relations in 3mr', ) input_dataframe = compute_combined_features( - input_dataframe, logger, args, pbar, True, + input_dataframe, args, pbar, True, ) if args.include_noise_baseline_features == 'True' and args.heuristic != 'Constant': diff --git a/tests/ranking_module_test.py b/tests/ranking_module_test.py index e49880c..fd99092 100644 --- a/tests/ranking_module_test.py +++ b/tests/ranking_module_test.py @@ -82,7 +82,7 @@ def test_compute_combinations(self): random_df.columns = ['F1', 'F2', 'F3'] local_pbar = tqdm.tqdm(total=100, position=0) transformed_df = compute_combined_features( - random_df, None, args, local_pbar, + random_df, args, local_pbar, ) self.assertEqual(transformed_df.shape[1], 4) @@ -91,7 +91,7 @@ def test_compute_combinations(self): random_df = pd.DataFrame(random_matrix) random_df.columns = ['F1', 'F2', 'F3'] transformed_df = compute_combined_features( - random_df, None, args, local_pbar, + random_df, args, local_pbar, ) self.assertEqual(transformed_df.shape[1], 6) From 5dd9dd9d79424f272556c27181ab02b78893353b Mon Sep 17 00:00:00 2001 From: Blaz Mramor Date: Tue, 2 Apr 2024 14:48:14 +0200 Subject: [PATCH 18/18] double line space --- outrank/task_selftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/outrank/task_selftest.py b/outrank/task_selftest.py index 744120a..cd1cb45 100644 --- a/outrank/task_selftest.py +++ b/outrank/task_selftest.py @@ -40,5 +40,6 @@ def conduct_self_test(): logger.info('All tests passed, OutRank seems in shape \N{winking face}') + if __name__ == '__main__': conduct_self_test()
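
For readers following the series: below is a minimal, self-contained sketch of the prior-aware surrogate scoring these patches converge on. The function name and signature are illustrative, not the OutRank API (the real logic is split across sklearn_surrogate and initialize_classifier in outrank/algorithms/importance_estimator.py); only standard scikit-learn calls are used.

from __future__ import annotations

import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder


def surrogate_score_with_prior(
    candidate: np.ndarray,            # shape (n,), categorical codes for one feature
    label: np.ndarray,                # shape (n,), binary target
    prior: np.ndarray | None = None,  # shape (n, k), reference-model feature columns
    num_folds: int = 4,
) -> float:
    # Score the candidate by the lift it gives *on top of* the reference model:
    # the design matrix is [prior features + candidate] when a prior is given,
    # and the candidate alone otherwise (the non-prior surrogate path).
    X = candidate.reshape(-1, 1)
    if prior is not None:
        X = np.concatenate((prior, X), axis=1)
    X = OneHotEncoder().fit_transform(X)
    clf = SGDClassifier(max_iter=100_000, loss='log_loss')
    scores = cross_val_score(clf, X, label, scoring='neg_log_loss', cv=num_folds)
    # neg_log_loss is <= 0, so "1 + median" keeps a higher-is-better ordering,
    # mirroring the estimate returned by sklearn_surrogate.
    return 1 + float(np.median(scores))

As in PATCH 13/18, the encoder is fit on the full matrix before cross-validation; that trades a small amount of leakage for speed and avoids unseen-category errors across folds.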
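
The combination sampling reworked in PATCH 15/18 and 16/18 amounts to a least-scored-first round robin over a module-level counter. A condensed sketch (argument names adapted; the original reads the bound from args.combination_number_upper_bound):

GLOBAL_PRIOR_COMB_COUNTS: dict[tuple, int] = {}


def prior_combinations_sample(
    combinations: list[tuple],
    upper_bound: int,
) -> list[tuple]:
    if not combinations:
        return []
    # Seed unseen pairs with a zero count so dict.get is always defined.
    for combination in combinations:
        GLOBAL_PRIOR_COMB_COUNTS.setdefault(combination, 0)
    # Take the least-scored pairs first, so successive minibatches sweep the
    # whole combination space instead of resampling the same subset.
    chosen = sorted(combinations, key=GLOBAL_PRIOR_COMB_COUNTS.get)[:upper_bound]
    for combination in chosen:
        GLOBAL_PRIOR_COMB_COUNTS[combination] += 1
    return chosen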
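
Finally, a sketch of how a reference model JSON such as tests/test_ref_model.json (PATCH 03/18) is interpreted, assuming the schema shown in that file: comma-joined entries like "f0,f1" denote combined features, which PATCH 07/18 renames to OutRank's "f0 AND f1" column convention before using them as the prior feature set.

import json


def load_reference_features(path: str) -> set[str]:
    with open(path) as jp:
        content = json.load(jp)
    # "f0,f1" in the JSON is a combined feature; rename it to the "f0 AND f1"
    # form that compute_combined_features assigns to combined columns.
    return {
        ' AND '.join(sorted(feature.split(',')))
        for feature in content['desc'].get('features', [])
    }

# load_reference_features('tests/test_ref_model.json')
# -> {'f0', 'f1', 'f0 AND f1'}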