Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LR with priors initial implementation #66

Merged
merged 18 commits into from
Apr 3, 2024
35 changes: 23 additions & 12 deletions outrank/algorithms/importance_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import pandas as pd
from scipy.stats import pearsonr
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
Expand All @@ -38,12 +38,14 @@ def sklearn_MI(vector_first: Any, vector_second: Any) -> float:


def sklearn_surrogate(
vector_first: Any, vector_second: Any, surrogate_model: str,
vector_first: Any, vector_second: Any, X: Any, surrogate_model: str
) -> float:
if surrogate_model == 'surrogate-LR':
bmramor marked this conversation as resolved.
Show resolved Hide resolved
if 'surrogate-LR' in surrogate_model:
clf = LogisticRegression(max_iter=100000)
elif surrogate_model == 'surrogate-SVM':
elif 'surrogate-SVM' in surrogate_model:
clf = SVC(gamma='auto', probability=True)
elif 'surrogate-SGD' in surrogate_model:
clf = SGDClassifier(max_iter=100000, loss='log_loss')

transf = OneHotEncoder()

Expand All @@ -57,17 +59,22 @@ def sklearn_surrogate(
unique_values, counts = np.unique(vector_second, return_counts=True)
miha-jenko marked this conversation as resolved.
Show resolved Hide resolved

# Establish min support for this type of ranking.
if counts[0] < len(unique_values) * (2**5):
estimate_feature_importance = 0
# if counts[0] < len(unique_values) * (2**5):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's remove such comments

# estimate_feature_importance = 0

else:
if X.shape[0] == 0 and X.shape[1] == 0:
vector_first = transf.fit_transform(vector_first.reshape(-1, 1))
miha-jenko marked this conversation as resolved.
Show resolved Hide resolved
estimate_feature_importance_list = cross_val_score(
clf, vector_first, vector_second, scoring='neg_log_loss', cv=4,
)

estimate_feature_importance = 1 + \
np.median(estimate_feature_importance_list)
else:
X = np.concatenate((X,vector_first.reshape(-1, 1)), axis=1)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a space missing after the comma following X; I wonder why lint didn't catch that. @miha-jenko, any idea?

X = transf.fit_transform(X)
estimate_feature_importance_list = cross_val_score(
clf, X, vector_second, scoring='neg_log_loss', cv=4,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's put the number of folds at the top of the file as a constant for now

)
estimate_feature_importance = 1 + \
np.median(estimate_feature_importance_list)

return estimate_feature_importance

Expand Down Expand Up @@ -97,7 +104,7 @@ def sklearn_mi_adj(vector_first, vector_second):
return estimate_feature_importance


def get_importances_estimate_pairwise(combination, args, tmp_df):
def get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df):
"""A method for parallel importances estimation. As interaction scoring is independent, individual scores can be computed in parallel."""

feature_one = combination[0]
Expand All @@ -122,8 +129,12 @@ def get_importances_estimate_pairwise(combination, args, tmp_df):
estimate_feature_importance = sklearn_MI(vector_first, vector_second)

elif 'surrogate-' in args.heuristic:
X = np.array(float)
if ('-prior' in args.heuristic) and (len(reference_model_features) > 0):
X = tmp_df[reference_model_features].values

estimate_feature_importance = sklearn_surrogate(
vector_first, vector_second, args.heuristic,
vector_first, vector_second, X, args.heuristic
)

elif 'MI-numba' in args.heuristic:
Expand Down
6 changes: 5 additions & 1 deletion outrank/core_ranking.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,9 +130,13 @@ def mixed_rank_graph(
# Map the scoring calls to the worker pool
pbar.set_description('Allocating thread pool')

reference_model_features = {}
if 'prior' in args.heuristic:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you check for -prior at some point, but prior at some other point. Consider creating a helper function is_prior_heuristic or something, that unifies this behavior (and centralizes it)

reference_model_features = [(" AND ").join(item.split(",")) for item in extract_features_from_reference_JSON(args.reference_model_JSON, full_feature_space = True)]

# starmap is an alternative that is slower unfortunately (but nicer)
def get_grounded_importances_estimate(combination: tuple[str]) -> Any:
return get_importances_estimate_pairwise(combination, args, tmp_df=tmp_df)
return get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df=tmp_df)

start_enc_timer = timer()
with cpu_pool as p:
Expand Down
5 changes: 4 additions & 1 deletion outrank/core_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,14 +393,17 @@ def parse_csv_raw(data_path) -> DatasetInformationStorage:
)


def extract_features_from_reference_JSON(json_path: str, combined_features_only = False) -> set[Any]:
def extract_features_from_reference_JSON(json_path: str, combined_features_only = False, full_feature_space = False) -> set[Any]:
"""Given a model's JSON, extract unique features"""

with open(json_path) as jp:
content = json.load(jp)

unique_features = set()
feature_space = content['desc'].get('features', [])
if full_feature_space:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

full_feature_space sounds somewhat odd for a flag that computes a set

return set(feature_space)

fields_space = content['desc'].get('fields', [])
joint_space = feature_space + fields_space

Expand Down
6 changes: 5 additions & 1 deletion outrank/task_selftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def conduct_self_test():
'outrank --task data_generator --num_synthetic_rows 100000', shell=True,
)
subprocess.run(
'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --combination_number_upper_bound 60;',
'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --combination_number_upper_bound 60 --heuristic surrogate-SGD-prior --reference_model_JSON tests/test_ref_model.json;',
shell=True,
)

Expand All @@ -40,3 +40,7 @@ def conduct_self_test():
shutil.rmtree(path)

logger.info('All tests passed, OutRank seems in shape \N{winking face}')

bmramor marked this conversation as resolved.
Show resolved Hide resolved
if __name__ == '__main__':
conduct_self_test()

5 changes: 5 additions & 0 deletions tests/test_ref_model.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"desc": {
"features": ["f0","f1","f0,f1"]
}
}
Loading