LR with priors initial implementation #66

Merged · 18 commits · Apr 3, 2024
21 changes: 21 additions & 0 deletions examples/run_ranking_prior.sh
@@ -0,0 +1,21 @@
##########################################################################################################
# An OutRank invocation that ranks features against a reference model (prior), using the                #
# surrogate-SGD-prior heuristic. It includes visualizations and other relevant statistics.              #
##########################################################################################################

# This run compares features "one-at-a-time" against the reference model, then summarizes and visualizes the outputs.
# Hint: if unsure what a parameter does, you can always run "outrank --help".

outrank \
--task all \
--data_path "$PATH_TO_YOUR_DATA" \
--data_source ob-csv \
--heuristic surrogate-SGD-prior \
--target_ranking_only True \
--interaction_order 2 \
--combination_number_upper_bound 2048 \
--num_threads 12 \
--output_folder ./some_output_folder \
--subsampling 100 \
--minibatch_size 10000 \
--label_column info_click_valid \
--reference_model_JSON "$PATH_TO_YOUR_REFERENCE_MODEL"
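Note: the file passed via --reference_model_JSON is expected to follow the model-description format of the test fixture added in this PR (tests/test_ref_model.json, shown at the bottom of this diff), where a comma-separated entry such as "f0,f1" denotes a combined feature:

{
    "desc": {
        "features": ["f0", "f1", "f0,f1"]
    }
}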
56 changes: 38 additions & 18 deletions outrank/algorithms/importance_estimator.py
@@ -11,15 +11,20 @@
 import pandas as pd
 from scipy.stats import pearsonr
 from sklearn.feature_selection import mutual_info_classif
-from sklearn.linear_model import LogisticRegression
+from sklearn.linear_model import LogisticRegression, SGDClassifier
 from sklearn.metrics import adjusted_mutual_info_score
 from sklearn.model_selection import cross_val_score
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.svm import SVC
 
+from outrank.core_utils import is_prior_heuristic
 
 logger = logging.getLogger('syn-logger')
 logger.setLevel(logging.DEBUG)
 
+num_folds = 4
+
 try:
     from outrank.algorithms.feature_ranking import ranking_mi_numba

@@ -38,13 +43,11 @@ def sklearn_MI(vector_first: Any, vector_second: Any) -> float:


 def sklearn_surrogate(
-    vector_first: Any, vector_second: Any, surrogate_model: str,
+    vector_first: Any, vector_second: Any, X: Any, surrogate_model: str,
 ) -> float:
-    if surrogate_model == 'surrogate-LR':
-        clf = LogisticRegression(max_iter=100000)
-    elif surrogate_model == 'surrogate-SVM':
-        clf = SVC(gamma='auto', probability=True)
-
+    clf = initialize_classifier(surrogate_model)
 
     transf = OneHotEncoder()

# They do not commute, swap if needed
@@ -56,18 +59,19 @@ def sklearn_surrogate(
 
     unique_values, counts = np.unique(vector_second, return_counts=True)
 
-    # Establish min support for this type of ranking.
-    if counts[0] < len(unique_values) * (2**5):
-        estimate_feature_importance = 0
-
-    else:
+    if X.size <= 1:
         vector_first = transf.fit_transform(vector_first.reshape(-1, 1))
         estimate_feature_importance_list = cross_val_score(
-            clf, vector_first, vector_second, scoring='neg_log_loss', cv=4,
+            clf, vector_first, vector_second, scoring='neg_log_loss', cv=num_folds,
         )
-
         estimate_feature_importance = 1 + \
             np.median(estimate_feature_importance_list)
+    else:
+        X = np.concatenate((X, vector_first.reshape(-1, 1)), axis=1)
+        X = transf.fit_transform(X)
+        estimate_feature_importance_list = cross_val_score(
+            clf, X, vector_second, scoring='neg_log_loss', cv=num_folds,
+        )
+        estimate_feature_importance = 1 + \
+            np.median(estimate_feature_importance_list)
 
     return estimate_feature_importance
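For intuition: the surrogate score is 1 + the median cross-validated negative log-loss, so a highly predictive feature lands near 1, while an uninformative one drifts toward 1 - log 2 ≈ 0.31 for a balanced binary target. A minimal, self-contained sketch of this rule on synthetic data (illustrative names, not part of the PR):

import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder

rng = np.random.default_rng(0)
feature = rng.integers(0, 10, size=2000)   # a categorical candidate feature
label = (feature % 2 == 0).astype(int)     # label fully determined by the feature

encoded = OneHotEncoder().fit_transform(feature.reshape(-1, 1))
clf = SGDClassifier(max_iter=100000, loss='log_loss')
scores = cross_val_score(clf, encoded, label, scoring='neg_log_loss', cv=4)
print(1 + np.median(scores))               # close to 1.0 for a predictive feature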

@@ -97,7 +101,7 @@ def sklearn_mi_adj(vector_first, vector_second):
return estimate_feature_importance


-def get_importances_estimate_pairwise(combination, args, tmp_df):
+def get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df):
     """A method for parallel importances estimation. As interaction scoring is independent, individual scores can be computed in parallel."""
 
     feature_one = combination[0]
@@ -122,8 +126,12 @@ def get_importances_estimate_pairwise(combination, args, tmp_df):
estimate_feature_importance = sklearn_MI(vector_first, vector_second)

     elif 'surrogate-' in args.heuristic:
+        X = np.array(float)
+        if is_prior_heuristic(args) and (len(reference_model_features) > 0):
+            X = tmp_df[reference_model_features].values
+
         estimate_feature_importance = sklearn_surrogate(
-            vector_first, vector_second, args.heuristic,
+            vector_first, vector_second, X, args.heuristic,
         )
 
     elif 'MI-numba' in args.heuristic:
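Note: np.array(float) builds a 0-d placeholder whose .size is 1, which is exactly what the X.size <= 1 branch in sklearn_surrogate checks to detect that no prior (reference-model) features were supplied. A quick illustration:

import numpy as np
assert np.array(float).size == 1      # 0-d placeholder: no prior features
assert np.zeros((100, 2)).size > 1    # an actual prior-feature matrix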
@@ -213,3 +221,15 @@ def get_importances_estimate_nonmyopic(args: Any, tmp_df: pd.DataFrame):
# TODO - nonmyopic algorithms - tmp_df \ args.label vs. label
# TODO - this is to be executed directly on df - no need for parallel kernel(s)
pass


def initialize_classifier(surrogate_model: str):
if 'surrogate-LR' in surrogate_model:
return LogisticRegression(max_iter=100000)
elif 'surrogate-SVM' in surrogate_model:
return SVC(gamma='auto', probability=True)
elif 'surrogate-SGD' in surrogate_model:
return SGDClassifier(max_iter=100000, loss='log_loss')
else:
logging.warning(f'The chosen surrogate model {surrogate_model} is not supported, falling back to surrogate-SGD')
return SGDClassifier(max_iter=100000, loss='log_loss')
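A quick dispatch check for the helper above; matching is substring-based, so 'surrogate-SGD-prior' also selects the SGD path (illustrative):

print(type(initialize_classifier('surrogate-LR')).__name__)          # LogisticRegression
print(type(initialize_classifier('surrogate-SGD-prior')).__name__)   # SGDClassifier
print(type(initialize_classifier('unknown')).__name__)               # SGDClassifier, after a warning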
@@ -13,7 +13,8 @@ def generate_random_matrix(num_features=100, size=20000):
target = sample[:, 30]
# Some noise

-    target[target < 20] = 0
+    target[target < 40] = 0
+    target[target > 39] = 1
return sample, target
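The new thresholds binarize the target in one pass: values below 40 map to 0 and values above 39 map to 1, so downstream classifiers see a clean two-class label. A quick check (illustrative):

import numpy as np
target = np.array([5, 39, 40, 97])
target[target < 40] = 0
target[target > 39] = 1
print(target)   # [0 0 1 1]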


41 changes: 30 additions & 11 deletions outrank/core_ranking.py
@@ -32,6 +32,7 @@
from outrank.core_utils import internal_hash
from outrank.core_utils import NominalFeatureSummary
from outrank.core_utils import NumericFeatureSummary
from outrank.core_utils import is_prior_heuristic
from outrank.feature_transformations.ranking_transformers import FeatureTransformerGeneric
from outrank.feature_transformations.ranking_transformers import FeatureTransformerNoise

@@ -115,8 +116,12 @@ def mixed_rank_graph(
out_time_struct['encoding_columns'] = end_enc_timer - start_enc_timer

combinations = get_combinations_from_columns(all_columns, args)
-    combinations = prior_combinations_sample(combinations, args)
-    random.shuffle(combinations)
+    #combinations = prior_combinations_sample(combinations, args)
+    #random.shuffle(combinations)
+
+    reference_model_features = {}
+    if is_prior_heuristic(args):
+        reference_model_features = [(" AND ").join(tuple(sorted(item.split(",")))) for item in extract_features_from_reference_JSON(args.reference_model_JSON, full_feature_space = True)]
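The comprehension above normalizes reference-model entries into OutRank's combined-feature naming: each comma-separated entry is split, sorted, and re-joined with ' AND '. For example (illustrative):

item = 'f1,f0'
print(' AND '.join(tuple(sorted(item.split(',')))))   # f0 AND f1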

if args.heuristic == 'Constant':
final_constant_imp = []
Expand All @@ -132,7 +137,7 @@ def mixed_rank_graph(

# starmap is an alternative that is slower unfortunately (but nicer)
def get_grounded_importances_estimate(combination: tuple[str]) -> Any:
-        return get_importances_estimate_pairwise(combination, args, tmp_df=tmp_df)
+        return get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df=tmp_df)

start_enc_timer = timer()
with cpu_pool as p:
@@ -189,19 +194,33 @@ def compute_combined_features(
join_string = ' AND_REL ' if is_3mr else ' AND '
interaction_order = 2 if is_3mr else args.interaction_order

-    if args.reference_model_JSON != '':
-        combined_features = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True)
-        full_combination_space = [combination.split(',') for combination in combined_features]
+    model_combinations = []
+    full_combination_space = []
+    if is_prior_heuristic(args):
+        model_combinations = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True)
+        model_combinations = [tuple(sorted(combination.split(','))) for combination in model_combinations]
+        if args.interaction_order > 1:
+            full_combination_space = list(
+                itertools.combinations(all_columns, interaction_order),
+            )
     else:
-        full_combination_space = list(
-            itertools.combinations(all_columns, interaction_order),
-        )
+        if args.reference_model_JSON != '':
+            model_combinations = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True)
+            model_combinations = [tuple(sorted(combination.split(','))) for combination in model_combinations]

Collaborator: combination delimiter could be a const, as it repeats.

+            full_combination_space = model_combinations
+        else:
+            full_combination_space = list(
+                itertools.combinations(all_columns, interaction_order),
+            )
 
-    if args.combination_number_upper_bound and args.reference_model_JSON != '':
+    if args.combination_number_upper_bound:
         random.shuffle(full_combination_space)
         full_combination_space = full_combination_space[
             : args.combination_number_upper_bound
         ]
+    if is_prior_heuristic(args):
+        full_combination_space = full_combination_space + [tuple for tuple in model_combinations if tuple not in full_combination_space]

Collaborator: isn't this second part list(set(model_combinations).difference(set(full_combination_space)))?



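On the reviewer's suggestion above: the set-difference form is equivalent in membership but not in order; the comprehension preserves the reference model's ordering, while the set version returns elements in arbitrary order (and requires hashable items, which these tuples are). A quick comparison (illustrative):

model_combinations = [('f0', 'f1'), ('f2', 'f3')]
full_combination_space = [('f0', 'f1')]

kept = [c for c in model_combinations if c not in full_combination_space]
via_set = list(set(model_combinations).difference(full_combination_space))
print(kept)      # [('f2', 'f3')], order preserved
print(via_set)   # same elements, arbitrary order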
com_counter = 0
new_feature_hash = {}
Expand All @@ -225,7 +244,7 @@ def compute_combined_features(
pbar.set_description('Concatenating into final frame ..')
input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1)
del tmp_df

Collaborator: no need for this space.
return input_dataframe


12 changes: 11 additions & 1 deletion outrank/core_utils.py
@@ -393,14 +393,17 @@ def parse_csv_raw(data_path) -> DatasetInformationStorage:
)


-def extract_features_from_reference_JSON(json_path: str, combined_features_only = False) -> set[Any]:
+def extract_features_from_reference_JSON(json_path: str, combined_features_only = False, full_feature_space = False) -> set[Any]:
"""Given a model's JSON, extract unique features"""

with open(json_path) as jp:
content = json.load(jp)

unique_features = set()
feature_space = content['desc'].get('features', [])
    if full_feature_space:
        return set(feature_space)

Collaborator: full_feature_space sounds somewhat odd for a flag that computes a set.

fields_space = content['desc'].get('fields', [])
joint_space = feature_space + fields_space

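A hedged example of the new full_feature_space path, assuming the tests/test_ref_model.json fixture added in this PR as input:

features = extract_features_from_reference_JSON(
    'tests/test_ref_model.json', full_feature_space=True,
)
print(features)   # {'f0', 'f1', 'f0,f1'} (raw entries; combined features stay comma-delimited)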
@@ -641,3 +644,10 @@ def summarize_rare_counts(
final_df.to_csv(
f'{args.output_folder}/feature_sparsity_summary.tsv', index=False, sep='\t',
)


def is_prior_heuristic(args: Any):
    if "-prior" in args.heuristic and args.reference_model_JSON and args.reference_model_JSON != "":
        return True
    return False

Collaborator: missing return type.
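A usage sketch (the namespace stands in for parsed CLI args; values are illustrative):

from types import SimpleNamespace

args = SimpleNamespace(heuristic='surrogate-SGD-prior', reference_model_JSON='model.json')
print(is_prior_heuristic(args))   # True

args = SimpleNamespace(heuristic='surrogate-SGD', reference_model_JSON='model.json')
print(is_prior_heuristic(args))   # False: no '-prior' suffix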

10 changes: 6 additions & 4 deletions outrank/task_selftest.py
@@ -1,6 +1,5 @@
# helper set of methods that enable anywhere verification of core functions
from __future__ import annotations

import logging
import os
import shutil
@@ -22,16 +21,16 @@ def conduct_self_test():
'outrank --task data_generator --num_synthetic_rows 100000', shell=True,
)
subprocess.run(
-        'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --combination_number_upper_bound 60;',
+        'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw;',
shell=True,
)

dfx = pd.read_csv('ranking_outputs/pairwise_ranks.tsv', sep='\t')

logger.info("Verifying output's properties ..")
-    assert dfx.shape[0] == 120
+    assert dfx.shape[0] == 201
assert dfx.shape[1] == 3
-    assert dfx['FeatureA'].values.tolist().pop() == 'label-(81; 100)' or dfx['FeatureB'].values.tolist().pop() == 'label-(81; 100)'
+    assert dfx['FeatureA'].values.tolist().pop() == 'label-(2; 100)' or dfx['FeatureB'].values.tolist().pop() == 'label-(2; 100)'

to_remove = ['ranking_outputs', 'test_data_synthetic']
for path in to_remove:
Expand All @@ -40,3 +39,6 @@ def conduct_self_test():
shutil.rmtree(path)

logger.info('All tests passed, OutRank seems in shape \N{winking face}')

if __name__ == '__main__':
conduct_self_test()
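With the new entry-point guard, the self-test can also be invoked directly, assuming the outrank package is importable:

python -m outrank.task_selftest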
5 changes: 5 additions & 0 deletions tests/test_ref_model.json
@@ -0,0 +1,5 @@
{
"desc": {
"features": ["f0","f1","f0,f1"]
}
}