Merge pull request #66 from outbrain/log_priors

LR with priors initial implementation
outbrain · Apr 3, 2024 · d6dc5d3 · d6dc5d3
2 parents 1162595 + 5dd9dd9
commit d6dc5d3
Show file tree

Hide file tree

Showing 9 changed files with 125 additions and 53 deletions.
diff --git a/examples/run_ranking_prior.sh b/examples/run_ranking_prior.sh
@@ -0,0 +1,21 @@
+##########################################################################################################
+# A very generic OutRank invocation (default). It includes visualizations and other relevant statistics. #
+##########################################################################################################
+
+# This run compares features "one-at-a-time" using the surrogate-SGD-prior heuristic (which takes effect only when --reference_model_JSON is set), then summarizes and visualizes the outputs.
+# hint - if unsure what parameters do, you can always run "outrank --help"
+
+outrank \
+    --task all \
+    --data_path "$PATH_TO_YOUR_DATA" \
+    --data_source ob-csv \
+    --heuristic surrogate-SGD-prior \
+    --target_ranking_only True \
+    --interaction_order 2 \
+    --combination_number_upper_bound 2048 \
+    --num_threads 12 \
+    --output_folder ./some_output_folder \
+    --subsampling 100 \
+    --minibatch_size 10000 \
+    --label_column info_click_valid \
+    --reference_model_JSON "$PATH_TO_YOUR_REFERENCE_MODEL"
diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py
@@ -11,15 +11,20 @@
 import pandas as pd
 from scipy.stats import pearsonr
 from sklearn.feature_selection import mutual_info_classif
-from sklearn.linear_model import LogisticRegression
+from sklearn.linear_model import LogisticRegression, SGDClassifier
 from sklearn.metrics import adjusted_mutual_info_score
 from sklearn.model_selection import cross_val_score
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.svm import SVC
 
+from outrank.core_utils import is_prior_heuristic
+
+
 logger = logging.getLogger('syn-logger')
 logger.setLevel(logging.DEBUG)
 
+num_folds = 4
+
 try:
     from outrank.algorithms.feature_ranking import ranking_mi_numba
 
@@ -38,13 +43,11 @@ def sklearn_MI(vector_first: Any, vector_second: Any) -> float:
 
 
 def sklearn_surrogate(
-    vector_first: Any, vector_second: Any, surrogate_model: str,
+    vector_first: Any, vector_second: Any, X: Any, surrogate_model: str
 ) -> float:
-    if surrogate_model == 'surrogate-LR':
-        clf = LogisticRegression(max_iter=100000)
-    elif surrogate_model == 'surrogate-SVM':
-        clf = SVC(gamma='auto', probability=True)
-
+
+    clf = initialize_classifier(surrogate_model)
+
     transf = OneHotEncoder()
 
     # They do not commute, swap if needed
@@ -54,20 +57,17 @@ def sklearn_surrogate(
         vector_first = vector_third
         del vector_third
 
-    unique_values, counts = np.unique(vector_second, return_counts=True)
-
-    # Establish min support for this type of ranking.
-    if counts[0] < len(unique_values) * (2**5):
-        estimate_feature_importance = 0
-
+    if X.size <= 1:
+        X = vector_first.reshape(-1, 1)
     else:
-        vector_first = transf.fit_transform(vector_first.reshape(-1, 1))
-        estimate_feature_importance_list = cross_val_score(
-            clf, vector_first, vector_second, scoring='neg_log_loss', cv=4,
-        )
+        X = np.concatenate((X, vector_first.reshape(-1, 1)), axis=1)
 
-        estimate_feature_importance = 1 + \
-            np.median(estimate_feature_importance_list)
+    X = transf.fit_transform(X)
+    estimate_feature_importance_list = cross_val_score(
+        clf, X, vector_second, scoring='neg_log_loss', cv=num_folds,
+    )
+    estimate_feature_importance = 1 + \
+        np.median(estimate_feature_importance_list)        
 
     return estimate_feature_importance
 
@@ -97,7 +97,7 @@ def sklearn_mi_adj(vector_first, vector_second):
     return estimate_feature_importance
 
 
-def get_importances_estimate_pairwise(combination, args, tmp_df):
+def get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df):
     """A method for parallel importances estimation. As interaction scoring is independent, individual scores can be computed in parallel."""
 
     feature_one = combination[0]
@@ -122,8 +122,12 @@ def get_importances_estimate_pairwise(combination, args, tmp_df):
         estimate_feature_importance = sklearn_MI(vector_first, vector_second)
 
     elif 'surrogate-' in args.heuristic:
+        X = np.array(float)
+        if is_prior_heuristic(args) and (len(reference_model_features) > 0):
+            X = tmp_df[reference_model_features].values
+
         estimate_feature_importance = sklearn_surrogate(
-            vector_first, vector_second, args.heuristic,
+            vector_first, vector_second, X, args.heuristic
         )
 
     elif 'MI-numba' in args.heuristic:
@@ -213,3 +217,20 @@ def get_importances_estimate_nonmyopic(args: Any, tmp_df: pd.DataFrame):
     # TODO - nonmyopic algorithms - tmp_df \ args.label vs. label
     # TODO - this is to be executed directly on df - no need for parallel kernel(s)
     pass
+
+
+def initialize_classifier(surrogate_model: str):
+    """Return an unfitted scikit-learn classifier for a surrogate heuristic name.
+
+    Matching is by substring, so suffixed names such as
+    'surrogate-SGD-prior' resolve to the same model as 'surrogate-SGD'.
+    Unrecognized names fall back to the SGD classifier with log loss.
+    """
+    if 'surrogate-LR' in surrogate_model:
+        return LogisticRegression(max_iter=100000)
+    if 'surrogate-SVM' in surrogate_model:
+        return SVC(gamma='auto', probability=True)
+    if 'surrogate-SGD' not in surrogate_model:
+        # Use the module logger ('syn-logger') rather than the root logger.
+        logger.warning(f'The chosen surrogate model {surrogate_model} is not supported, falling back to surrogate-SGD')
+    return SGDClassifier(max_iter=100000, loss='log_loss')
diff --git a/outrank/algorithms/synthetic_data_generators/generator_naive.py b/outrank/algorithms/synthetic_data_generators/generator_naive.py
@@ -13,7 +13,8 @@ def generate_random_matrix(num_features=100, size=20000):
     target = sample[:, 30]
     # Some noise
 
-    target[target < 20] = 0
+    target[target < 40] = 0
+    target[target > 39] = 1
     return sample, target
 
 

diff --git a/outrank/core_ranking.py b/outrank/core_ranking.py
@@ -32,6 +32,7 @@
 from outrank.core_utils import internal_hash
 from outrank.core_utils import NominalFeatureSummary
 from outrank.core_utils import NumericFeatureSummary
+from outrank.core_utils import is_prior_heuristic
 from outrank.feature_transformations.ranking_transformers import FeatureTransformerGeneric
 from outrank.feature_transformations.ranking_transformers import FeatureTransformerNoise
 
@@ -50,12 +51,15 @@
 def prior_combinations_sample(combinations: list[tuple[Any, ...]], args: Any) -> list[tuple[Any, ...]]:
     """Make sure only relevant subspace of combinations is selected based on prior counts"""
 
-    if len(GLOBAL_PRIOR_COMB_COUNTS) == 0:
-        for combination in combinations:
-            GLOBAL_PRIOR_COMB_COUNTS[combination] += 1
-        tmp = combinations[:args.combination_number_upper_bound]
-    else:
-        tmp = list(x[0] for x in sorted(GLOBAL_PRIOR_COMB_COUNTS.items(), key=lambda x:x[1], reverse=False))[:args.combination_number_upper_bound]
+    if len(combinations) == 0:
+        return []
+
+    missing_combinations = set(set(combinations)).difference(GLOBAL_PRIOR_COMB_COUNTS.keys())
+    if len(missing_combinations) > 0:
+        for combination in missing_combinations:
+            GLOBAL_PRIOR_COMB_COUNTS[combination] = 0
+
+    tmp = sorted(combinations, key=GLOBAL_PRIOR_COMB_COUNTS.get, reverse=False)[:args.combination_number_upper_bound]
 
     for combination in tmp:
         GLOBAL_PRIOR_COMB_COUNTS[combination] += 1
@@ -115,6 +119,12 @@ def mixed_rank_graph(
     out_time_struct['encoding_columns'] = end_enc_timer - start_enc_timer
 
     combinations = get_combinations_from_columns(all_columns, args)
+
+    reference_model_features = {}
+    if is_prior_heuristic(args):
+        reference_model_features = [(" AND ").join(tuple(sorted(item.split(",")))) for item in extract_features_from_reference_JSON(args.reference_model_JSON, all_features=True)]
+        combinations = [comb for comb in combinations if comb[0] not in reference_model_features and comb[1] not in reference_model_features]
+
     combinations = prior_combinations_sample(combinations, args)
     random.shuffle(combinations)
 
@@ -132,7 +142,7 @@ def mixed_rank_graph(
 
     # starmap is an alternative that is slower unfortunately (but nicer)
     def get_grounded_importances_estimate(combination: tuple[str]) -> Any:
-        return get_importances_estimate_pairwise(combination, args, tmp_df=tmp_df)
+        return get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df=tmp_df)
 
     start_enc_timer = timer()
     with cpu_pool as p:
@@ -176,7 +186,6 @@ def enrich_with_transformations(
 
 def compute_combined_features(
     input_dataframe: pd.DataFrame,
-    logger: Any,
     args: Any,
     pbar: Any,
     is_3mr: bool = False,
@@ -189,19 +198,25 @@ def compute_combined_features(
     join_string = ' AND_REL ' if is_3mr else ' AND '
     interaction_order = 2 if is_3mr else args.interaction_order
 
+    model_combinations = []
+    full_combination_space = []
+
+
+    if args.interaction_order > 1:
+            full_combination_space = list(
+                itertools.combinations(all_columns, interaction_order),
+            )
+    full_combination_space = prior_combinations_sample(full_combination_space, args)
+
     if args.reference_model_JSON != '':
-        combined_features = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True)
-        full_combination_space = [combination.split(',') for combination in combined_features]
-    else:
-        full_combination_space = list(
-            itertools.combinations(all_columns, interaction_order),
-        )
+        model_combinations = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True)
+        model_combinations = [tuple(sorted(combination.split(','))) for combination in model_combinations]
+        if not is_prior_heuristic(args):
+            full_combination_space = model_combinations
+
+    if is_prior_heuristic(args):
+        full_combination_space = full_combination_space + [tuple for tuple in model_combinations if tuple not in full_combination_space]
 
-    if args.combination_number_upper_bound and args.reference_model_JSON != '':
-        random.shuffle(full_combination_space)
-        full_combination_space = full_combination_space[
-            : args.combination_number_upper_bound
-        ]
 
     com_counter = 0
     new_feature_hash = {}
@@ -531,7 +546,7 @@ def compute_batch_ranking(
     if args.interaction_order > 1 or args.reference_model_JSON:
         pbar.set_description('Constructing new features')
         input_dataframe = compute_combined_features(
-            input_dataframe, logger, args, pbar,
+            input_dataframe, args, pbar,
         )
 
     # in case of 3mr we compute the score of combinations against the target
@@ -540,7 +555,7 @@ def compute_batch_ranking(
             'Constructing features for computing relations in 3mr',
         )
         input_dataframe = compute_combined_features(
-            input_dataframe, logger, args, pbar, True,
+            input_dataframe, args, pbar, True,
         )
 
     if args.include_noise_baseline_features == 'True' and args.heuristic != 'Constant':

diff --git a/outrank/core_utils.py b/outrank/core_utils.py
@@ -393,14 +393,17 @@ def parse_csv_raw(data_path) -> DatasetInformationStorage:
     )
 
 
-def extract_features_from_reference_JSON(json_path: str, combined_features_only = False) -> set[Any]:
+def extract_features_from_reference_JSON(json_path: str, combined_features_only = False, all_features = False) -> set[Any]:
     """Given a model's JSON, extract unique features"""
 
     with open(json_path) as jp:
         content = json.load(jp)
 
     unique_features = set()
     feature_space = content['desc'].get('features', [])
+    if all_features:
+        return set(feature_space)
+
     fields_space = content['desc'].get('fields', [])
     joint_space = feature_space + fields_space
 
@@ -641,3 +644,8 @@ def summarize_rare_counts(
     final_df.to_csv(
         f'{args.output_folder}/feature_sparsity_summary.tsv', index=False, sep='\t',
     )
+
+
+def is_prior_heuristic(args: Any) -> bool:
+    """Return True when a '-prior' heuristic is selected and a reference model JSON is set."""
+    return '-prior' in args.heuristic and bool(args.reference_model_JSON)
diff --git a/outrank/task_selftest.py b/outrank/task_selftest.py
@@ -1,6 +1,5 @@
 # helper set of methods that enable anywhere verification of core functions
 from __future__ import annotations
-
 import logging
 import os
 import shutil
@@ -22,16 +21,16 @@ def conduct_self_test():
         'outrank --task data_generator --num_synthetic_rows 100000', shell=True,
     )
     subprocess.run(
-        'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --combination_number_upper_bound 60;',
+        'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw;',
         shell=True,
     )
 
     dfx = pd.read_csv('ranking_outputs/pairwise_ranks.tsv', sep='\t')
 
     logger.info("Verifying output's properties ..")
-    assert dfx.shape[0] == 120
+    assert dfx.shape[0] == 201
     assert dfx.shape[1] == 3
-    assert dfx['FeatureA'].values.tolist().pop() == 'label-(81; 100)' or dfx['FeatureB'].values.tolist().pop() == 'label-(81; 100)'
+    assert dfx['FeatureA'].values.tolist().pop() == 'label-(2; 100)' or dfx['FeatureB'].values.tolist().pop() == 'label-(2; 100)'
 
     to_remove = ['ranking_outputs', 'test_data_synthetic']
     for path in to_remove:
@@ -40,3 +39,7 @@ def conduct_self_test():
             shutil.rmtree(path)
 
     logger.info('All tests passed, OutRank seems in shape \N{winking face}')
+
+
+if __name__ == '__main__':
+    conduct_self_test()
diff --git a/outrank/task_summary.py b/outrank/task_summary.py
@@ -37,9 +37,10 @@ def outrank_task_result_summary(args):
 
     min_score = np.min(final_df[f'Score {args.heuristic}'].values)
     max_score = np.max(final_df[f'Score {args.heuristic}'].values)
-    final_df[f'Score {args.heuristic}'] = (
-        final_df[f'Score {args.heuristic}'] - min_score
-    ) / (max_score - min_score)
+    if "MI" in args.heuristic:
+        final_df[f'Score {args.heuristic}'] = (
+            final_df[f'Score {args.heuristic}'] - min_score
+        ) / (max_score - min_score)
     logging.info(f'Storing summary files to {args.output_folder}')
     pd.set_option('display.max_rows', None, 'display.max_columns', None)
     singles_path = os.path.join(args.output_folder, 'feature_singles.tsv')

diff --git a/tests/ranking_module_test.py b/tests/ranking_module_test.py
@@ -82,7 +82,7 @@ def test_compute_combinations(self):
         random_df.columns = ['F1', 'F2', 'F3']
         local_pbar = tqdm.tqdm(total=100, position=0)
         transformed_df = compute_combined_features(
-            random_df, None, args, local_pbar,
+            random_df, args, local_pbar,
         )
         self.assertEqual(transformed_df.shape[1], 4)
 
@@ -91,7 +91,7 @@ def test_compute_combinations(self):
         random_df = pd.DataFrame(random_matrix)
         random_df.columns = ['F1', 'F2', 'F3']
         transformed_df = compute_combined_features(
-            random_df, None, args, local_pbar,
+            random_df, args, local_pbar,
         )
         self.assertEqual(transformed_df.shape[1], 6)
 

diff --git a/tests/test_ref_model.json b/tests/test_ref_model.json
@@ -0,0 +1,5 @@
+{
+    "desc": {
+        "features": ["f0","f1","f0,f1"]
+    }
+}