LR with priors initial implementation #66

Merged · 18 commits · Apr 3, 2024
21 changes: 21 additions & 0 deletions examples/run_ranking_prior.sh
@@ -0,0 +1,21 @@
##########################################################################################################
# An OutRank invocation that ranks features against a reference model (prior), using the                #
# surrogate-SGD-prior heuristic. It includes visualizations and other relevant statistics.              #
##########################################################################################################

# This run compares features "one-at-a-time" against the reference model, then summarizes and visualizes the outputs.
# Hint: if unsure what a parameter does, you can always run "outrank --help".

outrank \
--task all \
--data_path "$PATH_TO_YOUR_DATA" \
--data_source ob-csv \
--heuristic surrogate-SGD-prior \
--target_ranking_only True \
--interaction_order 2 \
--combination_number_upper_bound 2048 \
--num_threads 12 \
--output_folder ./some_output_folder \
--subsampling 100 \
--minibatch_size 10000 \
--label_column info_click_valid \
--reference_model_JSON "$PATH_TO_YOUR_REFERENCE_MODEL"
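Note: the file passed via --reference_model_JSON is expected to follow the model-description format of the test fixture added in this PR (tests/test_ref_model.json, shown at the bottom of this diff), where a comma-separated entry such as "f0,f1" denotes a combined feature:

{
    "desc": {
        "features": ["f0", "f1", "f0,f1"]
    }
}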
56 changes: 38 additions & 18 deletions outrank/algorithms/importance_estimator.py
@@ -11,15 +11,20 @@
 import pandas as pd
 from scipy.stats import pearsonr
 from sklearn.feature_selection import mutual_info_classif
-from sklearn.linear_model import LogisticRegression
+from sklearn.linear_model import LogisticRegression, SGDClassifier
 from sklearn.metrics import adjusted_mutual_info_score
 from sklearn.model_selection import cross_val_score
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.svm import SVC
 
+from outrank.core_utils import is_prior_heuristic
 
 logger = logging.getLogger('syn-logger')
 logger.setLevel(logging.DEBUG)
 
+num_folds = 4
+
 try:
     from outrank.algorithms.feature_ranking import ranking_mi_numba

@@ -38,13 +43,11 @@ def sklearn_MI(vector_first: Any, vector_second: Any) -> float:


 def sklearn_surrogate(
-    vector_first: Any, vector_second: Any, surrogate_model: str,
+    vector_first: Any, vector_second: Any, X: Any, surrogate_model: str,
 ) -> float:
-    if surrogate_model == 'surrogate-LR':
-        clf = LogisticRegression(max_iter=100000)
-    elif surrogate_model == 'surrogate-SVM':
-        clf = SVC(gamma='auto', probability=True)
-
+    clf = initialize_classifier(surrogate_model)
 
     transf = OneHotEncoder()

# They do not commute, swap if needed
@@ -56,18 +59,19 @@ def sklearn_surrogate(
 
     unique_values, counts = np.unique(vector_second, return_counts=True)
 
-    # Establish min support for this type of ranking.
-    if counts[0] < len(unique_values) * (2**5):
-        estimate_feature_importance = 0
-
-    else:
+    if X.size <= 1:
         vector_first = transf.fit_transform(vector_first.reshape(-1, 1))
         estimate_feature_importance_list = cross_val_score(
-            clf, vector_first, vector_second, scoring='neg_log_loss', cv=4,
+            clf, vector_first, vector_second, scoring='neg_log_loss', cv=num_folds,
         )
-
         estimate_feature_importance = 1 + \
             np.median(estimate_feature_importance_list)
+    else:
+        X = np.concatenate((X, vector_first.reshape(-1, 1)), axis=1)
+        X = transf.fit_transform(X)
+        estimate_feature_importance_list = cross_val_score(
+            clf, X, vector_second, scoring='neg_log_loss', cv=num_folds,
+        )
+        estimate_feature_importance = 1 + \
+            np.median(estimate_feature_importance_list)
 
     return estimate_feature_importance
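For intuition: the surrogate score is 1 + the median cross-validated negative log-loss, so a highly predictive feature lands near 1, while an uninformative one drifts toward 1 - log 2 ≈ 0.31 for a balanced binary target. A minimal, self-contained sketch of this rule on synthetic data (illustrative names, not part of the PR):

import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder

rng = np.random.default_rng(0)
feature = rng.integers(0, 10, size=2000)   # a categorical candidate feature
label = (feature % 2 == 0).astype(int)     # label fully determined by the feature

encoded = OneHotEncoder().fit_transform(feature.reshape(-1, 1))
clf = SGDClassifier(max_iter=100000, loss='log_loss')
scores = cross_val_score(clf, encoded, label, scoring='neg_log_loss', cv=4)
print(1 + np.median(scores))               # close to 1.0 for a predictive feature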

@@ -97,7 +101,7 @@ def sklearn_mi_adj(vector_first, vector_second):
return estimate_feature_importance


-def get_importances_estimate_pairwise(combination, args, tmp_df):
+def get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df):
     """A method for parallel importances estimation. As interaction scoring is independent, individual scores can be computed in parallel."""
 
     feature_one = combination[0]
@@ -122,8 +126,12 @@ def get_importances_estimate_pairwise(combination, args, tmp_df):
estimate_feature_importance = sklearn_MI(vector_first, vector_second)

     elif 'surrogate-' in args.heuristic:
+        X = np.array(float)
+        if is_prior_heuristic(args) and (len(reference_model_features) > 0):
+            X = tmp_df[reference_model_features].values
+
         estimate_feature_importance = sklearn_surrogate(
-            vector_first, vector_second, args.heuristic,
+            vector_first, vector_second, X, args.heuristic,
         )
 
     elif 'MI-numba' in args.heuristic:
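Note: np.array(float) builds a 0-d placeholder whose .size is 1, which is exactly what the X.size <= 1 branch in sklearn_surrogate checks to detect that no prior (reference-model) features were supplied. A quick illustration:

import numpy as np
assert np.array(float).size == 1      # 0-d placeholder: no prior features
assert np.zeros((100, 2)).size > 1    # an actual prior-feature matrix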
@@ -213,3 +221,15 @@ def get_importances_estimate_nonmyopic(args: Any, tmp_df: pd.DataFrame):
# TODO - nonmyopic algorithms - tmp_df \ args.label vs. label
# TODO - this is to be executed directly on df - no need for parallel kernel(s)
pass


def initialize_classifier(surrogate_model: str):
if 'surrogate-LR' in surrogate_model:
return LogisticRegression(max_iter=100000)
elif 'surrogate-SVM' in surrogate_model:
return SVC(gamma='auto', probability=True)
elif 'surrogate-SGD' in surrogate_model:
return SGDClassifier(max_iter=100000, loss='log_loss')
else:
logging.warning(f'The chosen surrogate model {surrogate_model} is not supported, falling back to surrogate-SGD')
return SGDClassifier(max_iter=100000, loss='log_loss')
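A quick dispatch check for the helper above; matching is substring-based, so 'surrogate-SGD-prior' also selects the SGD path (illustrative):

print(type(initialize_classifier('surrogate-LR')).__name__)          # LogisticRegression
print(type(initialize_classifier('surrogate-SGD-prior')).__name__)   # SGDClassifier
print(type(initialize_classifier('unknown')).__name__)               # SGDClassifier, after a warning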
@@ -13,7 +13,8 @@ def generate_random_matrix(num_features=100, size=20000):
target = sample[:, 30]
# Some noise

-    target[target < 20] = 0
+    target[target < 40] = 0
+    target[target > 39] = 1
return sample, target
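The new thresholds binarize the target in one pass: values below 40 map to 0 and values above 39 map to 1, so downstream classifiers see a clean two-class label. A quick check (illustrative):

import numpy as np
target = np.array([5, 39, 40, 97])
target[target < 40] = 0
target[target > 39] = 1
print(target)   # [0 0 1 1]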


41 changes: 30 additions & 11 deletions outrank/core_ranking.py
@@ -32,6 +32,7 @@
from outrank.core_utils import internal_hash
from outrank.core_utils import NominalFeatureSummary
from outrank.core_utils import NumericFeatureSummary
from outrank.core_utils import is_prior_heuristic
from outrank.feature_transformations.ranking_transformers import FeatureTransformerGeneric
from outrank.feature_transformations.ranking_transformers import FeatureTransformerNoise

@@ -115,8 +116,12 @@ def mixed_rank_graph(
out_time_struct['encoding_columns'] = end_enc_timer - start_enc_timer

combinations = get_combinations_from_columns(all_columns, args)
-    combinations = prior_combinations_sample(combinations, args)
-    random.shuffle(combinations)
+    #combinations = prior_combinations_sample(combinations, args)
+    #random.shuffle(combinations)
+
+    reference_model_features = {}
+    if is_prior_heuristic(args):
+        reference_model_features = [(" AND ").join(tuple(sorted(item.split(",")))) for item in extract_features_from_reference_JSON(args.reference_model_JSON, full_feature_space = True)]
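The comprehension above normalizes reference-model entries into OutRank's combined-feature naming: each comma-separated entry is split, sorted, and re-joined with ' AND '. For example (illustrative):

item = 'f1,f0'
print(' AND '.join(tuple(sorted(item.split(',')))))   # f0 AND f1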

if args.heuristic == 'Constant':
final_constant_imp = []
Expand All @@ -132,7 +137,7 @@ def mixed_rank_graph(

# starmap is an alternative that is slower unfortunately (but nicer)
def get_grounded_importances_estimate(combination: tuple[str]) -> Any:
-        return get_importances_estimate_pairwise(combination, args, tmp_df=tmp_df)
+        return get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df=tmp_df)

start_enc_timer = timer()
with cpu_pool as p:
@@ -189,19 +194,33 @@ def compute_combined_features(
join_string = ' AND_REL ' if is_3mr else ' AND '
interaction_order = 2 if is_3mr else args.interaction_order

-    if args.reference_model_JSON != '':
-        combined_features = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True)
-        full_combination_space = [combination.split(',') for combination in combined_features]
+    model_combinations = []
+    full_combination_space = []
+    if is_prior_heuristic(args):
+        model_combinations = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True)
+        model_combinations = [tuple(sorted(combination.split(','))) for combination in model_combinations]
+        if args.interaction_order > 1:
+            full_combination_space = list(
+                itertools.combinations(all_columns, interaction_order),
+            )
     else:
-        full_combination_space = list(
-            itertools.combinations(all_columns, interaction_order),
-        )
+        if args.reference_model_JSON != '':
+            model_combinations = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True)
+            model_combinations = [tuple(sorted(combination.split(','))) for combination in model_combinations]

Collaborator: combination delimiter could be a const, as it repeats.

+            full_combination_space = model_combinations
+        else:
+            full_combination_space = list(
+                itertools.combinations(all_columns, interaction_order),
+            )
 
-    if args.combination_number_upper_bound and args.reference_model_JSON != '':
+    if args.combination_number_upper_bound:
         random.shuffle(full_combination_space)
         full_combination_space = full_combination_space[
             : args.combination_number_upper_bound
         ]
+    if is_prior_heuristic(args):
+        full_combination_space = full_combination_space + [tuple for tuple in model_combinations if tuple not in full_combination_space]

Collaborator: isn't this second part list(set(model_combinations).difference(set(full_combination_space)))?



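On the reviewer's suggestion above: the set-difference form is equivalent in membership but not in order; the comprehension preserves the reference model's ordering, while the set version returns elements in arbitrary order (and requires hashable items, which these tuples are). A quick comparison (illustrative):

model_combinations = [('f0', 'f1'), ('f2', 'f3')]
full_combination_space = [('f0', 'f1')]

kept = [c for c in model_combinations if c not in full_combination_space]
via_set = list(set(model_combinations).difference(full_combination_space))
print(kept)      # [('f2', 'f3')], order preserved
print(via_set)   # same elements, arbitrary order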
com_counter = 0
new_feature_hash = {}
Expand All @@ -225,7 +244,7 @@ def compute_combined_features(
pbar.set_description('Concatenating into final frame ..')
input_dataframe = pd.concat([input_dataframe, tmp_df], axis=1)
del tmp_df

Collaborator: no need for this space.
return input_dataframe


12 changes: 11 additions & 1 deletion outrank/core_utils.py
@@ -393,14 +393,17 @@ def parse_csv_raw(data_path) -> DatasetInformationStorage:
)


-def extract_features_from_reference_JSON(json_path: str, combined_features_only = False) -> set[Any]:
+def extract_features_from_reference_JSON(json_path: str, combined_features_only = False, full_feature_space = False) -> set[Any]:
"""Given a model's JSON, extract unique features"""

with open(json_path) as jp:
content = json.load(jp)

unique_features = set()
feature_space = content['desc'].get('features', [])
    if full_feature_space:
        return set(feature_space)

Collaborator: full_feature_space sounds somewhat odd for a flag that computes a set.

fields_space = content['desc'].get('fields', [])
joint_space = feature_space + fields_space

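A hedged example of the new full_feature_space path, assuming the tests/test_ref_model.json fixture added in this PR as input:

features = extract_features_from_reference_JSON(
    'tests/test_ref_model.json', full_feature_space=True,
)
print(features)   # {'f0', 'f1', 'f0,f1'} (raw entries; combined features stay comma-delimited)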
@@ -641,3 +644,10 @@ def summarize_rare_counts(
final_df.to_csv(
f'{args.output_folder}/feature_sparsity_summary.tsv', index=False, sep='\t',
)


def is_prior_heuristic(args: Any):
    if "-prior" in args.heuristic and args.reference_model_JSON and args.reference_model_JSON != "":
        return True
    return False

Collaborator: missing return type.
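A usage sketch (the namespace stands in for parsed CLI args; values are illustrative):

from types import SimpleNamespace

args = SimpleNamespace(heuristic='surrogate-SGD-prior', reference_model_JSON='model.json')
print(is_prior_heuristic(args))   # True

args = SimpleNamespace(heuristic='surrogate-SGD', reference_model_JSON='model.json')
print(is_prior_heuristic(args))   # False: no '-prior' suffix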

10 changes: 6 additions & 4 deletions outrank/task_selftest.py
@@ -1,6 +1,5 @@
# helper set of methods that enable anywhere verification of core functions
from __future__ import annotations

import logging
import os
import shutil
@@ -22,16 +21,16 @@ def conduct_self_test():
'outrank --task data_generator --num_synthetic_rows 100000', shell=True,
)
subprocess.run(
-        'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --combination_number_upper_bound 60;',
+        'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw;',
shell=True,
)

dfx = pd.read_csv('ranking_outputs/pairwise_ranks.tsv', sep='\t')

logger.info("Verifying output's properties ..")
-    assert dfx.shape[0] == 120
+    assert dfx.shape[0] == 201
assert dfx.shape[1] == 3
-    assert dfx['FeatureA'].values.tolist().pop() == 'label-(81; 100)' or dfx['FeatureB'].values.tolist().pop() == 'label-(81; 100)'
+    assert dfx['FeatureA'].values.tolist().pop() == 'label-(2; 100)' or dfx['FeatureB'].values.tolist().pop() == 'label-(2; 100)'

to_remove = ['ranking_outputs', 'test_data_synthetic']
for path in to_remove:
Expand All @@ -40,3 +39,6 @@ def conduct_self_test():
shutil.rmtree(path)

logger.info('All tests passed, OutRank seems in shape \N{winking face}')

if __name__ == '__main__':
conduct_self_test()
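With the new entry-point guard, the self-test can also be invoked directly, assuming the outrank package is importable:

python -m outrank.task_selftest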
5 changes: 5 additions & 0 deletions tests/test_ref_model.json
@@ -0,0 +1,5 @@
{
"desc": {
"features": ["f0","f1","f0,f1"]
}
}