Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LR with priors initial implementation #66

Merged
merged 18 commits into from
Apr 3, 2024
35 changes: 23 additions & 12 deletions outrank/algorithms/importance_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import pandas as pd
from scipy.stats import pearsonr
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
Expand All @@ -38,12 +38,14 @@ def sklearn_MI(vector_first: Any, vector_second: Any) -> float:


def sklearn_surrogate(
vector_first: Any, vector_second: Any, surrogate_model: str,
vector_first: Any, vector_second: Any, X: Any, surrogate_model: str
) -> float:
if surrogate_model == 'surrogate-LR':
bmramor marked this conversation as resolved.
Show resolved Hide resolved
if 'surrogate-LR' in surrogate_model:
clf = LogisticRegression(max_iter=100000)
elif surrogate_model == 'surrogate-SVM':
elif 'surrogate-SVM' in surrogate_model:
clf = SVC(gamma='auto', probability=True)
elif 'surrogate-SGD' in surrogate_model:
clf = SGDClassifier(max_iter=100000, loss='log_loss')

transf = OneHotEncoder()

Expand All @@ -57,17 +59,22 @@ def sklearn_surrogate(
unique_values, counts = np.unique(vector_second, return_counts=True)
miha-jenko marked this conversation as resolved.
Show resolved Hide resolved

# Establish min support for this type of ranking.
if counts[0] < len(unique_values) * (2**5):
estimate_feature_importance = 0
# if counts[0] < len(unique_values) * (2**5):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's remove such comments

# estimate_feature_importance = 0

else:
if X.shape[0] == 0 and X.shape[1] == 0:
vector_first = transf.fit_transform(vector_first.reshape(-1, 1))
miha-jenko marked this conversation as resolved.
Show resolved Hide resolved
estimate_feature_importance_list = cross_val_score(
clf, vector_first, vector_second, scoring='neg_log_loss', cv=4,
)

estimate_feature_importance = 1 + \
np.median(estimate_feature_importance_list)
else:
X = np.concatenate((X,vector_first.reshape(-1, 1)), axis=1)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a space missing after the comma following X; I wonder why lint didn't catch that. @miha-jenko, any idea?

X = transf.fit_transform(X)
estimate_feature_importance_list = cross_val_score(
clf, X, vector_second, scoring='neg_log_loss', cv=4,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's put the number of folds at the top of the file as a constant for now

)
estimate_feature_importance = 1 + \
np.median(estimate_feature_importance_list)

return estimate_feature_importance

Expand Down Expand Up @@ -97,7 +104,7 @@ def sklearn_mi_adj(vector_first, vector_second):
return estimate_feature_importance


def get_importances_estimate_pairwise(combination, args, tmp_df):
def get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df):
"""A method for parallel importances estimation. As interaction scoring is independent, individual scores can be computed in parallel."""

feature_one = combination[0]
Expand All @@ -122,8 +129,12 @@ def get_importances_estimate_pairwise(combination, args, tmp_df):
estimate_feature_importance = sklearn_MI(vector_first, vector_second)

elif 'surrogate-' in args.heuristic:
X = np.array(float)
if ('-prior' in args.heuristic) and (len(reference_model_features) > 0):
X = tmp_df[reference_model_features].values

estimate_feature_importance = sklearn_surrogate(
vector_first, vector_second, args.heuristic,
vector_first, vector_second, X, args.heuristic
)

elif 'MI-numba' in args.heuristic:
Expand Down
6 changes: 5 additions & 1 deletion outrank/core_ranking.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,9 +130,13 @@ def mixed_rank_graph(
# Map the scoring calls to the worker pool
pbar.set_description('Allocating thread pool')

reference_model_features = {}
if 'prior' in args.heuristic:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you check for -prior at some point, but prior at some other point. Consider creating a helper function is_prior_heuristic or something, that unifies this behavior (and centralizes it)

reference_model_features = [(" AND ").join(item.split(",")) for item in extract_features_from_reference_JSON(args.reference_model_JSON, full_feature_space = True)]

# starmap is an alternative that is slower unfortunately (but nicer)
def get_grounded_importances_estimate(combination: tuple[str]) -> Any:
return get_importances_estimate_pairwise(combination, args, tmp_df=tmp_df)
return get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df=tmp_df)

start_enc_timer = timer()
with cpu_pool as p:
Expand Down
5 changes: 4 additions & 1 deletion outrank/core_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,14 +393,17 @@ def parse_csv_raw(data_path) -> DatasetInformationStorage:
)


def extract_features_from_reference_JSON(json_path: str, combined_features_only = False) -> set[Any]:
def extract_features_from_reference_JSON(json_path: str, combined_features_only = False, full_feature_space = False) -> set[Any]:
"""Given a model's JSON, extract unique features"""

with open(json_path) as jp:
content = json.load(jp)

unique_features = set()
feature_space = content['desc'].get('features', [])
if full_feature_space:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

full_feature_space sounds somewhat odd for a flag that computes a set

return set(feature_space)

fields_space = content['desc'].get('fields', [])
joint_space = feature_space + fields_space

Expand Down
6 changes: 5 additions & 1 deletion outrank/task_selftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def conduct_self_test():
'outrank --task data_generator --num_synthetic_rows 100000', shell=True,
)
subprocess.run(
'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --combination_number_upper_bound 60;',
'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --combination_number_upper_bound 60 --heuristic surrogate-SGD-prior --reference_model_JSON tests/test_ref_model.json;',
shell=True,
)

Expand All @@ -40,3 +40,7 @@ def conduct_self_test():
shutil.rmtree(path)

logger.info('All tests passed, OutRank seems in shape \N{winking face}')

bmramor marked this conversation as resolved.
Show resolved Hide resolved
if __name__ == '__main__':
conduct_self_test()

5 changes: 5 additions & 0 deletions tests/test_ref_model.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"desc": {
"features": ["f0","f1","f0,f1"]
}
}
Loading