Skip to content

Commit

Permalink
Merge pull request #66 from outbrain/log_priors
Browse files Browse the repository at this point in the history
LR with priors initial implementation
  • Loading branch information
SkBlaz committed Apr 3, 2024
2 parents 1162595 + 5dd9dd9 commit d6dc5d3
Show file tree
Hide file tree
Showing 9 changed files with 125 additions and 53 deletions.
21 changes: 21 additions & 0 deletions examples/run_ranking_prior.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
##########################################################################################################
# An OutRank invocation using a prior-aware heuristic (surrogate-SGD-prior) and a reference model JSON.  #
##########################################################################################################

# This run scores features with the surrogate-SGD-prior heuristic, which takes a reference model
# (passed via --reference_model_JSON) into account, and then summarizes and visualizes the outputs,
# including other relevant statistics.
# hint - if unsure what parameters do, you can always run "outrank --help"

# Both path variables must be set before running. They are quoted so paths containing spaces stay
# a single argument, and ${VAR:?msg} aborts early with a message instead of silently handing
# outrank a shifted argument list when a variable is unset or empty.
outrank \
    --task all \
    --data_path "${PATH_TO_YOUR_DATA:?set PATH_TO_YOUR_DATA to the input data location}" \
    --data_source ob-csv \
    --heuristic surrogate-SGD-prior \
    --target_ranking_only True \
    --interaction_order 2 \
    --combination_number_upper_bound 2048 \
    --num_threads 12 \
    --output_folder ./some_output_folder \
    --subsampling 100 \
    --minibatch_size 10000 \
    --label_column info_click_valid \
    --reference_model_JSON "${PATH_TO_YOUR_REFERENCE_MODEL:?set PATH_TO_YOUR_REFERENCE_MODEL to the reference model JSON}"
58 changes: 37 additions & 21 deletions outrank/algorithms/importance_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,20 @@
import pandas as pd
from scipy.stats import pearsonr
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC

from outrank.core_utils import is_prior_heuristic


logger = logging.getLogger('syn-logger')
logger.setLevel(logging.DEBUG)

num_folds = 4

try:
from outrank.algorithms.feature_ranking import ranking_mi_numba

Expand All @@ -38,13 +43,11 @@ def sklearn_MI(vector_first: Any, vector_second: Any) -> float:


def sklearn_surrogate(
vector_first: Any, vector_second: Any, surrogate_model: str,
vector_first: Any, vector_second: Any, X: Any, surrogate_model: str
) -> float:
if surrogate_model == 'surrogate-LR':
clf = LogisticRegression(max_iter=100000)
elif surrogate_model == 'surrogate-SVM':
clf = SVC(gamma='auto', probability=True)


clf = initialize_classifier(surrogate_model)

transf = OneHotEncoder()

# They do not commute, swap if needed
Expand All @@ -54,20 +57,17 @@ def sklearn_surrogate(
vector_first = vector_third
del vector_third

unique_values, counts = np.unique(vector_second, return_counts=True)

# Establish min support for this type of ranking.
if counts[0] < len(unique_values) * (2**5):
estimate_feature_importance = 0

if X.size <= 1:
X = vector_first.reshape(-1, 1)
else:
vector_first = transf.fit_transform(vector_first.reshape(-1, 1))
estimate_feature_importance_list = cross_val_score(
clf, vector_first, vector_second, scoring='neg_log_loss', cv=4,
)
X = np.concatenate((X, vector_first.reshape(-1, 1)), axis=1)

estimate_feature_importance = 1 + \
np.median(estimate_feature_importance_list)
X = transf.fit_transform(X)
estimate_feature_importance_list = cross_val_score(
clf, X, vector_second, scoring='neg_log_loss', cv=num_folds,
)
estimate_feature_importance = 1 + \
np.median(estimate_feature_importance_list)

return estimate_feature_importance

Expand Down Expand Up @@ -97,7 +97,7 @@ def sklearn_mi_adj(vector_first, vector_second):
return estimate_feature_importance


def get_importances_estimate_pairwise(combination, args, tmp_df):
def get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df):
"""A method for parallel importances estimation. As interaction scoring is independent, individual scores can be computed in parallel."""

feature_one = combination[0]
Expand All @@ -122,8 +122,12 @@ def get_importances_estimate_pairwise(combination, args, tmp_df):
estimate_feature_importance = sklearn_MI(vector_first, vector_second)

elif 'surrogate-' in args.heuristic:
X = np.array(float)
if is_prior_heuristic(args) and (len(reference_model_features) > 0):
X = tmp_df[reference_model_features].values

estimate_feature_importance = sklearn_surrogate(
vector_first, vector_second, args.heuristic,
vector_first, vector_second, X, args.heuristic
)

elif 'MI-numba' in args.heuristic:
Expand Down Expand Up @@ -213,3 +217,15 @@ def get_importances_estimate_nonmyopic(args: Any, tmp_df: pd.DataFrame):
# TODO - nonmyopic algorithms - tmp_df \ args.label vs. label
# TODO - this is to be executed directly on df - no need for parallel kernel(s)
pass


def initialize_classifier(surrogate_model: str) -> Any:
    """Instantiate the scikit-learn classifier used as the surrogate model.

    Matching is by substring, so suffixed heuristic names such as
    ``surrogate-SGD-prior`` resolve to the same estimator as their base name.

    Args:
        surrogate_model: Heuristic name, e.g. ``surrogate-LR``,
            ``surrogate-SVM`` or ``surrogate-SGD``.

    Returns:
        A new, unfitted estimator. Unrecognised names fall back to the SGD
        classifier after a warning is logged.
    """
    if 'surrogate-LR' in surrogate_model:
        return LogisticRegression(max_iter=100000)

    if 'surrogate-SVM' in surrogate_model:
        # probability=True enables predict_proba, which log-loss scoring needs.
        return SVC(gamma='auto', probability=True)

    if 'surrogate-SGD' not in surrogate_model:
        # Use the module-level logger so this warning is routed like the
        # rest of this module's messages instead of via the root logger.
        logger.warning(f'The chosen surrogate model {surrogate_model} is not supported, falling back to surrogate-SGD')

    # Single construction point for both the explicit SGD choice and the fallback.
    return SGDClassifier(max_iter=100000, loss='log_loss')
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ def generate_random_matrix(num_features=100, size=20000):
target = sample[:, 30]
# Some noise

target[target < 20] = 0
target[target < 40] = 0
target[target > 39] = 1
return sample, target


Expand Down
57 changes: 36 additions & 21 deletions outrank/core_ranking.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from outrank.core_utils import internal_hash
from outrank.core_utils import NominalFeatureSummary
from outrank.core_utils import NumericFeatureSummary
from outrank.core_utils import is_prior_heuristic
from outrank.feature_transformations.ranking_transformers import FeatureTransformerGeneric
from outrank.feature_transformations.ranking_transformers import FeatureTransformerNoise

Expand All @@ -50,12 +51,15 @@
def prior_combinations_sample(combinations: list[tuple[Any, ...]], args: Any) -> list[tuple[Any, ...]]:
"""Make sure only relevant subspace of combinations is selected based on prior counts"""

if len(GLOBAL_PRIOR_COMB_COUNTS) == 0:
for combination in combinations:
GLOBAL_PRIOR_COMB_COUNTS[combination] += 1
tmp = combinations[:args.combination_number_upper_bound]
else:
tmp = list(x[0] for x in sorted(GLOBAL_PRIOR_COMB_COUNTS.items(), key=lambda x:x[1], reverse=False))[:args.combination_number_upper_bound]
if len(combinations) == 0:
return []

missing_combinations = set(set(combinations)).difference(GLOBAL_PRIOR_COMB_COUNTS.keys())
if len(missing_combinations) > 0:
for combination in missing_combinations:
GLOBAL_PRIOR_COMB_COUNTS[combination] = 0

tmp = sorted(combinations, key=GLOBAL_PRIOR_COMB_COUNTS.get, reverse=False)[:args.combination_number_upper_bound]

for combination in tmp:
GLOBAL_PRIOR_COMB_COUNTS[combination] += 1
Expand Down Expand Up @@ -115,6 +119,12 @@ def mixed_rank_graph(
out_time_struct['encoding_columns'] = end_enc_timer - start_enc_timer

combinations = get_combinations_from_columns(all_columns, args)

reference_model_features = {}
if is_prior_heuristic(args):
reference_model_features = [(" AND ").join(tuple(sorted(item.split(",")))) for item in extract_features_from_reference_JSON(args.reference_model_JSON, all_features=True)]
combinations = [comb for comb in combinations if comb[0] not in reference_model_features and comb[1] not in reference_model_features]

combinations = prior_combinations_sample(combinations, args)
random.shuffle(combinations)

Expand All @@ -132,7 +142,7 @@ def mixed_rank_graph(

# starmap is an alternative that is slower unfortunately (but nicer)
def get_grounded_importances_estimate(combination: tuple[str]) -> Any:
return get_importances_estimate_pairwise(combination, args, tmp_df=tmp_df)
return get_importances_estimate_pairwise(combination, reference_model_features, args, tmp_df=tmp_df)

start_enc_timer = timer()
with cpu_pool as p:
Expand Down Expand Up @@ -176,7 +186,6 @@ def enrich_with_transformations(

def compute_combined_features(
input_dataframe: pd.DataFrame,
logger: Any,
args: Any,
pbar: Any,
is_3mr: bool = False,
Expand All @@ -189,19 +198,25 @@ def compute_combined_features(
join_string = ' AND_REL ' if is_3mr else ' AND '
interaction_order = 2 if is_3mr else args.interaction_order

model_combinations = []
full_combination_space = []


if args.interaction_order > 1:
full_combination_space = list(
itertools.combinations(all_columns, interaction_order),
)
full_combination_space = prior_combinations_sample(full_combination_space, args)

if args.reference_model_JSON != '':
combined_features = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True)
full_combination_space = [combination.split(',') for combination in combined_features]
else:
full_combination_space = list(
itertools.combinations(all_columns, interaction_order),
)
model_combinations = extract_features_from_reference_JSON(args.reference_model_JSON, combined_features_only = True)
model_combinations = [tuple(sorted(combination.split(','))) for combination in model_combinations]
if not is_prior_heuristic(args):
full_combination_space = model_combinations

if is_prior_heuristic(args):
full_combination_space = full_combination_space + [tuple for tuple in model_combinations if tuple not in full_combination_space]

if args.combination_number_upper_bound and args.reference_model_JSON != '':
random.shuffle(full_combination_space)
full_combination_space = full_combination_space[
: args.combination_number_upper_bound
]

com_counter = 0
new_feature_hash = {}
Expand Down Expand Up @@ -531,7 +546,7 @@ def compute_batch_ranking(
if args.interaction_order > 1 or args.reference_model_JSON:
pbar.set_description('Constructing new features')
input_dataframe = compute_combined_features(
input_dataframe, logger, args, pbar,
input_dataframe, args, pbar,
)

# in case of 3mr we compute the score of combinations against the target
Expand All @@ -540,7 +555,7 @@ def compute_batch_ranking(
'Constructing features for computing relations in 3mr',
)
input_dataframe = compute_combined_features(
input_dataframe, logger, args, pbar, True,
input_dataframe, args, pbar, True,
)

if args.include_noise_baseline_features == 'True' and args.heuristic != 'Constant':
Expand Down
12 changes: 11 additions & 1 deletion outrank/core_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,14 +393,17 @@ def parse_csv_raw(data_path) -> DatasetInformationStorage:
)


def extract_features_from_reference_JSON(json_path: str, combined_features_only = False) -> set[Any]:
def extract_features_from_reference_JSON(json_path: str, combined_features_only = False, all_features = False) -> set[Any]:
"""Given a model's JSON, extract unique features"""

with open(json_path) as jp:
content = json.load(jp)

unique_features = set()
feature_space = content['desc'].get('features', [])
if all_features:
return set(feature_space)

fields_space = content['desc'].get('fields', [])
joint_space = feature_space + fields_space

Expand Down Expand Up @@ -641,3 +644,10 @@ def summarize_rare_counts(
final_df.to_csv(
f'{args.output_folder}/feature_sparsity_summary.tsv', index=False, sep='\t',
)


def is_prior_heuristic(args: Any) -> bool:
    """Tell whether a prior-based heuristic is requested and usable.

    Args:
        args: Parsed arguments exposing ``heuristic`` (a string) and
            ``reference_model_JSON``.

    Returns:
        True only if ``args.heuristic`` contains ``-prior`` and
        ``args.reference_model_JSON`` is truthy (e.g. a non-empty path);
        False otherwise.
    """
    # bool() keeps the declared return type: a bare `and` would hand back
    # the reference_model_JSON value itself rather than True/False.
    return bool("-prior" in args.heuristic and args.reference_model_JSON)

11 changes: 7 additions & 4 deletions outrank/task_selftest.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Helper methods that enable verification of core functions from anywhere
from __future__ import annotations

import logging
import os
import shutil
Expand All @@ -22,16 +21,16 @@ def conduct_self_test():
'outrank --task data_generator --num_synthetic_rows 100000', shell=True,
)
subprocess.run(
'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --combination_number_upper_bound 60;',
'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw;',
shell=True,
)

dfx = pd.read_csv('ranking_outputs/pairwise_ranks.tsv', sep='\t')

logger.info("Verifying output's properties ..")
assert dfx.shape[0] == 120
assert dfx.shape[0] == 201
assert dfx.shape[1] == 3
assert dfx['FeatureA'].values.tolist().pop() == 'label-(81; 100)' or dfx['FeatureB'].values.tolist().pop() == 'label-(81; 100)'
assert dfx['FeatureA'].values.tolist().pop() == 'label-(2; 100)' or dfx['FeatureB'].values.tolist().pop() == 'label-(2; 100)'

to_remove = ['ranking_outputs', 'test_data_synthetic']
for path in to_remove:
Expand All @@ -40,3 +39,7 @@ def conduct_self_test():
shutil.rmtree(path)

logger.info('All tests passed, OutRank seems in shape \N{winking face}')


if __name__ == '__main__':
conduct_self_test()
7 changes: 4 additions & 3 deletions outrank/task_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,10 @@ def outrank_task_result_summary(args):

min_score = np.min(final_df[f'Score {args.heuristic}'].values)
max_score = np.max(final_df[f'Score {args.heuristic}'].values)
final_df[f'Score {args.heuristic}'] = (
final_df[f'Score {args.heuristic}'] - min_score
) / (max_score - min_score)
if "MI" in args.heuristic:
final_df[f'Score {args.heuristic}'] = (
final_df[f'Score {args.heuristic}'] - min_score
) / (max_score - min_score)
logging.info(f'Storing summary files to {args.output_folder}')
pd.set_option('display.max_rows', None, 'display.max_columns', None)
singles_path = os.path.join(args.output_folder, 'feature_singles.tsv')
Expand Down
4 changes: 2 additions & 2 deletions tests/ranking_module_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def test_compute_combinations(self):
random_df.columns = ['F1', 'F2', 'F3']
local_pbar = tqdm.tqdm(total=100, position=0)
transformed_df = compute_combined_features(
random_df, None, args, local_pbar,
random_df, args, local_pbar,
)
self.assertEqual(transformed_df.shape[1], 4)

Expand All @@ -91,7 +91,7 @@ def test_compute_combinations(self):
random_df = pd.DataFrame(random_matrix)
random_df.columns = ['F1', 'F2', 'F3']
transformed_df = compute_combined_features(
random_df, None, args, local_pbar,
random_df, args, local_pbar,
)
self.assertEqual(transformed_df.shape[1], 6)

Expand Down
5 changes: 5 additions & 0 deletions tests/test_ref_model.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"desc": {
"features": ["f0","f1","f0,f1"]
}
}

0 comments on commit d6dc5d3

Please sign in to comment.