Skip to content

Commit

Permalink
Merge pull request #45 from outbrain/3mr-fixes
Browse files Browse the repository at this point in the history
3mr fixes
  • Loading branch information
SkBlaz committed Oct 9, 2023
2 parents ee41b67 + 4fe538c commit 05f9039
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 6 deletions.
7 changes: 7 additions & 0 deletions outrank/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,13 @@ def main():
help="Which ';'-separated features should be one-hot encoded into n new features (coverage analysis)",
)

parser.add_argument(
'--silent',
type=str,
default='False',
help='Suppress the logo and tips.',
)

parser.add_argument(
'--subfeature_mapping',
type=str,
Expand Down
23 changes: 21 additions & 2 deletions outrank/algorithms/importance_estimator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# A module for pairwise computation of importances -- entrypoint for the core ranking algorithm(s)
from __future__ import annotations

import logging
import operator
import traceback
from typing import Any
Expand All @@ -16,6 +17,9 @@
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC

logger = logging.getLogger('syn-logger')
logger.setLevel(logging.DEBUG)

try:
from outrank.algorithms.feature_ranking import ranking_mi_numba

Expand Down Expand Up @@ -99,6 +103,13 @@ def get_importances_estimate_pairwise(combination, args, tmp_df):
feature_one = combination[0]
feature_two = combination[1]

if feature_one not in tmp_df.columns:
logging.info(f'{feature_one} not found in the constructed data frame - consider increasing --combination_number_upper_bound for better coverage.')
return [feature_one, feature_two, 0]
elif feature_two not in tmp_df.columns:
logging.info(f'{feature_two} not found in the constructed data frame - consider increasing --combination_number_upper_bound for better coverage.')
return [feature_one, feature_two, 0]

vector_first = tmp_df[[feature_one]].values.ravel()
vector_second = tmp_df[[feature_two]].values.ravel()

Expand Down Expand Up @@ -156,10 +167,18 @@ def rank_features_3MR(
def calc_higher_order(feature, is_redundancy=True):
values = []
for feat in ranked_features:
interaction_tuple = (feat, feature)
if is_redundancy:
values.append(redundancy_dict[(feat, feature)])
if interaction_tuple in redundancy_dict:
values.append(redundancy_dict[interaction_tuple])
else:
logging.info('Not accounting for redundancy tuple {} - please increase the --combination_number_upper_bound for beter coverage of interactions/redundancies.')
else:
values.append(relational_dict[(feat, feature)])
if interaction_tuple in relational_dict:
values.append(relational_dict[interaction_tuple])
else:
logging.info('Not accounting for interaction tuple {} - please increase the --combination_number_upper_bound for beter coverage of interactions/redundancies.')

if strategy == 'sum':
return sum(values)
if strategy == 'mean':
Expand Down
6 changes: 5 additions & 1 deletion outrank/core_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,11 +182,14 @@ def parse_ob_line_vw(

# Hash multi-value tuples and store name-val mappings
for remaining_part in remainder:
core_parts = remaining_part.split(' ')
core_parts = remaining_part.strip().split(' ')
namespace_part = core_parts[0]
other_parts = '-'.join(x for x in core_parts[1:] if x != '')

if namespace_part in fw_col_mapping:
remainder_hash[fw_col_mapping[namespace_part]] = other_parts
else:
logging.error(f"Didn't find namespace {namespace_part}")

# Construct the consistently-mapped instance based on the remainder mapping
the_real_instance = [
Expand All @@ -200,6 +203,7 @@ def parse_ob_line_vw(
]

parts = [label] + the_real_instance

return parts


Expand Down
5 changes: 3 additions & 2 deletions outrank/task_ranking.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,9 @@ def outrank_task_conduct_ranking(args: Any):
if args.task in ['identify_rare_values', 'feature_summary_transformers']:
args.heuristic = 'Constant'

display_tool_name()
display_random_tip()
if args.silent != 'True':
display_tool_name()
display_random_tip()

dataset_info = get_dataset_info(args)

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def _read_description():
packages = [x for x in setuptools.find_packages() if x != 'test']
setuptools.setup(
name='outrank',
version='0.94.1',
version='0.94.2',
description='OutRank: Feature ranking for massive sparse data sets.',
long_description=_read_description(),
long_description_content_type='text/markdown',
Expand Down

0 comments on commit 05f9039

Please sign in to comment.