diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 81b0ac6..0936032 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -37,9 +37,9 @@ repos: rev: 6.1.0 hooks: - id: flake8 -- repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.5.1 - hooks: - - id: mypy - additional_dependencies: [types-all] - exclude: ^testing/resources/ +# - repo: https://github.com/pre-commit/mirrors-mypy +# rev: v1.5.1 +# hooks: +# - id: mypy +# additional_dependencies: [types-all] +# exclude: ^testing/resources/ diff --git a/benchmarks/analyse_rankings.py b/benchmarks/analyse_rankings.py index cc0f724..7898a2b 100644 --- a/benchmarks/analyse_rankings.py +++ b/benchmarks/analyse_rankings.py @@ -6,6 +6,7 @@ import matplotlib.pyplot as plt + def extract_just_ranking(dfile): """Extract ranking from an output file.""" ranks = [] diff --git a/outrank/__main__.py b/outrank/__main__.py index 00a4295..1ccc1d2 100644 --- a/outrank/__main__.py +++ b/outrank/__main__.py @@ -1,10 +1,10 @@ from __future__ import annotations import argparse -import json import logging from outrank.task_generators import outrank_task_generate_data_set +from outrank.task_instance_ranking import outrank_task_rank_instances from outrank.task_ranking import outrank_task_conduct_ranking from outrank.task_selftest import conduct_self_test from outrank.task_summary import outrank_task_result_summary @@ -276,6 +276,9 @@ def main(): elif task == 'data_generator': outrank_task_generate_data_set(args) + elif task == 'instance_ranking': + outrank_task_rank_instances(args) + else: logging.info(f'Warning, the selected task: {task} does not exist.') diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py index e37ba1c..56953c5 100644 --- a/outrank/algorithms/importance_estimator.py +++ b/outrank/algorithms/importance_estimator.py @@ -11,7 +11,8 @@ import pandas as pd from scipy.stats import pearsonr from sklearn.feature_selection import 
mutual_info_classif -from sklearn.linear_model import LogisticRegression, SGDClassifier +from sklearn.linear_model import LogisticRegression +from sklearn.linear_model import SGDClassifier from sklearn.metrics import adjusted_mutual_info_score from sklearn.model_selection import cross_val_score from sklearn.preprocessing import OneHotEncoder @@ -19,7 +20,6 @@ from outrank.core_utils import is_prior_heuristic - logger = logging.getLogger('syn-logger') logger.setLevel(logging.DEBUG) @@ -43,11 +43,11 @@ def sklearn_MI(vector_first: Any, vector_second: Any) -> float: def sklearn_surrogate( - vector_first: Any, vector_second: Any, X: Any, surrogate_model: str + vector_first: Any, vector_second: Any, X: Any, surrogate_model: str, ) -> float: - + clf = initialize_classifier(surrogate_model) - + transf = OneHotEncoder() # They do not commute, swap if needed @@ -67,7 +67,7 @@ def sklearn_surrogate( clf, X, vector_second, scoring='neg_log_loss', cv=num_folds, ) estimate_feature_importance = 1 + \ - np.median(estimate_feature_importance_list) + np.median(estimate_feature_importance_list) return estimate_feature_importance @@ -127,7 +127,7 @@ def get_importances_estimate_pairwise(combination, reference_model_features, arg X = tmp_df[reference_model_features].values estimate_feature_importance = sklearn_surrogate( - vector_first, vector_second, X, args.heuristic + vector_first, vector_second, X, args.heuristic, ) elif 'MI-numba' in args.heuristic: diff --git a/outrank/core_ranking.py b/outrank/core_ranking.py index 30f892c..91b8381 100644 --- a/outrank/core_ranking.py +++ b/outrank/core_ranking.py @@ -21,18 +21,20 @@ import pandas as pd import tqdm -from outrank.algorithms.importance_estimator import get_importances_estimate_pairwise -from outrank.algorithms.sketches.counting_counters_ordinary import PrimitiveConstrainedCounter -from outrank.algorithms.sketches.counting_ultiloglog import ( - HyperLogLogWCache as HyperLogLog, -) +from outrank.algorithms.importance_estimator 
import \ + get_importances_estimate_pairwise +from outrank.algorithms.sketches.counting_counters_ordinary import \ + PrimitiveConstrainedCounter +from outrank.algorithms.sketches.counting_ultiloglog import \ + HyperLogLogWCache as HyperLogLog from outrank.core_utils import BatchRankingSummary from outrank.core_utils import extract_features_from_reference_JSON from outrank.core_utils import generic_line_parser +from outrank.core_utils import get_num_of_instances from outrank.core_utils import internal_hash +from outrank.core_utils import is_prior_heuristic from outrank.core_utils import NominalFeatureSummary from outrank.core_utils import NumericFeatureSummary -from outrank.core_utils import is_prior_heuristic from outrank.feature_transformations.ranking_transformers import FeatureTransformerGeneric from outrank.feature_transformations.ranking_transformers import FeatureTransformerNoise @@ -122,7 +124,7 @@ def mixed_rank_graph( reference_model_features = {} if is_prior_heuristic(args): - reference_model_features = [(" AND ").join(tuple(sorted(item.split(",")))) for item in extract_features_from_reference_JSON(args.reference_model_JSON, all_features=True)] + reference_model_features = [(' AND ').join(tuple(sorted(item.split(',')))) for item in extract_features_from_reference_JSON(args.reference_model_JSON, all_features=True)] combinations = [comb for comb in combinations if comb[0] not in reference_model_features and comb[1] not in reference_model_features] combinations = prior_combinations_sample(combinations, args) @@ -589,21 +591,6 @@ def compute_batch_ranking( ) -def get_num_of_instances(fname: str) -> int: - """Count the number of lines in a file, fast - useful for progress logging""" - - def _make_gen(reader): - while True: - b = reader(2**16) - if not b: - break - yield b - - with open(fname, 'rb') as f: - count = sum(buf.count(b'\n') for buf in _make_gen(f.raw.read)) - return count - - def get_grouped_df(importances_df_list: list[tuple[str, str, float]]) -> 
def is_prior_heuristic(args: Any) -> bool:
    """Tell whether the selected heuristic is a prior-based one.

    Returns True only when ``args.heuristic`` contains the ``-prior`` marker
    and ``args.reference_model_JSON`` holds a truthy value; False otherwise.
    """
    return bool('-prior' in args.heuristic and args.reference_model_JSON)


def get_num_of_instances(fname: str) -> int:
    """Count the number of lines in a file, fast - useful for progress logging

    The file is read as raw bytes in 64 KiB chunks and the newline bytes are
    counted, so a final line that is not newline-terminated is not included
    in the result. An empty file yields 0.
    """
    chunk_size = 2**16

    with open(fname, 'rb') as f:
        # iter(callable, sentinel) keeps reading until the raw reader
        # returns b'' (end of file).
        return sum(
            chunk.count(b'\n')
            for chunk in iter(lambda: f.raw.read(chunk_size), b'')
        )
"""Instance-level ranking task: per-row quality scores and their histograms."""
from __future__ import annotations

import gzip
import os
from collections import Counter
from collections import defaultdict
from typing import Any

import numpy as np
import pandas as pd

# Field-length thresholds (in characters) reported by score_line.
LONG_FIELD_THRESHOLDS = (30, 60, 100, 200, 300)
# Reading stops once this many lines have been scored.
MAX_SCORED_LINES = 100_000


def shannon_ent(string: str) -> float:
    """Return the Shannon entropy (in bits) of the characters of ``string``.

    An empty string has entropy 0.0.
    """
    if not string:
        return 0.0
    # np.sum over a generator is deprecated in NumPy; materialise the counts
    # into an array first and vectorise the computation.
    counts = np.fromiter(Counter(string).values(), dtype=float)
    frequencies = counts / len(string)
    return float(-np.sum(frequencies * np.log2(frequencies)))


def compute_entropy_avg(line: list) -> float:
    """Return the summed Shannon entropy of all fields of a parsed row.

    NOTE(review): despite the name, no averaging takes place - the per-field
    entropies are added up. Kept as is so the reported values do not change.
    """
    return float(sum(shannon_ent(field) for field in line))


def score_line(line):
    """Compute simple quality scores for one parsed row (a list of strings).

    Returns a dict with the proportion of fields that are empty strings,
    ``'{}'``, either of the two, ``'0'``, longer than each threshold in
    ``LONG_FIELD_THRESHOLDS``, plus the row entropy. All proportions are 0.0
    for an empty row.
    """
    # Guard against ZeroDivisionError on an empty row; every count is 0 then.
    denominator = max(len(line), 1)
    empty_strings = line.count('')
    empty_dicts = line.count('{}')

    out_struct = {
        'empty_string_prop': empty_strings / denominator,
        'empty_dict': empty_dicts / denominator,
        'all_empty': (empty_dicts + empty_strings) / denominator,
        'all_zero': line.count('0') / denominator,
    }
    for threshold in LONG_FIELD_THRESHOLDS:
        num_long = sum(len(field) > threshold for field in line)
        out_struct[f'all_more_{threshold}_chars'] = num_long / denominator
    out_struct['row_entropy'] = compute_entropy_avg(line)
    return out_struct


def _open_text_stream(path: str, encoding: str):
    """Open a plain or gzip-compressed (``.gz``) file for text reading."""
    _, file_extension = os.path.splitext(path)
    if file_extension == '.gz':
        return gzip.open(path, 'rt', encoding=encoding)
    return open(path, encoding=encoding)


def _plot_score_distributions(out_scores_lab: dict, output_folder: str) -> None:
    """Write one histogram PDF per (label, score column) into ``output_folder``."""
    if not out_scores_lab:
        return

    # matplotlib is an optional dependency: fail with a clear message here
    # instead of a NameError on the first plotting call.
    try:
        import matplotlib.pyplot as plt
    except ImportError as err:
        raise ImportError(
            'matplotlib is required to plot instance ranking distributions',
        ) from err

    os.makedirs(output_folder, exist_ok=True)
    for label, out_scores in out_scores_lab.items():
        out_df = pd.DataFrame(out_scores)
        for col in out_df.columns:
            is_entropy = 'entropy' in col
            # Only proportions are shown as percentages; entropy stays in
            # its own units so that the axis label matches the values.
            values = out_df[col] if is_entropy else out_df[col] * 100

            fig = plt.figure(figsize=(5, 5), dpi=300)
            plt.title(col + f' label: {label}')
            plt.hist(
                x=values,
                color='black',
                density=True,
                bins=100,
            )
            if is_entropy:
                plt.xlabel('Row entropy')
            else:
                plt.xlabel('Proportion of namespaces (%)')
            plt.ylabel('Density')
            plt.tight_layout()
            fname = f'distPlot{col}_{label}.pdf'
            plt.savefig(os.path.join(output_folder, fname), dpi=300)
            # Release the figure; otherwise one figure per column and label
            # stays alive for the rest of the process.
            plt.close(fig)


def outrank_task_rank_instances(args: Any) -> None:
    """Score the rows of the input data set and plot the score distributions.

    At most ``MAX_SCORED_LINES`` lines of ``dataset_info.data_path`` are
    parsed and scored with ``score_line``; the resulting histograms are
    stored as PDF files in ``args.output_folder``.
    """
    # Imported at call time so that the scoring helpers of this module can be
    # imported without tqdm or the rest of the package being loaded.
    import tqdm

    from outrank.core_utils import generic_line_parser
    from outrank.core_utils import get_dataset_info
    from outrank.core_utils import get_num_of_instances

    data_encoding = 'utf-8'
    delimiter = '\t'
    dataset_info = get_dataset_info(args)
    out_scores_lab = defaultdict(list)

    # Both the progress bar and the input stream are context-managed so that
    # they get closed even when parsing fails half way through.
    with tqdm.tqdm(
        total=get_num_of_instances(dataset_info.data_path) - 1,
        position=0,
        disable=args.disable_tqdm == 'True',
    ) as local_pbar:
        local_pbar.set_description('Starting ranking computation')

        with _open_text_stream(dataset_info.data_path, data_encoding) as file_stream:
            for line_counter, line in enumerate(file_stream, start=1):
                local_pbar.update(1)
                if line_counter > MAX_SCORED_LINES:
                    break

                parsed_line = generic_line_parser(
                    line,
                    delimiter,
                    args,
                    dataset_info.fw_map,
                    dataset_info.column_names,
                )
                # NOTE(review): rows are grouped by the first character of
                # the raw line, not by a parsed field - presumably a
                # single-character label; confirm against the data format.
                out_scores_lab[line[0]].append(score_line(parsed_line))

    _plot_score_distributions(out_scores_lab, args.output_folder)
get_combinations_from_columns from outrank.core_ranking import mixed_rank_graph -from outrank.feature_transformations.feature_transformer_vault import ( - default_transformers, -) -from outrank.feature_transformations.ranking_transformers import ( - FeatureTransformerGeneric, -) +from outrank.feature_transformations.feature_transformer_vault import \ + default_transformers +from outrank.feature_transformations.ranking_transformers import \ + FeatureTransformerGeneric sys.path.append('./outrank')