Skip to content

Commit

Permalink
Instance profiling task added (#69)
Browse files Browse the repository at this point in the history
* poc instance rank

* instance ranks

* imports, more systematically

* Some modifications

* Fname
  • Loading branch information
SkBlaz committed Jun 18, 2024
1 parent d6dc5d3 commit b5c9ee4
Show file tree
Hide file tree
Showing 15 changed files with 172 additions and 58 deletions.
12 changes: 6 additions & 6 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ repos:
rev: 6.1.0
hooks:
- id: flake8
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.5.1
hooks:
- id: mypy
additional_dependencies: [types-all]
exclude: ^testing/resources/
# - repo: https://github.com/pre-commit/mirrors-mypy
# rev: v1.5.1
# hooks:
# - id: mypy
# additional_dependencies: [types-all]
# exclude: ^testing/resources/
1 change: 1 addition & 0 deletions benchmarks/analyse_rankings.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import matplotlib.pyplot as plt


def extract_just_ranking(dfile):
"""Extract ranking from an output file."""
ranks = []
Expand Down
5 changes: 4 additions & 1 deletion outrank/__main__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from __future__ import annotations

import argparse
import json
import logging

from outrank.task_generators import outrank_task_generate_data_set
from outrank.task_instance_ranking import outrank_task_rank_instances
from outrank.task_ranking import outrank_task_conduct_ranking
from outrank.task_selftest import conduct_self_test
from outrank.task_summary import outrank_task_result_summary
Expand Down Expand Up @@ -276,6 +276,9 @@ def main():
elif task == 'data_generator':
outrank_task_generate_data_set(args)

elif task == 'instance_ranking':
outrank_task_rank_instances(args)

else:
logging.info(f'Warning, the selected task: {task} does not exist.')

Expand Down
14 changes: 7 additions & 7 deletions outrank/algorithms/importance_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,15 @@
import pandas as pd
from scipy.stats import pearsonr
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC

from outrank.core_utils import is_prior_heuristic


logger = logging.getLogger('syn-logger')
logger.setLevel(logging.DEBUG)

Expand All @@ -43,11 +43,11 @@ def sklearn_MI(vector_first: Any, vector_second: Any) -> float:


def sklearn_surrogate(
vector_first: Any, vector_second: Any, X: Any, surrogate_model: str
vector_first: Any, vector_second: Any, X: Any, surrogate_model: str,
) -> float:

clf = initialize_classifier(surrogate_model)

transf = OneHotEncoder()

# They do not commute, swap if needed
Expand All @@ -67,7 +67,7 @@ def sklearn_surrogate(
clf, X, vector_second, scoring='neg_log_loss', cv=num_folds,
)
estimate_feature_importance = 1 + \
np.median(estimate_feature_importance_list)
np.median(estimate_feature_importance_list)

return estimate_feature_importance

Expand Down Expand Up @@ -127,7 +127,7 @@ def get_importances_estimate_pairwise(combination, reference_model_features, arg
X = tmp_df[reference_model_features].values

estimate_feature_importance = sklearn_surrogate(
vector_first, vector_second, X, args.heuristic
vector_first, vector_second, X, args.heuristic,
)

elif 'MI-numba' in args.heuristic:
Expand Down
31 changes: 9 additions & 22 deletions outrank/core_ranking.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,20 @@
import pandas as pd
import tqdm

from outrank.algorithms.importance_estimator import get_importances_estimate_pairwise
from outrank.algorithms.sketches.counting_counters_ordinary import PrimitiveConstrainedCounter
from outrank.algorithms.sketches.counting_ultiloglog import (
HyperLogLogWCache as HyperLogLog,
)
from outrank.algorithms.importance_estimator import \
get_importances_estimate_pairwise
from outrank.algorithms.sketches.counting_counters_ordinary import \
PrimitiveConstrainedCounter
from outrank.algorithms.sketches.counting_ultiloglog import \
HyperLogLogWCache as HyperLogLog
from outrank.core_utils import BatchRankingSummary
from outrank.core_utils import extract_features_from_reference_JSON
from outrank.core_utils import generic_line_parser
from outrank.core_utils import get_num_of_instances
from outrank.core_utils import internal_hash
from outrank.core_utils import is_prior_heuristic
from outrank.core_utils import NominalFeatureSummary
from outrank.core_utils import NumericFeatureSummary
from outrank.core_utils import is_prior_heuristic
from outrank.feature_transformations.ranking_transformers import FeatureTransformerGeneric
from outrank.feature_transformations.ranking_transformers import FeatureTransformerNoise

Expand Down Expand Up @@ -122,7 +124,7 @@ def mixed_rank_graph(

reference_model_features = {}
if is_prior_heuristic(args):
reference_model_features = [(" AND ").join(tuple(sorted(item.split(",")))) for item in extract_features_from_reference_JSON(args.reference_model_JSON, all_features=True)]
reference_model_features = [(' AND ').join(tuple(sorted(item.split(',')))) for item in extract_features_from_reference_JSON(args.reference_model_JSON, all_features=True)]
combinations = [comb for comb in combinations if comb[0] not in reference_model_features and comb[1] not in reference_model_features]

combinations = prior_combinations_sample(combinations, args)
Expand Down Expand Up @@ -589,21 +591,6 @@ def compute_batch_ranking(
)


def get_num_of_instances(fname: str) -> int:
"""Count the number of lines in a file, fast - useful for progress logging"""

def _make_gen(reader):
while True:
b = reader(2**16)
if not b:
break
yield b

with open(fname, 'rb') as f:
count = sum(buf.count(b'\n') for buf in _make_gen(f.raw.read))
return count


def get_grouped_df(importances_df_list: list[tuple[str, str, float]]) -> pd.DataFrame:
"""A helper method that enables median-based aggregation after processing"""

Expand Down
16 changes: 15 additions & 1 deletion outrank/core_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -647,7 +647,21 @@ def summarize_rare_counts(


def is_prior_heuristic(args: Any) -> bool:
if "-prior" in args.heuristic and args.reference_model_JSON:
if '-prior' in args.heuristic and args.reference_model_JSON:
return True
return False


def get_num_of_instances(fname: str) -> int:
"""Count the number of lines in a file, fast - useful for progress logging"""

def _make_gen(reader):
while True:
b = reader(2**16)
if not b:
break
yield b

with open(fname, 'rb') as f:
count = sum(buf.count(b'\n') for buf in _make_gen(f.raw.read))
return count
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@

from outrank.feature_transformations.feature_transformer_vault.default_transformers import DEFAULT_TRANSFORMERS
from outrank.feature_transformations.feature_transformer_vault.default_transformers import MINIMAL_TRANSFORMERS
from outrank.feature_transformations.feature_transformer_vault.fw_transformers import (
FW_TRANSFORMERS,
)
from outrank.feature_transformations.feature_transformer_vault.fw_transformers import \
FW_TRANSFORMERS

_tr_global_namespace = {
'default': DEFAULT_TRANSFORMERS,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Some boilerplate transformations people tend to use
from __future__ import annotations

MINIMAL_TRANSFORMERS = {
'_tr_sqrt': 'np.sqrt(X)',
'_tr_log(x+1)': 'np.log(X + 1)',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@

import numpy as np

from outrank.feature_transformations.feature_transformer_vault.default_transformers import (
DEFAULT_TRANSFORMERS,
)
from outrank.feature_transformations.feature_transformer_vault.default_transformers import \
DEFAULT_TRANSFORMERS

FW_TRANSFORMERS = DEFAULT_TRANSFORMERS.copy()
resolution_range = [1, 10, 50, 100]
Expand Down
114 changes: 114 additions & 0 deletions outrank/task_instance_ranking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
from __future__ import annotations

import gzip
import os
from collections import Counter
from collections import defaultdict
from typing import Any

import numpy as np
import pandas as pd
import tqdm

from outrank.core_utils import generic_line_parser
from outrank.core_utils import get_dataset_info
from outrank.core_utils import get_num_of_instances

try:
import matplotlib.pyplot as plt
except:
pass


def shannon_ent(string: str) -> float:
counts = Counter(string)
frequencies = ((i / len(string)) for i in counts.values())
return -np.sum(f * np.log2(f) for f in frequencies)


def compute_entropy_avg(line: list) -> float:
joint_ent = 0
for field in line:
joint_ent += shannon_ent(field)
return joint_ent


def score_line(line):
nan_prop = line.count('') / len(line)
out_struct = {}
out_struct['empty_string_prop'] = nan_prop
out_struct['empty_dict'] = line.count('{}') / len(line)
out_struct['all_empty'] = (line.count('{}') + line.count('')) / len(line)
out_struct['all_zero'] = line.count('0') / len(line)
for j in [30, 60, 100, 200, 300]:
out_struct[f'all_more_{j}_chars'] = len(
[x for x in line if len(x) > j], ) / len(line)
out_struct['row_entropy'] = compute_entropy_avg(line)
return out_struct


def outrank_task_rank_instances(args: Any) -> None:

data_encoding = 'utf-8'
delimiter = '\t'
dataset_info = get_dataset_info(args)
local_pbar = tqdm.tqdm(
total=get_num_of_instances(dataset_info.data_path) - 1,
position=0,
disable=args.disable_tqdm == 'True',
)
local_pbar.set_description('Starting ranking computation')

_, file_extension = os.path.splitext(dataset_info.data_path)

if file_extension == '.gz':
file_stream = gzip.open(
dataset_info.data_path,
'rt',
encoding=data_encoding,
)

else:
file_stream = open(dataset_info.data_path, encoding=data_encoding)
line_counter = 0
out_scores_lab = defaultdict(list)

for line in file_stream:
line_counter += 1
local_pbar.update(1)

parsed_line = generic_line_parser(
line,
delimiter,
args,
dataset_info.fw_map,
dataset_info.column_names,
)

if line_counter > 100_000:
break
out_scores_lab[line[0]].append(score_line(parsed_line))

for label, out_scores in out_scores_lab.items():
out_df = pd.DataFrame(out_scores)
os.makedirs(args.output_folder, exist_ok=True)
for col in out_df.columns:
sorted_vals = out_df[col].sort_values()
plt.figure(figsize=(5, 5), dpi=300)
plt.title(col + f' label: {label}')
plt.hist(
x=sorted_vals * 100,
color='black',
density=True,
bins=100,
)
if 'entropy' not in col:
plt.xlabel('Proportion of namespaces (%)')
else:
plt.xlabel('Row entropy')
plt.ylabel('Density')
plt.tight_layout()
fname = f'distPlot{col}_{label}.pdf'
plt.savefig(os.path.join(args.output_folder, fname), dpi=300)
plt.cla()
plt.clf()
1 change: 1 addition & 0 deletions outrank/task_selftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# helper set of methods that enable anywhere verification of core functions
from __future__ import annotations

import logging
import os
import shutil
Expand Down
5 changes: 2 additions & 3 deletions tests/fw_transformers_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@

import numpy as np

from outrank.feature_transformations.feature_transformer_vault.fw_transformers import (
FW_TRANSFORMERS,
)
from outrank.feature_transformations.feature_transformer_vault.fw_transformers import \
FW_TRANSFORMERS

sys.path.append('./outrank')

Expand Down
5 changes: 2 additions & 3 deletions tests/hll_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,8 @@
import sys
import unittest

from outrank.algorithms.sketches.counting_ultiloglog import (
HyperLogLogWCache as HyperLogLog,
)
from outrank.algorithms.sketches.counting_ultiloglog import \
HyperLogLogWCache as HyperLogLog

sys.path.append('./outrank')

Expand Down
5 changes: 2 additions & 3 deletions tests/mi_numba_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@

import numpy as np

from outrank.algorithms.feature_ranking.ranking_mi_numba import (
mutual_info_estimator_numba,
)
from outrank.algorithms.feature_ranking.ranking_mi_numba import \
mutual_info_estimator_numba

np.random.seed(123)
sys.path.append('./outrank')
Expand Down
10 changes: 4 additions & 6 deletions tests/ranking_module_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,10 @@
from outrank.core_ranking import compute_combined_features
from outrank.core_ranking import get_combinations_from_columns
from outrank.core_ranking import mixed_rank_graph
from outrank.feature_transformations.feature_transformer_vault import (
default_transformers,
)
from outrank.feature_transformations.ranking_transformers import (
FeatureTransformerGeneric,
)
from outrank.feature_transformations.feature_transformer_vault import \
default_transformers
from outrank.feature_transformations.ranking_transformers import \
FeatureTransformerGeneric

sys.path.append('./outrank')

Expand Down

0 comments on commit b5c9ee4

Please sign in to comment.