From a28b373fa262a537cbf84f5302755c92d36ac2d2 Mon Sep 17 00:00:00 2001 From: bskrlj Date: Fri, 14 Jun 2024 14:11:37 +0200 Subject: [PATCH] instance ranks --- outrank/task_instance_ranking.py | 68 ++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 20 deletions(-) diff --git a/outrank/task_instance_ranking.py b/outrank/task_instance_ranking.py index f118e93..37274b9 100644 --- a/outrank/task_instance_ranking.py +++ b/outrank/task_instance_ranking.py @@ -3,8 +3,11 @@ import gzip import logging import os +from collections import Counter +from collections import defaultdict from typing import Any +import numpy as np import pandas as pd import tqdm @@ -18,6 +21,19 @@ pass +def shannon_ent(string): + counts = Counter(string) + frequencies = ((i / len(string)) for i in counts.values()) + return -np.sum(f * np.log2(f) for f in frequencies) + + +def compute_entropy_avg(line): + joint_ent = 0 + for field in line: + joint_ent += shannon_ent(field) + return joint_ent + + def score_line(line): nan_prop = line.count('') / len(line) out_struct = {} @@ -27,8 +43,8 @@ def score_line(line): out_struct['all_zero'] = line.count('0') / len(line) for j in [30, 60, 100, 200, 300]: out_struct[f'all_more_{j}_chars'] = len( - [x for x in line if len(x) > j], - ) / len(line) + [x for x in line if len(x) > j], ) / len(line) + out_struct['row_entropy'] = compute_entropy_avg(line) return out_struct @@ -56,7 +72,7 @@ def outrank_task_rank_instances(args: Any) -> None: else: file_stream = open(dataset_info.data_path, encoding=data_encoding) line_counter = 0 - out_scores = [] + out_scores_lab = defaultdict(list) for line in file_stream: line_counter += 1 @@ -69,20 +85,32 @@ def outrank_task_rank_instances(args: Any) -> None: dataset_info.fw_map, dataset_info.column_names, ) - out_scores.append(score_line(parsed_line)) - - out_df = pd.DataFrame(out_scores) - os.makedirs(args.output_folder, exist_ok=True) - for col in out_df.columns: - sorted_vals = out_df[col].sort_values() - enx = list(range(out_df.shape[0])) - plt.figure(figsize=(5, 5), dpi=300) - plt.title(col) - plt.hist(x=sorted_vals * 100, color='black', density=True, bins=100) - plt.xlabel('Missing namespace values (%)') - plt.ylabel('Density') - plt.tight_layout() - fname = f'distPlot{col}.pdf' - plt.savefig(os.path.join(args.output_folder, fname), dpi=300) - plt.cla() - plt.clf() + + if line_counter > 100_000: + break + out_scores_lab[line[0]].append(score_line(parsed_line)) + + for label, out_scores in out_scores_lab.items(): + out_df = pd.DataFrame(out_scores) + os.makedirs(args.output_folder, exist_ok=True) + for col in out_df.columns: + sorted_vals = out_df[col].sort_values() + enx = list(range(out_df.shape[0])) + plt.figure(figsize=(5, 5), dpi=300) + plt.title(col + f' label: {label}') + plt.hist( + x=sorted_vals * 100, + color='black', + density=True, + bins=100, + ) + if not 'entropy' in col: + plt.xlabel('Proportion of namespaces (%)') + else: + plt.xlabel('Row entropy') + plt.ylabel('Density') + plt.tight_layout() + fname = f'distPlot{col}_{label}.pdf' + plt.savefig(os.path.join(args.output_folder, fname), dpi=300) + plt.cla() + plt.clf()