Skip to content

Commit

Permalink
instance ranks
Browse files Browse the repository at this point in the history
  • Loading branch information
SkBlaz committed Jun 14, 2024
1 parent feeb698 commit a28b373
Showing 1 changed file with 48 additions and 20 deletions.
68 changes: 48 additions & 20 deletions outrank/task_instance_ranking.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
import gzip
import logging
import os
from collections import Counter
from collections import defaultdict
from typing import Any

import numpy as np
import pandas as pd
import tqdm

Expand All @@ -18,6 +21,19 @@
pass


def shannon_ent(string):
    """Return the Shannon entropy (in bits) of the symbols in *string*.

    The entropy is computed from the relative frequency of each distinct
    symbol: ``-sum(p * log2(p))``.

    Args:
        string: A sized iterable of hashable symbols (typically a ``str``).

    Returns:
        The entropy as a float. Empty input yields ``0.0``.
    """
    total = len(string)
    if total == 0:
        # No symbols means no uncertainty; also avoids dividing by zero.
        return 0.0

    # Build a real array instead of handing a generator to ``np.sum``:
    # NumPy deprecates generator input there and silently falls back to the
    # builtin ``sum`` while emitting a DeprecationWarning.
    counts = np.fromiter(Counter(string).values(), dtype=float)
    frequencies = counts / total
    # ``+ 0.0`` normalizes the ``-0.0`` produced for single-symbol input.
    return float(-np.sum(frequencies * np.log2(frequencies))) + 0.0


def compute_entropy_avg(line):
    """Return the total Shannon entropy accumulated over the fields of *line*.

    Each element of ``line`` is scored with ``shannon_ent`` and the scores
    are added together. Note that, despite the function name, the result is
    the sum of the per-field entropies and is not divided by the number of
    fields. An empty ``line`` yields ``0``.
    """
    return sum(shannon_ent(field) for field in line)


def score_line(line):
nan_prop = line.count('') / len(line)
out_struct = {}
Expand All @@ -27,8 +43,8 @@ def score_line(line):
out_struct['all_zero'] = line.count('0') / len(line)
for j in [30, 60, 100, 200, 300]:
out_struct[f'all_more_{j}_chars'] = len(
[x for x in line if len(x) > j],
) / len(line)
[x for x in line if len(x) > j], ) / len(line)
out_struct['row_entropy'] = compute_entropy_avg(line)
return out_struct


Expand Down Expand Up @@ -56,7 +72,7 @@ def outrank_task_rank_instances(args: Any) -> None:
else:
file_stream = open(dataset_info.data_path, encoding=data_encoding)
line_counter = 0
out_scores = []
out_scores_lab = defaultdict(list)

for line in file_stream:
line_counter += 1
Expand All @@ -69,20 +85,32 @@ def outrank_task_rank_instances(args: Any) -> None:
dataset_info.fw_map,
dataset_info.column_names,
)
out_scores.append(score_line(parsed_line))

out_df = pd.DataFrame(out_scores)
os.makedirs(args.output_folder, exist_ok=True)
for col in out_df.columns:
sorted_vals = out_df[col].sort_values()
enx = list(range(out_df.shape[0]))
plt.figure(figsize=(5, 5), dpi=300)
plt.title(col)
plt.hist(x=sorted_vals * 100, color='black', density=True, bins=100)
plt.xlabel('Missing namespace values (%)')
plt.ylabel('Density')
plt.tight_layout()
fname = f'distPlot{col}.pdf'
plt.savefig(os.path.join(args.output_folder, fname), dpi=300)
plt.cla()
plt.clf()

if line_counter > 100_000:
break
out_scores_lab[line[0]].append(score_line(parsed_line))

for label, out_scores in out_scores_lab.items():
out_df = pd.DataFrame(out_scores)
os.makedirs(args.output_folder, exist_ok=True)
for col in out_df.columns:
sorted_vals = out_df[col].sort_values()
enx = list(range(out_df.shape[0]))
plt.figure(figsize=(5, 5), dpi=300)
plt.title(col + f' label: {label}')
plt.hist(
x=sorted_vals * 100,
color='black',
density=True,
bins=100,
)
if not 'entropy' in col:
plt.xlabel('Proportion of namespaces (%)')
else:
plt.xlabel('Row entropy')
plt.ylabel('Density')
plt.tight_layout()
fname = f'distPlot{col}_{label}.pdf'
plt.savefig(os.path.join(args.output_folder, fname), dpi=300)
plt.cla()
plt.clf()

0 comments on commit a28b373

Please sign in to comment.