instance ranks

outbrain · Jun 14, 2024 · a28b373 · a28b373
1 parent feeb698
commit a28b373
Showing 1 changed file with 48 additions and 20 deletions.
diff --git a/outrank/task_instance_ranking.py b/outrank/task_instance_ranking.py
@@ -3,8 +3,11 @@
 import gzip
 import logging
 import os
+from collections import Counter
+from collections import defaultdict
 from typing import Any
 
+import numpy as np
 import pandas as pd
 import tqdm
 
@@ -18,6 +21,19 @@
     pass
 
 
+def shannon_ent(string):
+    counts = Counter(string)
+    frequencies = ((i / len(string)) for i in counts.values())
+    return -np.sum(f * np.log2(f) for f in frequencies)
+
+
+def compute_entropy_avg(line):
+    joint_ent = 0
+    for field in line:
+        joint_ent += shannon_ent(field)
+    return joint_ent
+
+
 def score_line(line):
     nan_prop = line.count('') / len(line)
     out_struct = {}
@@ -27,8 +43,8 @@ def score_line(line):
     out_struct['all_zero'] = line.count('0') / len(line)
     for j in [30, 60, 100, 200, 300]:
         out_struct[f'all_more_{j}_chars'] = len(
-            [x for x in line if len(x) > j],
-        ) / len(line)
+            [x for x in line if len(x) > j], ) / len(line)
+    out_struct['row_entropy'] = compute_entropy_avg(line)
     return out_struct
 
 
@@ -56,7 +72,7 @@ def outrank_task_rank_instances(args: Any) -> None:
     else:
         file_stream = open(dataset_info.data_path, encoding=data_encoding)
     line_counter = 0
-    out_scores = []
+    out_scores_lab = defaultdict(list)
 
     for line in file_stream:
         line_counter += 1
@@ -69,20 +85,32 @@ def outrank_task_rank_instances(args: Any) -> None:
             dataset_info.fw_map,
             dataset_info.column_names,
         )
-        out_scores.append(score_line(parsed_line))
-
-    out_df = pd.DataFrame(out_scores)
-    os.makedirs(args.output_folder, exist_ok=True)
-    for col in out_df.columns:
-        sorted_vals = out_df[col].sort_values()
-        enx = list(range(out_df.shape[0]))
-        plt.figure(figsize=(5, 5), dpi=300)
-        plt.title(col)
-        plt.hist(x=sorted_vals * 100, color='black', density=True, bins=100)
-        plt.xlabel('Missing namespace values (%)')
-        plt.ylabel('Density')
-        plt.tight_layout()
-        fname = f'distPlot{col}.pdf'
-        plt.savefig(os.path.join(args.output_folder, fname), dpi=300)
-        plt.cla()
-        plt.clf()
+
+        if line_counter > 100_000:
+            break
+        out_scores_lab[line[0]].append(score_line(parsed_line))
+
+    for label, out_scores in out_scores_lab.items():
+        out_df = pd.DataFrame(out_scores)
+        os.makedirs(args.output_folder, exist_ok=True)
+        for col in out_df.columns:
+            sorted_vals = out_df[col].sort_values()
+            enx = list(range(out_df.shape[0]))
+            plt.figure(figsize=(5, 5), dpi=300)
+            plt.title(col + f' label: {label}')
+            plt.hist(
+                x=sorted_vals * 100,
+                color='black',
+                density=True,
+                bins=100,
+            )
+            if not 'entropy' in col:
+                plt.xlabel('Proportion of namespaces (%)')
+            else:
+                plt.xlabel('Row entropy')
+            plt.ylabel('Density')
+            plt.tight_layout()
+            fname = f'distPlot{col}_{label}.pdf'
+            plt.savefig(os.path.join(args.output_folder, fname), dpi=300)
+            plt.cla()
+            plt.clf()