pairwise tests

outbrain · Oct 15, 2023 · fc76b4a · fc76b4a
1 parent dd83868
commit fc76b4a
Show file tree

Hide file tree

Showing 2 changed files with 22 additions and 1 deletion.
diff --git a/benchmarks/generator_naive.py b/benchmarks/generator_naive.py
@@ -64,6 +64,10 @@ def generate_random_matrix(num_features, size=2000000):
             os.path.join(args.verify_outputs, 'feature_singles.tsv'), sep='\t',
         )
 
+        rankings_pairwise = pd.read_csv(
+            os.path.join(args.verify_outputs, 'pairwise_ranks.tsv'), sep='\t',
+        )
+
         # Partial match test
         if rankings.iloc[2]['Feature'] != 'f31-(90; 100)' and rankings.iloc[2]['Score MI-numba-randomized'] > 0.9:
             raise Exception(
@@ -83,3 +87,17 @@ def generate_random_matrix(num_features, size=2000000):
             logger.info(
                 f'Identified the appropriate feature in the haystack ({rankings.iloc[1].Feature})',
             )
+
+
+        # Tests related to pairwise rankings; ascending sort puts the top-scoring pair in the last row
+        sorted_by_scores = rankings_pairwise.sort_values(by=['Score', 'FeatureA'])
+
+        if len(sorted_by_scores) < 10000:
+            raise Exception('Number of pairwise comparisons insufficient!')
+        else:
+            logger.info('Found enough pairwise comparisons ..')
+
+        if sorted_by_scores.iloc[-1]['FeatureA'] == 'f45-(90; 100)' and sorted_by_scores.iloc[-1]['FeatureB'] == 'f45-(90; 100)' and sorted_by_scores.iloc[-1]['Score'] > 1.0:
+            logger.info('Similarity check passed for f45 ..')
+        else:
+            raise Exception('Most similar features not identified ..')
diff --git a/scripts/run_benchmarks.sh b/scripts/run_benchmarks.sh
@@ -14,11 +14,14 @@ then
     python generator_naive.py --output_df_name dataset_naive --num_features 100 --size 10000;
 
     # Substantial subsampling must retrieve the needle.
-    outrank --data_path dataset_naive --data_source csv-raw --subsampling 1 --task all --heuristic MI-numba-randomized --target_ranking_only True --interaction_order 1 --output_folder ./ranking_outputs --minibatch_size 20000;
+    outrank --data_path dataset_naive --data_source csv-raw --subsampling 1 --task all --heuristic MI-numba-randomized --target_ranking_only False --interaction_order 1 --output_folder ./ranking_outputs --minibatch_size 20000;
 
     python generator_naive.py --verify_outputs ranking_outputs;
 
     rm -r ranking_outputs dataset_naive;
+
+    python generator_naive.py --output_df_name dataset_naive --num_features 100 --size 10000;
+
     exit
 fi
 ###################################################################