Coverage alignment heuristic (#76)

* cov alignment heuristic
* cov heu test
* more tests
* tests
* Setup.py
* tests
* overflow
* proper types?
* int32
* moved test
outbrain · Aug 22, 2024 · bed2095 · bed2095
1 parent 92c1e2e
commit bed2095
Show file tree

Hide file tree

Showing 6 changed files with 95 additions and 6 deletions.
diff --git a/outrank/__main__.py b/outrank/__main__.py
@@ -243,7 +243,7 @@ def main():
     args = parser.parse_args()
 
     if args.task == 'selftest':
-        conduct_self_test()
+        conduct_self_test('MI-numba-randomized')
         exit()
 
     if args.data_path is None and args.task != 'data_generator':

diff --git a/outrank/algorithms/feature_ranking/ranking_cov_alignment.py b/outrank/algorithms/feature_ranking/ranking_cov_alignment.py
@@ -0,0 +1,28 @@
+from __future__ import annotations
+
+import numpy as np
+import numpy.typing as npt
+
+# NOTE(review): this seeds NumPy's global RNG for every importer of the module,
+# although nothing in this module draws random numbers — confirm it is needed.
+np.random.seed(123)
+# Number of hash buckets used by max_pair_coverage; distinct pairs may collide.
+max_size = 10**6
+
+
+def max_pair_coverage(
+    array1: npt.NDArray[np.int32],
+    array2: npt.NDArray[np.int32],
+    *,
+    num_buckets: int | None = None,
+) -> float:
+    """Estimate the share of rows taken by the most frequent value pair.
+
+    Each ``(array1[i], array2[i])`` pair is hashed into one of ``num_buckets``
+    counters; the largest counter divided by the number of rows is returned.
+    Distinct pairs can share a bucket, so the result is an upper-bound
+    estimate rather than an exact frequency.
+
+    Args:
+        array1: Integer-encoded values of the first feature.
+        array2: Integer-encoded values of the second feature; only its first
+            ``len(array1)`` entries are used.
+        num_buckets: Size of the hash table. Defaults to the module-level
+            ``max_size``.
+
+    Returns:
+        A value in ``[0, 1]``; ``0.0`` when ``array1`` is empty.
+
+    Raises:
+        IndexError: If ``array2`` is shorter than ``array1``.
+    """
+    if num_buckets is None:
+        num_buckets = max_size
+
+    tot_len = len(array1)
+    if tot_len == 0:
+        # Nothing to count; avoids a 0/0 division.
+        return 0.0
+    if len(array2) < tot_len:
+        raise IndexError('array2 must be at least as long as array1')
+
+    # Work in int64 and reduce both operands modulo the table size first, so
+    # the multiplication below cannot overflow (narrow inputs such as int32
+    # would otherwise wrap around). The result equals the exact-arithmetic
+    # value of (el1 * 1471343 - el2) % num_buckets.
+    first = np.asarray(array1, dtype=np.int64) % num_buckets
+    second = np.asarray(array2[:tot_len], dtype=np.int64) % num_buckets
+    identifiers = (first * 1471343 - second) % num_buckets
+
+    # bincount tallies every bucket in one vectorized pass.
+    return float(np.bincount(identifiers).max() / tot_len)
+
+
+if __name__ == '__main__':
+    # Smoke check: the pair (1, 0) fills 4 of every 8 rows, so the expected
+    # coverage is exactly one half.
+    repeats = 100000
+    first_column = np.array([1, 1, 2, 3, 1, 1, 1, 5] * repeats)
+    second_column = np.array([0, 0, 5, 5, 3, 0, 0, 0] * repeats)
+    assert max_pair_coverage(first_column, second_column) == 0.5
diff --git a/outrank/algorithms/importance_estimator.py b/outrank/algorithms/importance_estimator.py
@@ -18,6 +18,7 @@
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.svm import SVC
 
+from outrank.algorithms.feature_ranking import ranking_cov_alignment
 from outrank.core_utils import is_prior_heuristic
 
 logger = logging.getLogger('syn-logger')
@@ -129,6 +130,8 @@ def get_importances_estimate_pairwise(combination, reference_model_features, arg
         estimate_feature_importance = sklearn_surrogate(
             vector_first, vector_second, X, args.heuristic,
         )
+    elif 'max-value-coverage' in args.heuristic:
+        estimate_feature_importance = ranking_cov_alignment.max_pair_coverage(vector_first, vector_second)
 
     elif 'MI-numba' in args.heuristic:
         estimate_feature_importance = numba_mi(

diff --git a/outrank/task_selftest.py b/outrank/task_selftest.py
@@ -16,13 +16,13 @@
 logger.setLevel(logging.DEBUG)
 
 
-def conduct_self_test():
+def conduct_self_test(heuristic='MI-numba-randomized'):
     # Simulate full flow, ranking only
     subprocess.run(
         'outrank --task data_generator --num_synthetic_rows 100000', shell=True,
     )
     subprocess.run(
-        'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw;',
+        f'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --heuristic {heuristic};',
         shell=True,
     )
 
@@ -39,8 +39,10 @@ def conduct_self_test():
             logger.info(f'Removing {path} as part of cleanup ..')
             shutil.rmtree(path)
 
-    logger.info('All tests passed, OutRank seems in shape \N{winking face}')
+    logger.info(f'All tests passed for heuristic: {heuristic} \N{rocket}')
 
 
 if __name__ == '__main__':
-    conduct_self_test()
+    conduct_self_test('MI-numba-randomized')
+    conduct_self_test('max-value-coverage')
+    logger.info('OutRank seems in shape \N{winking face}')
diff --git a/setup.py b/setup.py
@@ -23,7 +23,7 @@ def _read_description():
 packages = [x for x in setuptools.find_packages() if x != 'test']
 setuptools.setup(
     name='outrank',
-    version='0.96.0',
+    version='0.96.1',
     description='OutRank: Feature ranking for massive sparse data sets.',
     long_description=_read_description(),
     long_description_content_type='text/markdown',

diff --git a/tests/cov_heu_test.py b/tests/cov_heu_test.py
@@ -0,0 +1,56 @@
+from __future__ import annotations
+
+import sys
+import unittest
+
+import numpy as np
+
+from outrank.algorithms.feature_ranking.ranking_cov_alignment import \
+    max_pair_coverage
+
+# Fix NumPy's global RNG so the randomly generated test arrays are reproducible.
+np.random.seed(123)
+# NOTE(review): this runs after the outrank import above, so it cannot
+# influence that import — confirm whether it is still needed.
+sys.path.append('./outrank')
+
+
+class TestMaxPairCoverage(unittest.TestCase):
+    """Exercises max_pair_coverage on hand-made and randomly drawn arrays."""
+
+    def test_basic_functionality(self):
+        # Pairs (1, 4) and (2, 5) each occur twice among five rows.
+        left = np.array([1, 2, 3, 1, 2])
+        right = np.array([4, 5, 6, 4, 5])
+        coverage = max_pair_coverage(left, right)
+        self.assertAlmostEqual(coverage, 2/5, places=5)
+
+    def test_identical_elements(self):
+        # Both columns hold the same constant, so a single pair covers all rows.
+        ones = [1, 1, 1, 1]
+        coverage = max_pair_coverage(np.array(ones), np.array(ones))
+        self.assertEqual(coverage, 1.0)
+
+    def test_large_arrays(self):
+        # Random input: only the valid range of the score is checked.
+        left = np.random.randint(0, 100, size=10000)
+        right = np.random.randint(0, 100, size=10000)
+        coverage = max_pair_coverage(left, right)
+        self.assertTrue(0 <= coverage <= 1)
+
+    def test_all_unique_pairs(self):
+        # Every pair is distinct, so the best pair covers one row out of five.
+        left = np.array([1, 2, 3, 4, 5])
+        right = np.array([6, 7, 8, 9, 10])
+        coverage = max_pair_coverage(left, right)
+        self.assertEqual(coverage, 1/5)
+
+    def test_all_same_pairs(self):
+        # The pair (1, 2) is repeated on every row.
+        left = np.array([1, 1, 1, 1, 1])
+        right = np.array([2, 2, 2, 2, 2])
+        coverage = max_pair_coverage(left, right)
+        self.assertEqual(coverage, 1.0)
+
+    def test_high_collision_potential(self):
+        # A thousand copies of one pair must all land in the same bucket.
+        row_count = 1000
+        left = np.array([1] * row_count)
+        right = np.array([2] * row_count)
+        coverage = max_pair_coverage(left, right)
+        self.assertEqual(coverage, 1.0)
+
+    def test_very_large_arrays(self):
+        # One million random rows: the score must still lie within [0, 1].
+        left = np.random.randint(0, 1000, size=1000000)
+        right = np.random.randint(0, 1000, size=1000000)
+        coverage = max_pair_coverage(left, right)
+        self.assertTrue(0 <= coverage <= 1)