Skip to content

Commit

Permalink
Coverage alignment heuristic (#76)
Browse files Browse the repository at this point in the history
* cov alignment heuristic

* cov heu test

* more tests

* tests

* Setup.py

* tests

* overflow

* proper types?

* int32

* moved test
  • Loading branch information
SkBlaz committed Aug 22, 2024
1 parent 92c1e2e commit bed2095
Show file tree
Hide file tree
Showing 6 changed files with 95 additions and 6 deletions.
2 changes: 1 addition & 1 deletion outrank/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ def main():
args = parser.parse_args()

if args.task == 'selftest':
conduct_self_test()
conduct_self_test('MI-numba-randomized')
exit()

if args.data_path is None and args.task != 'data_generator':
Expand Down
28 changes: 28 additions & 0 deletions outrank/algorithms/feature_ranking/ranking_cov_alignment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from __future__ import annotations

import numpy as np
import numpy.typing as npt

# Seeds NumPy's global RNG at import time; nothing in this block draws
# random numbers itself.
np.random.seed(123)
# Number of hash buckets used when counting (array1[i], array2[i]) pairs.
max_size = 10**6


def max_pair_coverage(array1: npt.NDArray[np.int32], array2: npt.NDArray[np.int32]) -> float:
    """Return the share of positions occupied by the most frequent value pair.

    Position ``i`` contributes the pair ``(array1[i], array2[i])``. Each pair
    is hashed into one of ``max_size`` buckets and the fullest bucket is
    divided by the number of positions. Distinct pairs that hash to the same
    bucket are counted together, so the result is an upper estimate.

    Args:
        array1: Integer values; its length defines the number of positions.
        array2: Integer values paired element-wise with ``array1``. Elements
            beyond ``len(array1)`` are ignored.

    Returns:
        A value in ``[0, 1]``; ``0.0`` when ``array1`` is empty.

    Raises:
        IndexError: If ``array2`` is shorter than ``array1``.
    """
    tot_len = len(array1)
    if tot_len == 0:
        # Nothing to count; avoids dividing zero by zero.
        return 0.0

    # Widen to int64 before hashing: multiplying int32 values by 1471343
    # can exceed the int32 range.
    first = np.asarray(array1, dtype=np.int64)
    second = np.asarray(array2, dtype=np.int64)
    if len(second) < tot_len:
        raise IndexError(
            f'array2 has {len(second)} elements, expected at least {tot_len}',
        )
    second = second[:tot_len]

    # Modulo with a positive divisor is non-negative, as np.bincount requires.
    identifiers = (first * 1471343 - second) % max_size
    counts = np.bincount(identifiers)

    return float(counts.max() / tot_len)


if __name__ == '__main__':
    # Smoke check: the pair (1, 0) fills 4 of every 8 positions in the
    # repeated pattern, so the expected coverage is exactly one half.
    repeats = 100000
    first_column = np.array([1, 1, 2, 3, 1, 1, 1, 5] * repeats)
    second_column = np.array([0, 0, 5, 5, 3, 0, 0, 0] * repeats)
    assert max_pair_coverage(first_column, second_column) == 0.5
3 changes: 3 additions & 0 deletions outrank/algorithms/importance_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC

from outrank.algorithms.feature_ranking import ranking_cov_alignment
from outrank.core_utils import is_prior_heuristic

logger = logging.getLogger('syn-logger')
Expand Down Expand Up @@ -129,6 +130,8 @@ def get_importances_estimate_pairwise(combination, reference_model_features, arg
estimate_feature_importance = sklearn_surrogate(
vector_first, vector_second, X, args.heuristic,
)
elif 'max-value-coverage' in args.heuristic:
estimate_feature_importance = ranking_cov_alignment.max_pair_coverage(vector_first, vector_second)

elif 'MI-numba' in args.heuristic:
estimate_feature_importance = numba_mi(
Expand Down
10 changes: 6 additions & 4 deletions outrank/task_selftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@
logger.setLevel(logging.DEBUG)


def conduct_self_test():
def conduct_self_test(heuristic='MI-numba-randomized'):
# Simulate full flow, ranking only
subprocess.run(
'outrank --task data_generator --num_synthetic_rows 100000', shell=True,
)
subprocess.run(
'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw;',
f'outrank --task ranking --data_path test_data_synthetic --data_source csv-raw --heuristic {heuristic};',
shell=True,
)

Expand All @@ -39,8 +39,10 @@ def conduct_self_test():
logger.info(f'Removing {path} as part of cleanup ..')
shutil.rmtree(path)

logger.info('All tests passed, OutRank seems in shape \N{winking face}')
logger.info(f'All tests passed for heuristic: {heuristic} \N{rocket}')


if __name__ == '__main__':
conduct_self_test()
conduct_self_test('MI-numba-randomized')
conduct_self_test('max-value-coverage')
logger.info('OutRank seems in shape \N{winking face}')
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def _read_description():
packages = [x for x in setuptools.find_packages() if x != 'test']
setuptools.setup(
name='outrank',
version='0.96.0',
version='0.96.1',
description='OutRank: Feature ranking for massive sparse data sets.',
long_description=_read_description(),
long_description_content_type='text/markdown',
Expand Down
56 changes: 56 additions & 0 deletions tests/cov_heu_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from __future__ import annotations

import sys
import unittest

import numpy as np

from outrank.algorithms.feature_ranking.ranking_cov_alignment import \
max_pair_coverage

# Seed NumPy's global RNG so the randomly generated test arrays are reproducible.
np.random.seed(123)
# NOTE(review): this runs after `outrank` has already been imported above, so it
# does not appear to affect that import — confirm whether it is still needed.
sys.path.append('./outrank')


class TestMaxPairCoverage(unittest.TestCase):
    # Exercises max_pair_coverage on small hand-checked inputs and on large
    # random inputs where only the [0, 1] range is asserted.

    def test_basic_functionality(self):
        # Pairs (1, 4) and (2, 5) each appear twice in five positions.
        left = np.array([1, 2, 3, 1, 2])
        right = np.array([4, 5, 6, 4, 5])
        coverage = max_pair_coverage(left, right)
        self.assertAlmostEqual(coverage, 2/5, places=5)

    def test_identical_elements(self):
        # Every position holds the same pair (1, 1).
        left = np.array([1, 1, 1, 1])
        right = np.array([1, 1, 1, 1])
        coverage = max_pair_coverage(left, right)
        self.assertEqual(coverage, 1.0)

    def test_large_arrays(self):
        # Random input: only the valid range of the result is checked.
        left = np.random.randint(0, 100, size=10000)
        right = np.random.randint(0, 100, size=10000)
        coverage = max_pair_coverage(left, right)
        within_unit_interval = 0 <= coverage <= 1
        self.assertTrue(within_unit_interval)

    def test_all_unique_pairs(self):
        # Five distinct pairs, so the best pair covers one position of five.
        left = np.array([1, 2, 3, 4, 5])
        right = np.array([6, 7, 8, 9, 10])
        coverage = max_pair_coverage(left, right)
        self.assertEqual(coverage, 1/5)

    def test_all_same_pairs(self):
        # Every position holds the same pair (1, 2).
        left = np.array([1, 1, 1, 1, 1])
        right = np.array([2, 2, 2, 2, 2])
        coverage = max_pair_coverage(left, right)
        self.assertEqual(coverage, 1.0)

    def test_high_collision_potential(self):
        # A single pair repeated a thousand times.
        left = np.array([1] * 1000)
        right = np.array([2] * 1000)
        coverage = max_pair_coverage(left, right)
        self.assertEqual(coverage, 1.0)

    def test_very_large_arrays(self):
        # One million random positions: only the valid range is checked.
        left = np.random.randint(0, 1000, size=1000000)
        right = np.random.randint(0, 1000, size=1000000)
        coverage = max_pair_coverage(left, right)
        within_unit_interval = 0 <= coverage <= 1
        self.assertTrue(within_unit_interval)

0 comments on commit bed2095

Please sign in to comment.