From f0287092a53c4a27f0e6fe3940768bd25a8835fc Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Apr 2022 11:57:47 -0600 Subject: [PATCH 001/416] Add naive implementation of stump_topk --- tests/naive.py | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/tests/naive.py b/tests/naive.py index 4089e603e..8f3a05da8 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1716,3 +1716,74 @@ def _total_diagonal_ndists(tile_lower_diag, tile_upper_diag, tile_height, tile_w ) return total_ndists + + +def stump_topk(T_A, m, T_B=None, exclusion_zone=None, k=1): + """ + Traverse distance matrix along the diagonals and update the top-k + nearest neigbors matrix profile and matrix profile indices + """ + if T_B is None: # self-join: + ignore_trivial = True + distance_matrix = np.array( + [distance_profile(Q, T_A, m) for Q in core.rolling_window(T_A, m)] + ) + T_B = T_A.copy() + else: + ignore_trivial = False + distance_matrix = np.array( + [distance_profile(Q, T_B, m) for Q in core.rolling_window(T_A, m)] + ) + + distance_matrix[np.isnan(distance_matrix)] = np.inf + + n_A = T_A.shape[0] + n_B = T_B.shape[0] + l = n_A - m + 1 + if exclusion_zone is None: + exclusion_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) + + if ignore_trivial: + diags = np.arange(exclusion_zone + 1, n_A - m + 1) + else: + diags = np.arange(-(n_A - m + 1) + 1, n_B - m + 1) + + # the last two columns in P and I are to keep track of right and left mp for 1NN + P = np.full((l, k + 2), np.inf) + I = np.full((l, k + 2), -1, dtype=np.int64) + + for g in diags: + if g >= 0: + iter_range = range(0, min(n_A - m + 1, n_B - m + 1 - g)) + else: + iter_range = range(-k, min(n_A - m + 1, n_B - m + 1 - g)) + + for i in iter_range: + D = distance_matrix[i, i + g] + if D < P[i, k - 1]: + idx = np.searchsorted(P[i, :k], D, side='right') + P[i, :k] = np.insert(P[i, :k], idx, D)[:-1] + I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] + + if ignore_trivial: # Self-joins only + if D < P[i + g, k - 1]: + idx = np.searchsorted(P[i + g, :k], D, side='right') + P[i + g, :k] = np.insert(P[i + g, :k], idx, D)[:-1] + I[i + g, :k] = np.insert(I[i + g, :k], idx, i)[:-1] + + if i < i + g: + # Left matrix profile and left matrix profile index + if D < P[i + g, k]: + P[i + g, k] = D + I[i + g, k] = i + + if D < P[i, k + 1]: + # right matrix profile and right matrix profile index + P[i, k + 1] = D + I[i, k + 1] = i + g + + result = np.empty((l, 2 * k + 2), dtype=object) + result[:, :k] = P[:, :k] + result[:, k:] = I[:, :] + + return result From e893873fc763a944b3d7e414d23e116762ee6693 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Apr 2022 12:10:32 -0600 Subject: [PATCH 002/416] Copy test_stump code to test_stump_topk --- tests/test_stump_topk.py | 242 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 242 insertions(+) create mode 100644 tests/test_stump_topk.py diff --git a/tests/test_stump_topk.py b/tests/test_stump_topk.py new file mode 100644 index 000000000..d3475122f --- /dev/null +++ b/tests/test_stump_topk.py @@ -0,0 +1,242 @@ +import numpy as np +import numpy.testing as npt +import pandas as pd +from stumpy import stump, config +import pytest +import naive + + +test_data = [ + ( + np.array([9, 8100, -60, 7], dtype=np.float64), + np.array([584, -11, 23, 79, 1001, 0, -19], dtype=np.float64), + ), + ( + np.random.uniform(-1000, 1000, [8]).astype(np.float64), + np.random.uniform(-1000, 1000, [64]).astype(np.float64), + ), +] + +substitution_locations = [(slice(0, 0), 0, -1, slice(1, 3), [0, 
3])] +substitution_values = [np.nan, np.inf] + + +def test_stump_int_input(): + with pytest.raises(TypeError): + stump(np.arange(10), 5) + + +@pytest.mark.parametrize("T_A, T_B", test_data) +def test_stump_self_join(T_A, T_B): + m = 3 + zone = int(np.ceil(m / 4)) + ref_mp = naive.stump(T_B, m, exclusion_zone=zone) + comp_mp = stump(T_B, m, ignore_trivial=True) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) + + comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) + + +@pytest.mark.parametrize("T_A, T_B", test_data) +def test_stump_A_B_join(T_A, T_B): + m = 3 + ref_mp = naive.stump(T_A, m, T_B=T_B) + comp_mp = stump(T_A, m, T_B, ignore_trivial=False) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) + + comp_mp = stump(pd.Series(T_A), m, pd.Series(T_B), ignore_trivial=False) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) + + +def test_stump_constant_subsequence_self_join(): + T_A = np.concatenate((np.zeros(20, dtype=np.float64), np.ones(5, dtype=np.float64))) + m = 3 + zone = int(np.ceil(m / 4)) + ref_mp = naive.stump(T_A, m, exclusion_zone=zone) + comp_mp = stump(T_A, m, ignore_trivial=True) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices + + comp_mp = stump(pd.Series(T_A), m, ignore_trivial=True) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices + + +def test_stump_one_constant_subsequence_A_B_join(): + T_A = np.random.rand(20) + T_B = np.concatenate((np.zeros(20, dtype=np.float64), np.ones(5, dtype=np.float64))) + m = 3 + ref_mp = naive.stamp(T_A, m, T_B=T_B) + comp_mp = stump(T_A, m, T_B, ignore_trivial=False) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices + + comp_mp = stump(pd.Series(T_A), m, pd.Series(T_B), ignore_trivial=False) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices + + # Swap inputs + ref_mp = naive.stamp(T_B, m, T_B=T_A) + comp_mp = stump(T_B, m, T_A, ignore_trivial=False) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices + + +def test_stump_two_constant_subsequences_A_B_join(): + T_A = np.concatenate( + (np.zeros(10, dtype=np.float64), np.ones(10, dtype=np.float64)) + ) + T_B = np.concatenate((np.zeros(20, dtype=np.float64), np.ones(5, dtype=np.float64))) + m = 3 + ref_mp = naive.stamp(T_A, m, T_B=T_B) + comp_mp = stump(T_A, m, T_B, ignore_trivial=False) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices + + comp_mp = stump(pd.Series(T_A), m, pd.Series(T_B), ignore_trivial=False) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices + + # Swap inputs + ref_mp = naive.stamp(T_B, m, T_B=T_A) + comp_mp = stump(T_B, m, T_A, ignore_trivial=False) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices + + comp_mp = stump(pd.Series(T_B), m, pd.Series(T_A), ignore_trivial=False) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices + + +def test_stump_identical_subsequence_self_join(): + identical = 
np.random.rand(8) + T_A = np.random.rand(20) + T_A[1 : 1 + identical.shape[0]] = identical + T_A[11 : 11 + identical.shape[0]] = identical + m = 3 + zone = int(np.ceil(m / 4)) + ref_mp = naive.stamp(T_A, m, exclusion_zone=zone) + comp_mp = stump(T_A, m, ignore_trivial=True) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal( + ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION + ) # ignore indices + + comp_mp = stump(pd.Series(T_A), m, ignore_trivial=True) + naive.replace_inf(comp_mp) + npt.assert_almost_equal( + ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION + ) # ignore indices + + +def test_stump_identical_subsequence_A_B_join(): + identical = np.random.rand(8) + T_A = np.random.rand(20) + T_B = np.random.rand(20) + T_A[1 : 1 + identical.shape[0]] = identical + T_B[11 : 11 + identical.shape[0]] = identical + m = 3 + ref_mp = naive.stamp(T_A, m, T_B=T_B) + comp_mp = stump(T_A, m, T_B, ignore_trivial=False) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal( + ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION + ) # ignore indices + + comp_mp = stump(pd.Series(T_A), m, pd.Series(T_B), ignore_trivial=False) + naive.replace_inf(comp_mp) + npt.assert_almost_equal( + ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION + ) # ignore indices + + # Swap inputs + ref_mp = naive.stamp(T_B, m, T_B=T_A) + comp_mp = stump(T_B, m, T_A, ignore_trivial=False) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal( + ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION + ) # ignore indices + + +@pytest.mark.parametrize("T_A, T_B", test_data) +@pytest.mark.parametrize("substitute_B", substitution_values) +@pytest.mark.parametrize("substitution_locations", substitution_locations) +def test_stump_nan_inf_self_join(T_A, T_B, substitute_B, substitution_locations): + m = 3 + + T_B_sub = T_B.copy() + + for substitution_location_B in substitution_locations: + T_B_sub[:] = T_B[:] + T_B_sub[substitution_location_B] = substitute_B + + zone = int(np.ceil(m / 4)) + ref_mp = naive.stamp(T_B_sub, m, exclusion_zone=zone) + comp_mp = stump(T_B_sub, m, ignore_trivial=True) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) + + comp_mp = stump(pd.Series(T_B_sub), m, ignore_trivial=True) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) + + +@pytest.mark.parametrize("T_A, T_B", test_data) +@pytest.mark.parametrize("substitute_A", substitution_values) +@pytest.mark.parametrize("substitute_B", substitution_values) +@pytest.mark.parametrize("substitution_locations", substitution_locations) +def test_stump_nan_inf_A_B_join( + T_A, T_B, substitute_A, substitute_B, substitution_locations +): + m = 3 + + T_A_sub = T_A.copy() + T_B_sub = T_B.copy() + + for substitution_location_B in substitution_locations: + for substitution_location_A in substitution_locations: + T_A_sub[:] = T_A[:] + T_B_sub[:] = T_B[:] + T_A_sub[substitution_location_A] = substitute_A + T_B_sub[substitution_location_B] = substitute_B + + ref_mp = naive.stamp(T_A_sub, m, T_B=T_B_sub) + comp_mp = stump(T_A_sub, m, T_B_sub, ignore_trivial=False) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) + + comp_mp = stump( + pd.Series(T_A_sub), m, pd.Series(T_B_sub), ignore_trivial=False + ) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) + + +def 
test_stump_nan_zero_mean_self_join(): + T = np.array([-1, 0, 1, np.inf, 1, 0, -1]) + m = 3 + + zone = int(np.ceil(m / 4)) + ref_mp = naive.stamp(T, m, exclusion_zone=zone) + comp_mp = stump(T, m, ignore_trivial=True) + + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) From 986311f78dae7ca90db29a793d43fa23b0a3afe4 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Apr 2022 12:15:46 -0600 Subject: [PATCH 003/416] change replace naive.stump with naive.stump_topk --- tests/test_stump_topk.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/test_stump_topk.py b/tests/test_stump_topk.py index d3475122f..290487460 100644 --- a/tests/test_stump_topk.py +++ b/tests/test_stump_topk.py @@ -28,9 +28,10 @@ def test_stump_int_input(): @pytest.mark.parametrize("T_A, T_B", test_data) def test_stump_self_join(T_A, T_B): + k = 3 m = 3 zone = int(np.ceil(m / 4)) - ref_mp = naive.stump(T_B, m, exclusion_zone=zone) + ref_mp = naive.stump_topk(T_B, m, exclusion_zone=zone, k=k) comp_mp = stump(T_B, m, ignore_trivial=True) naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) @@ -43,8 +44,9 @@ def test_stump_self_join(T_A, T_B): @pytest.mark.parametrize("T_A, T_B", test_data) def test_stump_A_B_join(T_A, T_B): + k = 3 m = 3 - ref_mp = naive.stump(T_A, m, T_B=T_B) + ref_mp = naive.stump_topk(T_A, m, T_B=T_B, k=k) comp_mp = stump(T_A, m, T_B, ignore_trivial=False) naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) @@ -57,9 +59,10 @@ def test_stump_A_B_join(T_A, T_B): def test_stump_constant_subsequence_self_join(): T_A = np.concatenate((np.zeros(20, dtype=np.float64), np.ones(5, dtype=np.float64))) + k = 3 m = 3 zone = int(np.ceil(m / 4)) - ref_mp = naive.stump(T_A, m, exclusion_zone=zone) + ref_mp = naive.stump_topk(T_A, m, exclusion_zone=zone, k=k) comp_mp = stump(T_A, m, ignore_trivial=True) naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) From 9d8aafc3b75a051dee64aa72112dc8a3050b13b9 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Apr 2022 13:04:36 -0600 Subject: [PATCH 004/416] Add self-join tests for 1NN and KNN --- tests/test_stump_topk.py | 202 ++------------------------------------- 1 file changed, 7 insertions(+), 195 deletions(-) diff --git a/tests/test_stump_topk.py b/tests/test_stump_topk.py index 290487460..b3276b85b 100644 --- a/tests/test_stump_topk.py +++ b/tests/test_stump_topk.py @@ -27,8 +27,8 @@ def test_stump_int_input(): @pytest.mark.parametrize("T_A, T_B", test_data) -def test_stump_self_join(T_A, T_B): - k = 3 +def test_stump_self_join_1NN(T_A, T_B): + k = 1 m = 3 zone = int(np.ceil(m / 4)) ref_mp = naive.stump_topk(T_B, m, exclusion_zone=zone, k=k) @@ -42,204 +42,16 @@ def test_stump_self_join(T_A, T_B): npt.assert_almost_equal(ref_mp, comp_mp) -@pytest.mark.parametrize("T_A, T_B", test_data) -def test_stump_A_B_join(T_A, T_B): - k = 3 - m = 3 - ref_mp = naive.stump_topk(T_A, m, T_B=T_B, k=k) - comp_mp = stump(T_A, m, T_B, ignore_trivial=False) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) - - comp_mp = stump(pd.Series(T_A), m, pd.Series(T_B), ignore_trivial=False) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) - - -def test_stump_constant_subsequence_self_join(): - T_A = np.concatenate((np.zeros(20, dtype=np.float64), np.ones(5, dtype=np.float64))) +def test_stump_self_join_KNN(T_A, T_B): k = 3 m = 3 zone = int(np.ceil(m / 4)) - ref_mp = naive.stump_topk(T_A, m, exclusion_zone=zone, k=k) - comp_mp = stump(T_A, m, 
ignore_trivial=True) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices - - comp_mp = stump(pd.Series(T_A), m, ignore_trivial=True) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices - - -def test_stump_one_constant_subsequence_A_B_join(): - T_A = np.random.rand(20) - T_B = np.concatenate((np.zeros(20, dtype=np.float64), np.ones(5, dtype=np.float64))) - m = 3 - ref_mp = naive.stamp(T_A, m, T_B=T_B) - comp_mp = stump(T_A, m, T_B, ignore_trivial=False) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices - - comp_mp = stump(pd.Series(T_A), m, pd.Series(T_B), ignore_trivial=False) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices - - # Swap inputs - ref_mp = naive.stamp(T_B, m, T_B=T_A) - comp_mp = stump(T_B, m, T_A, ignore_trivial=False) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices - - -def test_stump_two_constant_subsequences_A_B_join(): - T_A = np.concatenate( - (np.zeros(10, dtype=np.float64), np.ones(10, dtype=np.float64)) - ) - T_B = np.concatenate((np.zeros(20, dtype=np.float64), np.ones(5, dtype=np.float64))) - m = 3 - ref_mp = naive.stamp(T_A, m, T_B=T_B) - comp_mp = stump(T_A, m, T_B, ignore_trivial=False) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices - - comp_mp = stump(pd.Series(T_A), m, pd.Series(T_B), ignore_trivial=False) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices - - # Swap inputs - ref_mp = naive.stamp(T_B, m, T_B=T_A) - comp_mp = stump(T_B, m, T_A, ignore_trivial=False) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices - - comp_mp = stump(pd.Series(T_B), m, pd.Series(T_A), ignore_trivial=False) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp[:, 0], comp_mp[:, 0]) # ignore indices - - -def test_stump_identical_subsequence_self_join(): - identical = np.random.rand(8) - T_A = np.random.rand(20) - T_A[1 : 1 + identical.shape[0]] = identical - T_A[11 : 11 + identical.shape[0]] = identical - m = 3 - zone = int(np.ceil(m / 4)) - ref_mp = naive.stamp(T_A, m, exclusion_zone=zone) - comp_mp = stump(T_A, m, ignore_trivial=True) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal( - ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION - ) # ignore indices - - comp_mp = stump(pd.Series(T_A), m, ignore_trivial=True) - naive.replace_inf(comp_mp) - npt.assert_almost_equal( - ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION - ) # ignore indices - - -def test_stump_identical_subsequence_A_B_join(): - identical = np.random.rand(8) - T_A = np.random.rand(20) - T_B = np.random.rand(20) - T_A[1 : 1 + identical.shape[0]] = identical - T_B[11 : 11 + identical.shape[0]] = identical - m = 3 - ref_mp = naive.stamp(T_A, m, T_B=T_B) - comp_mp = stump(T_A, m, T_B, ignore_trivial=False) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal( - ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION - ) # ignore indices - - comp_mp = stump(pd.Series(T_A), m, pd.Series(T_B), ignore_trivial=False) - naive.replace_inf(comp_mp) - 
npt.assert_almost_equal( - ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION - ) # ignore indices - - # Swap inputs - ref_mp = naive.stamp(T_B, m, T_B=T_A) - comp_mp = stump(T_B, m, T_A, ignore_trivial=False) + ref_mp = naive.stump_topk(T_B, m, exclusion_zone=zone, k=k) + comp_mp = stump(T_B, m, ignore_trivial=True) naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) - npt.assert_almost_equal( - ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION - ) # ignore indices - - -@pytest.mark.parametrize("T_A, T_B", test_data) -@pytest.mark.parametrize("substitute_B", substitution_values) -@pytest.mark.parametrize("substitution_locations", substitution_locations) -def test_stump_nan_inf_self_join(T_A, T_B, substitute_B, substitution_locations): - m = 3 - - T_B_sub = T_B.copy() - - for substitution_location_B in substitution_locations: - T_B_sub[:] = T_B[:] - T_B_sub[substitution_location_B] = substitute_B - - zone = int(np.ceil(m / 4)) - ref_mp = naive.stamp(T_B_sub, m, exclusion_zone=zone) - comp_mp = stump(T_B_sub, m, ignore_trivial=True) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) - - comp_mp = stump(pd.Series(T_B_sub), m, ignore_trivial=True) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) - - -@pytest.mark.parametrize("T_A, T_B", test_data) -@pytest.mark.parametrize("substitute_A", substitution_values) -@pytest.mark.parametrize("substitute_B", substitution_values) -@pytest.mark.parametrize("substitution_locations", substitution_locations) -def test_stump_nan_inf_A_B_join( - T_A, T_B, substitute_A, substitute_B, substitution_locations -): - m = 3 - - T_A_sub = T_A.copy() - T_B_sub = T_B.copy() - - for substitution_location_B in substitution_locations: - for substitution_location_A in substitution_locations: - T_A_sub[:] = T_A[:] - T_B_sub[:] = T_B[:] - T_A_sub[substitution_location_A] = substitute_A - T_B_sub[substitution_location_B] = substitute_B - - ref_mp = naive.stamp(T_A_sub, m, T_B=T_B_sub) - comp_mp = stump(T_A_sub, m, T_B_sub, ignore_trivial=False) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) - - comp_mp = stump( - pd.Series(T_A_sub), m, pd.Series(T_B_sub), ignore_trivial=False - ) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) - - -def test_stump_nan_zero_mean_self_join(): - T = np.array([-1, 0, 1, np.inf, 1, 0, -1]) - m = 3 - - zone = int(np.ceil(m / 4)) - ref_mp = naive.stamp(T, m, exclusion_zone=zone) - comp_mp = stump(T, m, ignore_trivial=True) + npt.assert_almost_equal(ref_mp, comp_mp) - naive.replace_inf(ref_mp) + comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True) naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) From 121686b43187f053f23c09f07f2cf88f0ab1c238 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Apr 2022 13:09:15 -0600 Subject: [PATCH 005/416] remove variable k in 1NN test --- tests/test_stump_topk.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_stump_topk.py b/tests/test_stump_topk.py index b3276b85b..3f277a0ad 100644 --- a/tests/test_stump_topk.py +++ b/tests/test_stump_topk.py @@ -28,10 +28,9 @@ def test_stump_int_input(): @pytest.mark.parametrize("T_A, T_B", test_data) def test_stump_self_join_1NN(T_A, T_B): - k = 1 m = 3 zone = int(np.ceil(m / 4)) - ref_mp = naive.stump_topk(T_B, m, exclusion_zone=zone, k=k) + ref_mp = naive.stump_topk(T_B, m, exclusion_zone=zone, k=1) comp_mp = stump(T_B, m, ignore_trivial=True) 
naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) From 730bfbbee7e867b2373e5060503492bab533efd8 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Apr 2022 13:11:49 -0600 Subject: [PATCH 006/416] Fixed passing input to test function --- tests/test_stump_topk.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_stump_topk.py b/tests/test_stump_topk.py index 3f277a0ad..4b722fd8f 100644 --- a/tests/test_stump_topk.py +++ b/tests/test_stump_topk.py @@ -41,6 +41,7 @@ def test_stump_self_join_1NN(T_A, T_B): npt.assert_almost_equal(ref_mp, comp_mp) +@pytest.mark.parametrize("T_A, T_B", test_data) def test_stump_self_join_KNN(T_A, T_B): k = 3 m = 3 From f78348f3fadaface820e558c909e19cb0803503c Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Apr 2022 13:19:43 -0600 Subject: [PATCH 007/416] Fixed minor bug --- tests/naive.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 8f3a05da8..6dd4bcb99 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1756,11 +1756,11 @@ def stump_topk(T_A, m, T_B=None, exclusion_zone=None, k=1): if g >= 0: iter_range = range(0, min(n_A - m + 1, n_B - m + 1 - g)) else: - iter_range = range(-k, min(n_A - m + 1, n_B - m + 1 - g)) + iter_range = range(-g, min(n_A - m + 1, n_B - m + 1 - g)) for i in iter_range: D = distance_matrix[i, i + g] - if D < P[i, k - 1]: + if D < P[i, k - 1]: #less than k-th smallest value of T[i:i+m] idx = np.searchsorted(P[i, :k], D, side='right') P[i, :k] = np.insert(P[i, :k], idx, D)[:-1] I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] From e09b5f05d16c4506ded15df432fcd27b2fc822df Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Apr 2022 13:31:18 -0600 Subject: [PATCH 008/416] Correct format --- tests/naive.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 6dd4bcb99..91a88cea7 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1760,14 +1760,14 @@ def stump_topk(T_A, m, T_B=None, exclusion_zone=None, k=1): for i in iter_range: D = distance_matrix[i, i + g] - if D < P[i, k - 1]: #less than k-th smallest value of T[i:i+m] - idx = np.searchsorted(P[i, :k], D, side='right') + if D < P[i, k - 1]: # less than k-th smallest value of T[i:i+m] + idx = np.searchsorted(P[i, :k], D, side="right") P[i, :k] = np.insert(P[i, :k], idx, D)[:-1] I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] if ignore_trivial: # Self-joins only if D < P[i + g, k - 1]: - idx = np.searchsorted(P[i + g, :k], D, side='right') + idx = np.searchsorted(P[i + g, :k], D, side="right") P[i + g, :k] = np.insert(P[i + g, :k], idx, D)[:-1] I[i + g, :k] = np.insert(I[i + g, :k], idx, i)[:-1] From 95a8c081f745ea8781da5b4eaefceea936559471 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Apr 2022 01:15:01 -0600 Subject: [PATCH 009/416] Erase function stump_topk --- tests/naive.py | 71 -------------------------------------------------- 1 file changed, 71 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 91a88cea7..4089e603e 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1716,74 +1716,3 @@ def _total_diagonal_ndists(tile_lower_diag, tile_upper_diag, tile_height, tile_w ) return total_ndists - - -def stump_topk(T_A, m, T_B=None, exclusion_zone=None, k=1): - """ - Traverse distance matrix along the diagonals and update the top-k - nearest neigbors matrix profile and matrix profile indices - """ - if T_B is None: # self-join: - ignore_trivial = True - distance_matrix = np.array( - [distance_profile(Q, T_A, m) for Q in 
core.rolling_window(T_A, m)] - ) - T_B = T_A.copy() - else: - ignore_trivial = False - distance_matrix = np.array( - [distance_profile(Q, T_B, m) for Q in core.rolling_window(T_A, m)] - ) - - distance_matrix[np.isnan(distance_matrix)] = np.inf - - n_A = T_A.shape[0] - n_B = T_B.shape[0] - l = n_A - m + 1 - if exclusion_zone is None: - exclusion_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) - - if ignore_trivial: - diags = np.arange(exclusion_zone + 1, n_A - m + 1) - else: - diags = np.arange(-(n_A - m + 1) + 1, n_B - m + 1) - - # the last two columns in P and I are to keep track of right and left mp for 1NN - P = np.full((l, k + 2), np.inf) - I = np.full((l, k + 2), -1, dtype=np.int64) - - for g in diags: - if g >= 0: - iter_range = range(0, min(n_A - m + 1, n_B - m + 1 - g)) - else: - iter_range = range(-g, min(n_A - m + 1, n_B - m + 1 - g)) - - for i in iter_range: - D = distance_matrix[i, i + g] - if D < P[i, k - 1]: # less than k-th smallest value of T[i:i+m] - idx = np.searchsorted(P[i, :k], D, side="right") - P[i, :k] = np.insert(P[i, :k], idx, D)[:-1] - I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] - - if ignore_trivial: # Self-joins only - if D < P[i + g, k - 1]: - idx = np.searchsorted(P[i + g, :k], D, side="right") - P[i + g, :k] = np.insert(P[i + g, :k], idx, D)[:-1] - I[i + g, :k] = np.insert(I[i + g, :k], idx, i)[:-1] - - if i < i + g: - # Left matrix profile and left matrix profile index - if D < P[i + g, k]: - P[i + g, k] = D - I[i + g, k] = i - - if D < P[i, k + 1]: - # right matrix profile and right matrix profile index - P[i, k + 1] = D - I[i, k + 1] = i + g - - result = np.empty((l, 2 * k + 2), dtype=object) - result[:, :k] = P[:, :k] - result[:, k:] = I[:, :] - - return result From d0701fedd3060dcb0b97a266ceaae4beacae52e8 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Apr 2022 16:18:58 -0600 Subject: [PATCH 010/416] Revise naive.stump to return topk NN matrix profile --- tests/naive.py | 64 +++++++++++++++++++++----------------------------- 1 file changed, 27 insertions(+), 37 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 4089e603e..0c49c5746 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -156,7 +156,7 @@ def stamp(T_A, m, T_B=None, exclusion_zone=None): return result -def stump(T_A, m, T_B=None, exclusion_zone=None): +def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): """ Traverse distance matrix along the diagonals and update the matrix profile and matrix profile indices @@ -181,45 +181,35 @@ def stump(T_A, m, T_B=None, exclusion_zone=None): if exclusion_zone is None: exclusion_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) + is_included = np.ones_like(distance_matrix, dtype=bool) if ignore_trivial: - diags = np.arange(exclusion_zone + 1, n_A - m + 1) - else: - diags = np.arange(-(n_A - m + 1) + 1, n_B - m + 1) + for i in range(l): + apply_exclusion_zone(is_included[i], i, exclusion_zone, False) - P = np.full((l, 3), np.inf) - I = np.full((l, 3), -1, dtype=np.int64) + P = np.full((l, k), np.inf) + I = np.full((l, k + 2), -1, dtype=np.int64) - for k in diags: - if k >= 0: - iter_range = range(0, min(n_A - m + 1, n_B - m + 1 - k)) - else: - iter_range = range(-k, min(n_A - m + 1, n_B - m + 1 - k)) - - for i in iter_range: - D = distance_matrix[i, i + k] - if D < P[i, 0]: - P[i, 0] = D - I[i, 0] = i + k - - if ignore_trivial: # Self-joins only - if D < P[i + k, 0]: - P[i + k, 0] = D - I[i + k, 0] = i - - if i < i + k: - # Left matrix profile and left matrix profile index - if D < P[i + k, 1]: - P[i + k, 1] = D - I[i + 
k, 1] = i - - if D < P[i, 2]: - # right matrix profile and right matrix profile index - P[i, 2] = D - I[i, 2] = i + k - - result = np.empty((l, 4), dtype=object) - result[:, 0] = P[:, 0] - result[:, 1:4] = I[:, :] + for i in range(l): + mask = is_included[i] + IDX = np.argsort(distance_matrix[i][mask]) + nn_indices_sorted = np.flatnonzero(mask)[IDX] + + topk_indices = nn_indices_sorted[:k] + P[i, :k] = distance_matrix[i][topk_indices] + I[i, :k] = topk_indices + + if ignore_trivial: + left_indices = nn_indices_sorted[nn_indices_sorted < i] + if len(left_indices) > 0: + I[i, k] = left_indices[0] + + right_indices = nn_indices_sorted[nn_indices_sorted > i] + if len(right_indices) > 0: + I[i, k + 1] = right_indices[0] + + result = np.empty((l, 2 * k + 2), dtype=object) + result[:, :k] = P[:, :] + result[:, k:] = I[:, :] return result From 54445994ac87bccecf2a4252044d7e5cd0434718 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Apr 2022 16:29:53 -0600 Subject: [PATCH 011/416] Added a few comments --- tests/naive.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/naive.py b/tests/naive.py index 0c49c5746..f9c9226ef 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -185,9 +185,13 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): if ignore_trivial: for i in range(l): apply_exclusion_zone(is_included[i], i, exclusion_zone, False) + # replacing values of distanc matrix to np.inf in excluion zone + # can cause problem later if there is nan/np.inf in data. So, + # it is better to use mask. P = np.full((l, k), np.inf) - I = np.full((l, k + 2), -1, dtype=np.int64) + I = np.full((l, k + 2), -1, dtype=np.int64) # two more columns in I are + # to store left and right matrix profile indices. for i in range(l): mask = is_included[i] From 9ebb08a4f274cd7c4e1f5a5f11c5c92cb5839721 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Apr 2022 17:17:06 -0600 Subject: [PATCH 012/416] Add one new test case for topk matrix profile --- tests/test_stump.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/test_stump.py b/tests/test_stump.py index d3475122f..67a6ec704 100644 --- a/tests/test_stump.py +++ b/tests/test_stump.py @@ -240,3 +240,19 @@ def test_stump_nan_zero_mean_self_join(): naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) + + +@pytest.mark.parametrize("T_A, T_B", test_data) +def test_stump_self_join_KNN(T_A, T_B): + k = 2 + m = 3 + zone = int(np.ceil(m / 4)) + ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) + comp_mp = stump(T_B, m, ignore_trivial=True) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) + + comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) From d83e8e6355813c15dbfc111a1e853ce1879c3027 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Apr 2022 17:20:17 -0600 Subject: [PATCH 013/416] Removed unnecessary test file --- tests/test_stump_topk.py | 57 ---------------------------------------- 1 file changed, 57 deletions(-) delete mode 100644 tests/test_stump_topk.py diff --git a/tests/test_stump_topk.py b/tests/test_stump_topk.py deleted file mode 100644 index 4b722fd8f..000000000 --- a/tests/test_stump_topk.py +++ /dev/null @@ -1,57 +0,0 @@ -import numpy as np -import numpy.testing as npt -import pandas as pd -from stumpy import stump, config -import pytest -import naive - - -test_data = [ - ( - np.array([9, 8100, -60, 7], dtype=np.float64), - 
np.array([584, -11, 23, 79, 1001, 0, -19], dtype=np.float64), - ), - ( - np.random.uniform(-1000, 1000, [8]).astype(np.float64), - np.random.uniform(-1000, 1000, [64]).astype(np.float64), - ), -] - -substitution_locations = [(slice(0, 0), 0, -1, slice(1, 3), [0, 3])] -substitution_values = [np.nan, np.inf] - - -def test_stump_int_input(): - with pytest.raises(TypeError): - stump(np.arange(10), 5) - - -@pytest.mark.parametrize("T_A, T_B", test_data) -def test_stump_self_join_1NN(T_A, T_B): - m = 3 - zone = int(np.ceil(m / 4)) - ref_mp = naive.stump_topk(T_B, m, exclusion_zone=zone, k=1) - comp_mp = stump(T_B, m, ignore_trivial=True) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) - - comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) - - -@pytest.mark.parametrize("T_A, T_B", test_data) -def test_stump_self_join_KNN(T_A, T_B): - k = 3 - m = 3 - zone = int(np.ceil(m / 4)) - ref_mp = naive.stump_topk(T_B, m, exclusion_zone=zone, k=k) - comp_mp = stump(T_B, m, ignore_trivial=True) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) - - comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) From 9c8f019353991898bd8ad248053353af19e7c288 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Apr 2022 20:58:31 -0600 Subject: [PATCH 014/416] Set I to -1 if its corresponding P is not finite --- tests/naive.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index f9c9226ef..d3640b66c 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -158,8 +158,8 @@ def stamp(T_A, m, T_B=None, exclusion_zone=None): def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): """ - Traverse distance matrix along the diagonals and update the matrix profile and - matrix profile indices + Traverse distance matrix in a row-wise manner and store topk nearest neighbor + matrix profile and matrix profile indices """ if T_B is None: # self-join: ignore_trivial = True @@ -181,35 +181,36 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): if exclusion_zone is None: exclusion_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) - is_included = np.ones_like(distance_matrix, dtype=bool) if ignore_trivial: for i in range(l): - apply_exclusion_zone(is_included[i], i, exclusion_zone, False) - # replacing values of distanc matrix to np.inf in excluion zone - # can cause problem later if there is nan/np.inf in data. So, - # it is better to use mask. + apply_exclusion_zone(distance_matrix[i], i, exclusion_zone, np.inf) P = np.full((l, k), np.inf) - I = np.full((l, k + 2), -1, dtype=np.int64) # two more columns in I are + I = np.full((l, k + 2), -1, dtype=np.int64) # two more columns in I are # to store left and right matrix profile indices. 
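    # For each subsequence i, the loop below sorts its distance profile, keeps the k
    # smallest distances (with their indices) as the top-k matrix profile, and resets any
    # index whose distance is still np.inf to -1; for self-joins, it also records the
    # nearest neighbor strictly to the left and to the right of i (or -1 if that
    # neighbor's distance is not finite).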
for i in range(l): - mask = is_included[i] - IDX = np.argsort(distance_matrix[i][mask]) - nn_indices_sorted = np.flatnonzero(mask)[IDX] - - topk_indices = nn_indices_sorted[:k] + indices = np.argsort(distance_matrix[i]) + topk_indices = indices[:k] P[i, :k] = distance_matrix[i][topk_indices] - I[i, :k] = topk_indices + I[i, :k] = np.where(distance_matrix[i][topk_indices] != np.inf, topk_indices, -1) if ignore_trivial: - left_indices = nn_indices_sorted[nn_indices_sorted < i] + IL = -1 + left_indices = indices[indices < i] if len(left_indices) > 0: - I[i, k] = left_indices[0] + IL = left_indices[0] + if distance_matrix[i][IL] == np.inf: + IL = -1 + I[i, k] = IL - right_indices = nn_indices_sorted[nn_indices_sorted > i] + IR = -1 + right_indices = indices[indices > i] if len(right_indices) > 0: - I[i, k + 1] = right_indices[0] + IR = right_indices[0] + if distance_matrix[i][IR] == np.inf: + IR = -1 + I[i, k + 1] = IR result = np.empty((l, 2 * k + 2), dtype=object) result[:, :k] = P[:, :] From 0ce959549502e8091d1d017da8c95df73ae45401 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Apr 2022 21:04:16 -0600 Subject: [PATCH 015/416] Removed new test function --- tests/test_stump.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/test_stump.py b/tests/test_stump.py index 67a6ec704..4d2bf312b 100644 --- a/tests/test_stump.py +++ b/tests/test_stump.py @@ -242,17 +242,17 @@ def test_stump_nan_zero_mean_self_join(): npt.assert_almost_equal(ref_mp, comp_mp) -@pytest.mark.parametrize("T_A, T_B", test_data) -def test_stump_self_join_KNN(T_A, T_B): - k = 2 - m = 3 - zone = int(np.ceil(m / 4)) - ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) - comp_mp = stump(T_B, m, ignore_trivial=True) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) - - comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) +#@pytest.mark.parametrize("T_A, T_B", test_data) +#def test_stump_self_join_KNN(T_A, T_B): +# k = 2 +# m = 3 +# zone = int(np.ceil(m / 4)) +# ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) +# comp_mp = stump(T_B, m, ignore_trivial=True) +# naive.replace_inf(ref_mp) +# naive.replace_inf(comp_mp) +# npt.assert_almost_equal(ref_mp, comp_mp) + +# comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True) +# naive.replace_inf(comp_mp) +# npt.assert_almost_equal(ref_mp, comp_mp) From a9726984574deca4eb79c74b622581036604635c Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Apr 2022 21:06:59 -0600 Subject: [PATCH 016/416] Fixed format --- tests/naive.py | 4 +++- tests/test_stump.py | 16 ---------------- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index d3640b66c..98f639a08 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -193,7 +193,9 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): indices = np.argsort(distance_matrix[i]) topk_indices = indices[:k] P[i, :k] = distance_matrix[i][topk_indices] - I[i, :k] = np.where(distance_matrix[i][topk_indices] != np.inf, topk_indices, -1) + I[i, :k] = np.where( + distance_matrix[i][topk_indices] != np.inf, topk_indices, -1 + ) if ignore_trivial: IL = -1 diff --git a/tests/test_stump.py b/tests/test_stump.py index 4d2bf312b..d3475122f 100644 --- a/tests/test_stump.py +++ b/tests/test_stump.py @@ -240,19 +240,3 @@ def test_stump_nan_zero_mean_self_join(): naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) 
npt.assert_almost_equal(ref_mp, comp_mp) - - -#@pytest.mark.parametrize("T_A, T_B", test_data) -#def test_stump_self_join_KNN(T_A, T_B): -# k = 2 -# m = 3 -# zone = int(np.ceil(m / 4)) -# ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) -# comp_mp = stump(T_B, m, ignore_trivial=True) -# naive.replace_inf(ref_mp) -# naive.replace_inf(comp_mp) -# npt.assert_almost_equal(ref_mp, comp_mp) - -# comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True) -# naive.replace_inf(comp_mp) -# npt.assert_almost_equal(ref_mp, comp_mp) From e2d3061e132316cad0e4bbb74d0ff8f5bf0e52ce Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Apr 2022 21:14:07 -0600 Subject: [PATCH 017/416] minor change --- tests/naive.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 98f639a08..429b2ac99 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -161,7 +161,10 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): Traverse distance matrix in a row-wise manner and store topk nearest neighbor matrix profile and matrix profile indices """ - if T_B is None: # self-join: + if exclusion_zone is None: + exclusion_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) + + if T_B is None: # self-join: ignore_trivial = True distance_matrix = np.array( [distance_profile(Q, T_A, m) for Q in core.rolling_window(T_A, m)] @@ -175,12 +178,7 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): distance_matrix[np.isnan(distance_matrix)] = np.inf - n_A = T_A.shape[0] - n_B = T_B.shape[0] - l = n_A - m + 1 - if exclusion_zone is None: - exclusion_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) - + l = T_A.shape[0] - m + 1 if ignore_trivial: for i in range(l): apply_exclusion_zone(distance_matrix[i], i, exclusion_zone, np.inf) From 1938f63363dc873a7c00300c66c54742ec9b0010 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Apr 2022 21:16:46 -0600 Subject: [PATCH 018/416] minor change --- tests/naive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/naive.py b/tests/naive.py index 429b2ac99..ff50eecf7 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -164,7 +164,7 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): if exclusion_zone is None: exclusion_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) - if T_B is None: # self-join: + if T_B is None: # self-join: ignore_trivial = True distance_matrix = np.array( [distance_profile(Q, T_A, m) for Q in core.rolling_window(T_A, m)] From 0e25a347ad7a3fa50d63144e32df771d9ad57545 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Apr 2022 22:32:24 -0600 Subject: [PATCH 019/416] Add new test function for topk matrix profile --- tests/test_stump.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/test_stump.py b/tests/test_stump.py index d3475122f..ea4bae3c9 100644 --- a/tests/test_stump.py +++ b/tests/test_stump.py @@ -240,3 +240,18 @@ def test_stump_nan_zero_mean_self_join(): naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) + +@pytest.mark.parametrize("T_A, T_B", test_data) +def test_stump_self_join_KNN(T_A, T_B): + k = 2 + m = 3 + zone = int(np.ceil(m / 4)) + ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) + comp_mp = stump(T_B, m, ignore_trivial=True) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) + + comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) From 
e3935851485cc4ecd9c097c915ab37c3946530fd Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Apr 2022 22:34:13 -0600 Subject: [PATCH 020/416] Fixed format --- tests/test_stump.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_stump.py b/tests/test_stump.py index ea4bae3c9..67a6ec704 100644 --- a/tests/test_stump.py +++ b/tests/test_stump.py @@ -241,6 +241,7 @@ def test_stump_nan_zero_mean_self_join(): naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) + @pytest.mark.parametrize("T_A, T_B", test_data) def test_stump_self_join_KNN(T_A, T_B): k = 2 From 850a5946c88465a4fa93fd91b113015752860ff2 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 2 May 2022 12:53:45 -0600 Subject: [PATCH 021/416] Use diagonal traversal to get top-k matrix profile - change naive.stump from row-wise to traversal - add a note to docstring to inform reader of row-wise traversal - use numpy.searchsort(side='right') --- tests/naive.py | 83 ++++++++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 36 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 554c6f9fd..552c85cee 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -158,12 +158,11 @@ def stamp(T_A, m, T_B=None, exclusion_zone=None): def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): """ - Traverse distance matrix in a row-wise manner and store topk nearest neighbor - matrix profile and matrix profile indices - """ - if exclusion_zone is None: - exclusion_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) + Traverse distance matrix along the diagonals and update the top-k nearest + neighbor matrix profile and matrix profile indices + NOTE: For row-wise traversal, please use function `stamp` + """ if T_B is None: # self-join: ignore_trivial = True distance_matrix = np.array( @@ -178,42 +177,54 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): distance_matrix[np.isnan(distance_matrix)] = np.inf - l = T_A.shape[0] - m + 1 + n_A = T_A.shape[0] + n_B = T_B.shape[0] + l = n_A - m + 1 + if exclusion_zone is None: + exclusion_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) + if ignore_trivial: - for i in range(l): - apply_exclusion_zone(distance_matrix[i], i, exclusion_zone, np.inf) + diags = np.arange(exclusion_zone + 1, n_A - m + 1) + else: + diags = np.arange(-(n_A - m + 1) + 1, n_B - m + 1) - P = np.full((l, k), np.inf) - I = np.full((l, k + 2), -1, dtype=np.int64) # two more columns in I are - # to store left and right matrix profile indices. + P = np.full((l, k + 2), np.inf) + I = np.full((l, k + 2), -1, dtype=np.int64) # two more columns are to store + # ... left and right top-1 matrix profile indices. - for i in range(l): - indices = np.argsort(distance_matrix[i]) - topk_indices = indices[:k] - P[i, :k] = distance_matrix[i][topk_indices] - I[i, :k] = np.where( - distance_matrix[i][topk_indices] != np.inf, topk_indices, -1 - ) + for g in diags: + if g >= 0: + iter_range = range(0, min(n_A - m + 1, n_B - m + 1 - g)) + else: + iter_range = range(-g, min(n_A - m + 1, n_B - m + 1 - g)) - if ignore_trivial: - IL = -1 - left_indices = indices[indices < i] - if len(left_indices) > 0: - IL = left_indices[0] - if distance_matrix[i][IL] == np.inf: - IL = -1 - I[i, k] = IL + for i in iter_range: + D = distance_matrix[i, i + g] + if D < P[i, k-1]: + idx = np.searchsorted(P[i, :k], D, side='right') + # to keep the top-k, we need to the get rid of the last element. 
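                # searchsorted with side='right' gives the insertion point that keeps
                # P[i, :k] sorted in ascending order; on ties, the new distance is placed
                # after existing equal distances, so earlier neighbors keep precedence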
+ P[i, :k] = np.insert(P[i, :k], idx, D)[:-1] + I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] - IR = -1 - right_indices = indices[indices > i] - if len(right_indices) > 0: - IR = right_indices[0] - if distance_matrix[i][IR] == np.inf: - IR = -1 - I[i, k + 1] = IR - - result = np.empty((l, 2 * k + 2), dtype=object) - result[:, :k] = P[:, :] + if ignore_trivial: # Self-joins only + if D < P[i + g, k-1]: + idx = np.searchsorted(P[i + g, :k], D, side='right') + P[i + g, :k] = np.insert(P[i + g, :k], idx, D)[:-1] + I[i + g, :k] = np.insert(I[i + g, :k], idx, i)[:-1] + + if i < i + g: + # Left matrix profile and left matrix profile index + if D < P[i + g, k]: + P[i + g, k] = D + I[i + g, k] = i + + if D < P[i, k + 1]: + # right matrix profile and right matrix profile index + P[i, k + 1] = D + I[i, k + 1] = i + g + + result = np.empty((2 * k + 2, 4), dtype=object) + result[:, :k] = P[:, :k] result[:, k:] = I[:, :] return result From 278e76ca5e74c53276b1e20cc6d4ab3efd8bc078 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 2 May 2022 13:00:21 -0600 Subject: [PATCH 022/416] Fixed shape of naive.stump output --- tests/naive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/naive.py b/tests/naive.py index 552c85cee..871d52024 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -223,7 +223,7 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): P[i, k + 1] = D I[i, k + 1] = i + g - result = np.empty((2 * k + 2, 4), dtype=object) + result = np.empty((l, 2 * k + 2), dtype=object) result[:, :k] = P[:, :k] result[:, k:] = I[:, :] From a864662b41f8553df6fcc1f1b9b3b341beb5cc31 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 2 May 2022 13:29:52 -0600 Subject: [PATCH 023/416] Add naive version of numpy.searchsorted --- tests/naive.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/naive.py b/tests/naive.py index 871d52024..010836639 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -156,6 +156,14 @@ def stamp(T_A, m, T_B=None, exclusion_zone=None): return result +def searchsorted(a, v): + indices = np.flatnonzero(v < a) + if len(indices): + return indices.min() + else: + return len(a) + + def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): """ Traverse distance matrix along the diagonals and update the top-k nearest From f0c022da2fb61b1c9840d59e3a2034222dae65c4 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 2 May 2022 13:30:41 -0600 Subject: [PATCH 024/416] Replace numpy.searchsorted with its naive version --- tests/naive.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 010836639..24ca851c7 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -209,14 +209,14 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): for i in iter_range: D = distance_matrix[i, i + g] if D < P[i, k-1]: - idx = np.searchsorted(P[i, :k], D, side='right') + idx = searchsorted(P[i, :k], D, side='right') # to keep the top-k, we need to the get rid of the last element. 
P[i, :k] = np.insert(P[i, :k], idx, D)[:-1] I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] if ignore_trivial: # Self-joins only if D < P[i + g, k-1]: - idx = np.searchsorted(P[i + g, :k], D, side='right') + idx = searchsorted(P[i + g, :k], D, side='right') P[i + g, :k] = np.insert(P[i + g, :k], idx, D)[:-1] I[i + g, :k] = np.insert(I[i + g, :k], idx, i)[:-1] From 81701ba3620abb480b3852909ffe6fd0b46874ec Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 2 May 2022 13:33:35 -0600 Subject: [PATCH 025/416] Fixed calling function searchsorted --- tests/naive.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 24ca851c7..a282d49c0 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -157,6 +157,9 @@ def stamp(T_A, m, T_B=None, exclusion_zone=None): def searchsorted(a, v): + """ + naive version of numpy.searchsorted(..., side='right') + """ indices = np.flatnonzero(v < a) if len(indices): return indices.min() @@ -209,14 +212,14 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): for i in iter_range: D = distance_matrix[i, i + g] if D < P[i, k-1]: - idx = searchsorted(P[i, :k], D, side='right') + idx = searchsorted(P[i, :k], D) # to keep the top-k, we need to the get rid of the last element. P[i, :k] = np.insert(P[i, :k], idx, D)[:-1] I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] if ignore_trivial: # Self-joins only if D < P[i + g, k-1]: - idx = searchsorted(P[i + g, :k], D, side='right') + idx = searchsorted(P[i + g, :k], D) P[i + g, :k] = np.insert(P[i + g, :k], idx, D)[:-1] I[i + g, :k] = np.insert(I[i + g, :k], idx, i)[:-1] From e244341a9291119a6f3f48ca07f9b7a11203c545 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 2 May 2022 13:36:51 -0600 Subject: [PATCH 026/416] Fixed format --- tests/naive.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index a282d49c0..0f70ae7b4 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -200,7 +200,7 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): diags = np.arange(-(n_A - m + 1) + 1, n_B - m + 1) P = np.full((l, k + 2), np.inf) - I = np.full((l, k + 2), -1, dtype=np.int64) # two more columns are to store + I = np.full((l, k + 2), -1, dtype=np.int64) # two more columns are to store # ... left and right top-1 matrix profile indices. for g in diags: @@ -211,14 +211,14 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, k=1): for i in iter_range: D = distance_matrix[i, i + g] - if D < P[i, k-1]: + if D < P[i, k - 1]: idx = searchsorted(P[i, :k], D) # to keep the top-k, we need to the get rid of the last element. 
P[i, :k] = np.insert(P[i, :k], idx, D)[:-1] I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] if ignore_trivial: # Self-joins only - if D < P[i + g, k-1]: + if D < P[i + g, k - 1]: idx = searchsorted(P[i + g, :k], D) P[i + g, :k] = np.insert(P[i + g, :k], idx, D)[:-1] I[i + g, :k] = np.insert(I[i + g, :k], idx, i)[:-1] From 1806c66241547cbdd9ac02c0313d16157b5f700e Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 12:13:05 -0600 Subject: [PATCH 027/416] minor changes --- tests/naive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/naive.py b/tests/naive.py index 5592af064..3028dd15c 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -240,7 +240,7 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, row_wise=False, k=1): D = distance_matrix[i, i + g] # D: a single element if D < P[i, k - 1]: idx = searchsorted(P[i, :k], D) - # to keep the top-k, we need to the get rid of the last element. + # to keep the top-k, we must get rid of the last element. P[i, :k] = np.insert(P[i, :k], idx, D)[:-1] I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] From ad29c19cc83d6388a1caab1136fdb4fbf82596fb Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 12:14:10 -0600 Subject: [PATCH 028/416] Correct format --- tests/naive.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 3028dd15c..849c8d080 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -203,11 +203,11 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, row_wise=False, k=1): for i in range(l): apply_exclusion_zone(distance_matrix[i], i, exclusion_zone, np.inf) - for i, D in enumerate(distance_matrix): # D: distance profile + for i, D in enumerate(distance_matrix): # D: distance profile # self-join / AB-join: matrix proifle and indices indices = np.argsort(D)[:k] P[i, :k] = D[indices] - indices[P[i,:k] == np.inf] = -1 + indices[P[i, :k] == np.inf] = -1 I[i, :k] = indices # self-join: left matrix profile index (top-1) @@ -237,7 +237,7 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, row_wise=False, k=1): iter_range = range(-g, min(n_A - m + 1, n_B - m + 1 - g)) for i in iter_range: - D = distance_matrix[i, i + g] # D: a single element + D = distance_matrix[i, i + g] # D: a single element if D < P[i, k - 1]: idx = searchsorted(P[i, :k], D) # to keep the top-k, we must get rid of the last element. 
From 448d65d69d10c03063c29062cf6c09124281eb78 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 12:35:49 -0600 Subject: [PATCH 029/416] Correct flake8 style --- tests/naive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/naive.py b/tests/naive.py index 849c8d080..dacba3075 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -158,7 +158,7 @@ def stamp(T_A, m, T_B=None, exclusion_zone=None): # pragma: no cover def searchsorted(a, v): """ - naive version of numpy.searchsorted(..., side='right') + Naive version of numpy.searchsorted(..., side='right') """ indices = np.flatnonzero(v < a) if len(indices): From e3ebcb5885085ab25e58ddc98acd8a7bfb7afac0 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 12:46:10 -0600 Subject: [PATCH 030/416] Avoid unnecessary slicing --- tests/naive.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index dacba3075..67d1fb27c 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -239,14 +239,14 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, row_wise=False, k=1): for i in iter_range: D = distance_matrix[i, i + g] # D: a single element if D < P[i, k - 1]: - idx = searchsorted(P[i, :k], D) + idx = searchsorted(P[i], D) # to keep the top-k, we must get rid of the last element. P[i, :k] = np.insert(P[i, :k], idx, D)[:-1] I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] if ignore_trivial: # Self-joins only if D < P[i + g, k - 1]: - idx = searchsorted(P[i + g, :k], D) + idx = searchsorted(P[i + g], D) P[i + g, :k] = np.insert(P[i + g, :k], idx, D)[:-1] I[i + g, :k] = np.insert(I[i + g, :k], idx, i)[:-1] From 3cee5d85749eaa0987697e10e937fe5db65c9604 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 19:28:08 -0600 Subject: [PATCH 031/416] pass parameter k to function stump --- tests/test_stump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_stump.py b/tests/test_stump.py index 783163453..1ce70acc5 100644 --- a/tests/test_stump.py +++ b/tests/test_stump.py @@ -248,7 +248,7 @@ def test_stump_self_join_KNN(T_A, T_B): m = 3 zone = int(np.ceil(m / 4)) ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) - comp_mp = stump(T_B, m, ignore_trivial=True) + comp_mp = stump(T_B, m, ignore_trivial=True, k=k) naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) From a1bc6a4182207f68050da74511d78f46b469b778 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 19:38:44 -0600 Subject: [PATCH 032/416] Add parameter k to function stump --- stumpy/stump.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 97334eb5a..115752113 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -433,7 +433,7 @@ def _stump( @core.non_normalized(aamp) -def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0): +def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): """ Compute the z-normalized matrix profile @@ -467,6 +467,10 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0): The p-norm to apply for computing the Minkowski distance. This parameter is ignored when `normalize == True`. 
+ k : int, default 1 + The number of smallest elements in distance profile that should be stored + for constructing top-k matrix profile + Returns ------- out : numpy.ndarray @@ -587,7 +591,6 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0): l = n_A - m + 1 excl_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) - out = np.empty((l, 4), dtype=object) if ignore_trivial: diags = np.arange(excl_zone + 1, n_A - m + 1, dtype=np.int64) @@ -612,8 +615,9 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0): ignore_trivial, ) - out[:, 0] = P[:, 0] - out[:, 1:] = I + out = np.empty((l, 2 * k + 2), dtype=object) + out[:, :k] = P[:, :k] + out[:, k:] = I threshold = 10e-6 if core.are_distances_too_small(out[:, 0], threshold=threshold): # pragma: no cover From 384690cc6492019d66d8b9104a9297c5a0fbcc11 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 20:21:19 -0600 Subject: [PATCH 033/416] Add parameter k to function _stump --- stumpy/stump.py | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 115752113..bedd5bf6b 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -235,6 +235,7 @@ def _stump( T_B_subseq_isconstant, diags, ignore_trivial, + k, ): """ A Numba JIT-compiled version of STOMPopt with Pearson correlations for parallel @@ -294,6 +295,10 @@ def _stump( Set to `True` if this is a self-join. Otherwise, for AB-join, set this to `False`. Default is `True`. + k : int + The number of smallest elements in distance profile that should be stored + for constructing top-k matrix profile. + Returns ------- profile : numpy.ndarray @@ -353,8 +358,8 @@ def _stump( n_B = T_B.shape[0] l = n_A - m + 1 n_threads = numba.config.NUMBA_NUM_THREADS - ρ = np.full((n_threads, l, 3), -np.inf, dtype=np.float64) - I = np.full((n_threads, l, 3), -1, dtype=np.int64) + ρ = np.full((n_threads, l, k + 2), -np.inf, dtype=np.float64) + I = np.full((n_threads, l, k + 2), -1, dtype=np.int64) ndist_counts = core._count_diagonal_ndist(diags, m, n_A, n_B) diags_ranges = core._get_array_ranges(ndist_counts, n_threads, False) @@ -406,27 +411,18 @@ def _stump( # Reduction of results from all threads for thread_idx in range(1, n_threads): for i in prange(l): - if ρ[0, i, 0] < ρ[thread_idx, i, 0]: - ρ[0, i, 0] = ρ[thread_idx, i, 0] - I[0, i, 0] = I[thread_idx, i, 0] - # left pearson correlation and left matrix profile indices - if ρ[0, i, 1] < ρ[thread_idx, i, 1]: - ρ[0, i, 1] = ρ[thread_idx, i, 1] - I[0, i, 1] = I[thread_idx, i, 1] - # right pearson correlation and right matrix profile indices - if ρ[0, i, 2] < ρ[thread_idx, i, 2]: - ρ[0, i, 2] = ρ[thread_idx, i, 2] - I[0, i, 2] = I[thread_idx, i, 2] + for j in range(k + 2): # alternative: use mask + if ρ[0, i, j] < ρ[thread_idx, i, j]: + ρ[0, i, j] = ρ[thread_idx, i, j] + I[0, i, j] = I[thread_idx, i, j] # Convert pearson correlations to distances p_norm = np.abs(2 * m * (1 - ρ[0, :, :])) for i in prange(p_norm.shape[0]): - if p_norm[i, 0] < config.STUMPY_P_NORM_THRESHOLD: - p_norm[i, 0] = 0.0 - if p_norm[i, 1] < config.STUMPY_P_NORM_THRESHOLD: - p_norm[i, 1] = 0.0 - if p_norm[i, 2] < config.STUMPY_P_NORM_THRESHOLD: - p_norm[i, 2] = 0.0 + for j in range(p_norm.shape[1]): # p_norm.shape[1] is `k + 2` + if p_norm[i, j] < config.STUMPY_P_NORM_THRESHOLD: + p_norm[i, j] = 0.0 + P = np.sqrt(p_norm) return P[:, :], I[0, :, :] @@ -469,7 +465,7 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): k : int, 
default 1 The number of smallest elements in distance profile that should be stored - for constructing top-k matrix profile + for constructing top-k matrix profile. Returns ------- @@ -613,6 +609,7 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): T_B_subseq_isconstant, diags, ignore_trivial, + k, ) out = np.empty((l, 2 * k + 2), dtype=object) From d246736717bac279d87970a8627e3c222d8fefa9 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 20:45:08 -0600 Subject: [PATCH 034/416] Fixed update of top-k rho and indices in _stump --- stumpy/stump.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index bedd5bf6b..cc70e76c4 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -411,15 +411,30 @@ def _stump( # Reduction of results from all threads for thread_idx in range(1, n_threads): for i in prange(l): - for j in range(k + 2): # alternative: use mask - if ρ[0, i, j] < ρ[thread_idx, i, j]: - ρ[0, i, j] = ρ[thread_idx, i, j] - I[0, i, j] = I[thread_idx, i, j] + # top-k + for j in range(k): + if ρ[0, i, k-1] < ρ[thread_idx, i, j]: + idx = k - np.searchsorted( + ρ[0, i, :k][::-1], ρ[thread_idx, i, j] + ) + ρ[0, i, idx + 1 : k] = ρ[0, i, idx : k - 1] + ρ[0, i, idx] = ρ[thread_idx, i, j] + + I[0, i, idx + 1 : k] = I[0, i, idx : k - 1] + I[0, i, idx] = I[thread_idx, i, j] + + if ρ[0, i, k] < ρ[thread_idx, i, k]: + ρ[0, i, k] = ρ[thread_idx, i, k] + I[0, i, k] = I[thread_idx, i, k] + + if ρ[0, i, k + 1] < ρ[thread_idx, i, k + 1]: + ρ[0, i, k + 1] = ρ[thread_idx, i, k + 1] + I[0, i, k + 1] = I[thread_idx, i, k + 1] # Convert pearson correlations to distances p_norm = np.abs(2 * m * (1 - ρ[0, :, :])) for i in prange(p_norm.shape[0]): - for j in range(p_norm.shape[1]): # p_norm.shape[1] is `k + 2` + for j in prange(p_norm.shape[1]): # p_norm.shape[1] is `k + 2` if p_norm[i, j] < config.STUMPY_P_NORM_THRESHOLD: p_norm[i, j] = 0.0 From fdff040c1324fb7c804862a02ee0cf207edad8b4 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 20:59:11 -0600 Subject: [PATCH 035/416] Add parameter k to function _compute_diagonal --- stumpy/stump.py | 68 +++++++++++++++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 25 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index cc70e76c4..f0f09e083 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -125,6 +125,10 @@ def _compute_diagonal( Set to `True` if this is a self-join. Otherwise, for AB-join, set this to `False`. Default is `True`. + k : int + The number of smallest elements in distance profile that should be stored + for constructing top-k matrix profile. 
+ Returns ------- None @@ -154,18 +158,18 @@ def _compute_diagonal( constant = (m - 1) * m_inverse * m_inverse # (m - 1)/(m * m) for diag_idx in range(diags_start_idx, diags_stop_idx): - k = diags[diag_idx] + g = diags[diag_idx] - if k >= 0: - iter_range = range(0, min(n_A - m + 1, n_B - m + 1 - k)) + if g >= 0: + iter_range = range(0, min(n_A - m + 1, n_B - m + 1 - g)) else: - iter_range = range(-k, min(n_A - m + 1, n_B - m + 1 - k)) + iter_range = range(-g, min(n_A - m + 1, n_B - m + 1 - g)) for i in iter_range: - if i == 0 or (k < 0 and i == -k): + if i == 0 or (g < 0 and i == -g): cov = ( np.dot( - (T_B[i + k : i + k + m] - M_T[i + k]), (T_A[i : i + m] - μ_Q[i]) + (T_B[i + g : i + g + m] - M_T[i + g]), (T_A[i : i + m] - μ_Q[i]) ) * m_inverse ) @@ -177,38 +181,51 @@ def _compute_diagonal( # - (T_B[i + k - 1] - M_T_m_1[i + k]) * (T_A[i - 1] - μ_Q_m_1[i]) # ) cov = cov + constant * ( - cov_a[i + k] * cov_b[i] - cov_c[i + k] * cov_d[i] + cov_a[i + g] * cov_b[i] - cov_c[i + g] * cov_d[i] ) - if T_B_subseq_isfinite[i + k] and T_A_subseq_isfinite[i]: + if T_B_subseq_isfinite[i + g] and T_A_subseq_isfinite[i]: # Neither subsequence contains NaNs - if T_B_subseq_isconstant[i + k] or T_A_subseq_isconstant[i]: + if T_B_subseq_isconstant[i + g] or T_A_subseq_isconstant[i]: pearson = 0.5 else: - pearson = cov * Σ_T_inverse[i + k] * σ_Q_inverse[i] + pearson = cov * Σ_T_inverse[i + g] * σ_Q_inverse[i] - if T_B_subseq_isconstant[i + k] and T_A_subseq_isconstant[i]: + if T_B_subseq_isconstant[i + g] and T_A_subseq_isconstant[i]: pearson = 1.0 - if pearson > ρ[thread_idx, i, 0]: - ρ[thread_idx, i, 0] = pearson - I[thread_idx, i, 0] = i + k + if pearson > ρ[thread_idx, i, k - 1]: + idx = k - np.searchsorted( + ρ[thread_idx, i, :k][::-1], pearson + ) + ρ[thread_idx, i, idx + 1 : k] = ρ[thread_idx, i, idx : k - 1] + ρ[thread_idx, i, idx] = pearson + I[thread_idx, i, idx + 1 : k] = I[thread_idx, i, idx : k - 1] + I[thread_idx, i, idx] = i + g if ignore_trivial: # self-joins only - if pearson > ρ[thread_idx, i + k, 0]: - ρ[thread_idx, i + k, 0] = pearson - I[thread_idx, i + k, 0] = i - - if i < i + k: + if pearson > ρ[thread_idx, i + g, k - 1]: + idx = k - np.searchsorted( + ρ[thread_idx, i + g, :k][::-1], pearson + ) + ρ[thread_idx, i + g, idx + 1 : k] = ρ[thread_idx, i + g, idx : k - 1] + ρ[thread_idx, i + g, idx] = pearson + I[thread_idx, i + g, idx + 1 : k] = I[thread_idx, i + g, idx : k - 1] + I[thread_idx, i + g, idx] = i + # for top-1 case: + #ρ[thread_idx, i + g, 0] = pearson + #I[thread_idx, i + g, 0] = i + + if i < i + g: # left pearson correlation and left matrix profile index - if pearson > ρ[thread_idx, i + k, 1]: - ρ[thread_idx, i + k, 1] = pearson - I[thread_idx, i + k, 1] = i + if pearson > ρ[thread_idx, i + g, k]: + ρ[thread_idx, i + g, k] = pearson + I[thread_idx, i + g, k] = i # right pearson correlation and right matrix profile index - if pearson > ρ[thread_idx, i, 2]: - ρ[thread_idx, i, 2] = pearson - I[thread_idx, i, 2] = i + k + if pearson > ρ[thread_idx, i, k + 1]: + ρ[thread_idx, i, k + 1] = pearson + I[thread_idx, i, k + 1] = i + g return @@ -406,6 +423,7 @@ def _stump( ρ, I, ignore_trivial, + k, ) # Reduction of results from all threads From 9d721982f4a10d3e01dbe3fdf0403fb33372aec7 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 21:08:13 -0600 Subject: [PATCH 036/416] consider parameter k in non normalized function, decorator --- stumpy/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/core.py b/stumpy/core.py index a2a30c043..391ce6b57 
100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -121,7 +121,7 @@ def norm_func(Q, T, A_norm=None, other_norm=None, normalize=True, p=2.0): The desired z-normalized/non-normalized function (or class) """ if exclude is None: - exclude = ["normalize", "p"] + exclude = ["normalize", "p", "k"] @functools.wraps(non_norm) def outer_wrapper(norm): From 995559ffe6f49aa20ab71f3b33846b3717ce4e1d Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 21:11:37 -0600 Subject: [PATCH 037/416] Fixed missing input parameter k in function _compute_diagonal --- stumpy/stump.py | 1 + 1 file changed, 1 insertion(+) diff --git a/stumpy/stump.py b/stumpy/stump.py index f0f09e083..45c4e533c 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -42,6 +42,7 @@ def _compute_diagonal( ρ, I, ignore_trivial, + k ): """ Compute (Numba JIT-compiled) and update the Pearson correlation, ρ, and I From a047dd002a93b387f664189ca401405b19fdec4f Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 21:22:13 -0600 Subject: [PATCH 038/416] minor change --- stumpy/stump.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 45c4e533c..5f701b9a5 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -648,10 +648,10 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): out = np.empty((l, 2 * k + 2), dtype=object) out[:, :k] = P[:, :k] - out[:, k:] = I + out[:, k:] = I[:, :] threshold = 10e-6 - if core.are_distances_too_small(out[:, 0], threshold=threshold): # pragma: no cover + if core.are_distances_too_small(out[:, :k].ravel(), threshold=threshold): # pragma: no cover logger.warning(f"A large number of values are smaller than {threshold}.") logger.warning("For a self-join, try setting `ignore_trivial = True`.") From c6370b6da6e438bdd16e4eefffb8a3e4f71a8c93 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 21:50:06 -0600 Subject: [PATCH 039/416] Add verbose --- stumpy/stump.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 5f701b9a5..ce5988662 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -646,7 +646,8 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): k, ) - out = np.empty((l, 2 * k + 2), dtype=object) + out = np.empty((l, (2 * k) + 2), dtype=object) + print(out.shape) out[:, :k] = P[:, :k] out[:, k:] = I[:, :] From 816441596cbc2d1d85454bdbcba939132d1677b2 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 22:03:28 -0600 Subject: [PATCH 040/416] minor changes --- stumpy/stump.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index ce5988662..2ae3046be 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -42,7 +42,7 @@ def _compute_diagonal( ρ, I, ignore_trivial, - k + k, ): """ Compute (Numba JIT-compiled) and update the Pearson correlation, ρ, and I @@ -646,13 +646,12 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): k, ) - out = np.empty((l, (2 * k) + 2), dtype=object) - print(out.shape) + out = np.empty((l, 2 * k + 2), dtype=object) out[:, :k] = P[:, :k] - out[:, k:] = I[:, :] + out[:, k:] = I threshold = 10e-6 - if core.are_distances_too_small(out[:, :k].ravel(), threshold=threshold): # pragma: no cover + if core.are_distances_too_small(out[:, 0], threshold=threshold): # pragma: no cover logger.warning(f"A large number of values are smaller than {threshold}.") logger.warning("For a self-join, try setting 
`ignore_trivial = True`.") From 7007953f700dd41cae95d1ea834d0e5850b245b7 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 22:17:23 -0600 Subject: [PATCH 041/416] Fixed unit test for top-k matrix profile --- tests/test_stump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_stump.py b/tests/test_stump.py index 1ce70acc5..25b9c5283 100644 --- a/tests/test_stump.py +++ b/tests/test_stump.py @@ -253,6 +253,6 @@ def test_stump_self_join_KNN(T_A, T_B): naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) - comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True) + comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True, k=k) naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) From 5b5f21ada054f9d26780199c34f248f034874fe2 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 22:21:13 -0600 Subject: [PATCH 042/416] Remove parameter k in function non_normalized decorator --- stumpy/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/core.py b/stumpy/core.py index 391ce6b57..a2a30c043 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -121,7 +121,7 @@ def norm_func(Q, T, A_norm=None, other_norm=None, normalize=True, p=2.0): The desired z-normalized/non-normalized function (or class) """ if exclude is None: - exclude = ["normalize", "p", "k"] + exclude = ["normalize", "p"] @functools.wraps(non_norm) def outer_wrapper(norm): From f7ee854f733eba01412ed17f6a3cdf8f747d842a Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 9 May 2022 22:56:39 -0600 Subject: [PATCH 043/416] Corret format by black --- stumpy/stump.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 2ae3046be..eb18b7e8a 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -196,9 +196,7 @@ def _compute_diagonal( pearson = 1.0 if pearson > ρ[thread_idx, i, k - 1]: - idx = k - np.searchsorted( - ρ[thread_idx, i, :k][::-1], pearson - ) + idx = k - np.searchsorted(ρ[thread_idx, i, :k][::-1], pearson) ρ[thread_idx, i, idx + 1 : k] = ρ[thread_idx, i, idx : k - 1] ρ[thread_idx, i, idx] = pearson I[thread_idx, i, idx + 1 : k] = I[thread_idx, i, idx : k - 1] @@ -207,15 +205,19 @@ def _compute_diagonal( if ignore_trivial: # self-joins only if pearson > ρ[thread_idx, i + g, k - 1]: idx = k - np.searchsorted( - ρ[thread_idx, i + g, :k][::-1], pearson + ρ[thread_idx, i + g, :k][::-1], pearson ) - ρ[thread_idx, i + g, idx + 1 : k] = ρ[thread_idx, i + g, idx : k - 1] + ρ[thread_idx, i + g, idx + 1 : k] = ρ[ + thread_idx, i + g, idx : k - 1 + ] ρ[thread_idx, i + g, idx] = pearson - I[thread_idx, i + g, idx + 1 : k] = I[thread_idx, i + g, idx : k - 1] + I[thread_idx, i + g, idx + 1 : k] = I[ + thread_idx, i + g, idx : k - 1 + ] I[thread_idx, i + g, idx] = i # for top-1 case: - #ρ[thread_idx, i + g, 0] = pearson - #I[thread_idx, i + g, 0] = i + # ρ[thread_idx, i + g, 0] = pearson + # I[thread_idx, i + g, 0] = i if i < i + g: # left pearson correlation and left matrix profile index @@ -432,10 +434,8 @@ def _stump( for i in prange(l): # top-k for j in range(k): - if ρ[0, i, k-1] < ρ[thread_idx, i, j]: - idx = k - np.searchsorted( - ρ[0, i, :k][::-1], ρ[thread_idx, i, j] - ) + if ρ[0, i, k - 1] < ρ[thread_idx, i, j]: + idx = k - np.searchsorted(ρ[0, i, :k][::-1], ρ[thread_idx, i, j]) ρ[0, i, idx + 1 : k] = ρ[0, i, idx : k - 1] ρ[0, i, idx] = ρ[thread_idx, i, j] @@ -453,7 +453,7 @@ def _stump( # Convert pearson correlations to distances p_norm = np.abs(2 * m * (1 - ρ[0, 
:, :])) for i in prange(p_norm.shape[0]): - for j in prange(p_norm.shape[1]): # p_norm.shape[1] is `k + 2` + for j in prange(p_norm.shape[1]): # p_norm.shape[1] is `k + 2` if p_norm[i, j] < config.STUMPY_P_NORM_THRESHOLD: p_norm[i, j] = 0.0 From 485dba3da38398f27b237142f29adebd870ac003 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 12 May 2022 17:45:26 -0600 Subject: [PATCH 044/416] Use seperate variaboles for left and right profiles --- stumpy/stump.py | 87 +++++++++++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 36 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index eb18b7e8a..9921a5e7c 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -41,6 +41,10 @@ def _compute_diagonal( thread_idx, ρ, I, + ρL, + IL, + ρR, + IR, ignore_trivial, k, ): @@ -221,14 +225,14 @@ def _compute_diagonal( if i < i + g: # left pearson correlation and left matrix profile index - if pearson > ρ[thread_idx, i + g, k]: - ρ[thread_idx, i + g, k] = pearson - I[thread_idx, i + g, k] = i + if pearson > ρL[thread_idx, i + g]: + ρL[thread_idx, i + g] = pearson + IL[thread_idx, i + g] = i # right pearson correlation and right matrix profile index - if pearson > ρ[thread_idx, i, k + 1]: - ρ[thread_idx, i, k + 1] = pearson - I[thread_idx, i, k + 1] = i + g + if pearson > ρR[thread_idx, i]: + ρR[thread_idx, i] = pearson + IR[thread_idx, i] = i + g return @@ -378,8 +382,15 @@ def _stump( n_B = T_B.shape[0] l = n_A - m + 1 n_threads = numba.config.NUMBA_NUM_THREADS - ρ = np.full((n_threads, l, k + 2), -np.inf, dtype=np.float64) - I = np.full((n_threads, l, k + 2), -1, dtype=np.int64) + + ρ = np.full((n_threads, l, k), -np.inf, dtype=np.float64) + I = np.full((n_threads, l, k), -1, dtype=np.int64) + + ρL = np.full((n_threads, l), -np.inf, dtype=np.float64) + IL = np.full((n_threads, l), -1, dtype=np.float64) + + ρR = np.full((n_threads, l), -np.inf, dtype=np.float64) + IR = np.full((n_threads, l), -1, dtype=np.float64) ndist_counts = core._count_diagonal_ndist(diags, m, n_A, n_B) diags_ranges = core._get_array_ranges(ndist_counts, n_threads, False) @@ -425,6 +436,10 @@ def _stump( thread_idx, ρ, I, + ρL, + IL, + ρR, + IR, ignore_trivial, k, ) @@ -434,7 +449,7 @@ def _stump( for i in prange(l): # top-k for j in range(k): - if ρ[0, i, k - 1] < ρ[thread_idx, i, j]: + if ρ[0, i, k-1] < ρ[thread_idx, i, j]: idx = k - np.searchsorted(ρ[0, i, :k][::-1], ρ[thread_idx, i, j]) ρ[0, i, idx + 1 : k] = ρ[0, i, idx : k - 1] ρ[0, i, idx] = ρ[thread_idx, i, j] @@ -442,24 +457,24 @@ def _stump( I[0, i, idx + 1 : k] = I[0, i, idx : k - 1] I[0, i, idx] = I[thread_idx, i, j] - if ρ[0, i, k] < ρ[thread_idx, i, k]: - ρ[0, i, k] = ρ[thread_idx, i, k] - I[0, i, k] = I[thread_idx, i, k] + if ρL[0, i] < ρL[thread_idx, i]: + ρL[0, i] = ρL[thread_idx, i] + IL[0, i] = IL[thread_idx, i] - if ρ[0, i, k + 1] < ρ[thread_idx, i, k + 1]: - ρ[0, i, k + 1] = ρ[thread_idx, i, k + 1] - I[0, i, k + 1] = I[thread_idx, i, k + 1] + if ρR[0, i] < ρR[thread_idx, i]: + ρR[0, i] = ρR[thread_idx, i] + IR[0, i] = IR[thread_idx, i] # Convert pearson correlations to distances p_norm = np.abs(2 * m * (1 - ρ[0, :, :])) for i in prange(p_norm.shape[0]): - for j in prange(p_norm.shape[1]): # p_norm.shape[1] is `k + 2` + for j in prange(p_norm.shape[1]): if p_norm[i, j] < config.STUMPY_P_NORM_THRESHOLD: p_norm[i, j] = 0.0 P = np.sqrt(p_norm) - return P[:, :], I[0, :, :] + return P, I[0, :, :], IL[0, :], IR[0, :] @core.non_normalized(aamp) @@ -627,28 +642,28 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): 
else: diags = np.arange(-(n_A - m + 1) + 1, n_B - m + 1, dtype=np.int64) - P, I = _stump( - T_A, - T_B, - m, - M_T, - μ_Q, - Σ_T_inverse, - σ_Q_inverse, - M_T_m_1, - μ_Q_m_1, - T_A_subseq_isfinite, - T_B_subseq_isfinite, - T_A_subseq_isconstant, - T_B_subseq_isconstant, - diags, - ignore_trivial, - k, + P, I, IL, IR = _stump( + T_A, + T_B, + m, + M_T, + μ_Q, + Σ_T_inverse, + σ_Q_inverse, + M_T_m_1, + μ_Q_m_1, + T_A_subseq_isfinite, + T_B_subseq_isfinite, + T_A_subseq_isconstant, + T_B_subseq_isconstant, + diags, + ignore_trivial, + k, ) out = np.empty((l, 2 * k + 2), dtype=object) - out[:, :k] = P[:, :k] - out[:, k:] = I + out[:, :k] = P + out[:, k:] = np.c_[I, IL, IR] threshold = 10e-6 if core.are_distances_too_small(out[:, 0], threshold=threshold): # pragma: no cover From bc133ca638df71c4542b2351e07297b04b8b6269 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 12 May 2022 18:10:06 -0600 Subject: [PATCH 045/416] store top-k rho in ascending order --- stumpy/stump.py | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 9921a5e7c..56b2118ca 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -199,26 +199,26 @@ def _compute_diagonal( if T_B_subseq_isconstant[i + g] and T_A_subseq_isconstant[i]: pearson = 1.0 - if pearson > ρ[thread_idx, i, k - 1]: - idx = k - np.searchsorted(ρ[thread_idx, i, :k][::-1], pearson) - ρ[thread_idx, i, idx + 1 : k] = ρ[thread_idx, i, idx : k - 1] - ρ[thread_idx, i, idx] = pearson - I[thread_idx, i, idx + 1 : k] = I[thread_idx, i, idx : k - 1] - I[thread_idx, i, idx] = i + g + if pearson > ρ[thread_idx, i, 0]: + idx = np.searchsorted(ρ[thread_idx, i], pearson) + ρ[thread_idx, i, : idx - 1] = ρ[thread_idx, i, 1 : idx] + ρ[thread_idx, i, idx - 1] = pearson + + I[thread_idx, i, : idx - 1] = I[thread_idx, i, 1 : idx] + I[thread_idx, i, idx - 1] = i + g if ignore_trivial: # self-joins only - if pearson > ρ[thread_idx, i + g, k - 1]: - idx = k - np.searchsorted( - ρ[thread_idx, i + g, :k][::-1], pearson - ) - ρ[thread_idx, i + g, idx + 1 : k] = ρ[ - thread_idx, i + g, idx : k - 1 + if pearson > ρ[thread_idx, i + g, 0]: + idx = np.searchsorted(ρ[thread_idx, i + g], pearson) + ρ[thread_idx, i + g, : idx - 1] = ρ[ + thread_idx, i + g, 1 : idx ] - ρ[thread_idx, i + g, idx] = pearson - I[thread_idx, i + g, idx + 1 : k] = I[ - thread_idx, i + g, idx : k - 1 + ρ[thread_idx, i + g, idx - 1] = pearson + + I[thread_idx, i + g, : idx - 1] = I[ + thread_idx, i + g, 1 : idx ] - I[thread_idx, i + g, idx] = i + I[thread_idx, i + g, idx - 1] = i # for top-1 case: # ρ[thread_idx, i + g, 0] = pearson # I[thread_idx, i + g, 0] = i @@ -449,13 +449,14 @@ def _stump( for i in prange(l): # top-k for j in range(k): - if ρ[0, i, k-1] < ρ[thread_idx, i, j]: - idx = k - np.searchsorted(ρ[0, i, :k][::-1], ρ[thread_idx, i, j]) - ρ[0, i, idx + 1 : k] = ρ[0, i, idx : k - 1] - ρ[0, i, idx] = ρ[thread_idx, i, j] + j = k - 1 - j + if ρ[0, i, 0] < ρ[thread_idx, i, j]: + idx = np.searchsorted(ρ[0, i], ρ[thread_idx, i, j]) + ρ[0, i, : idx - 1] = ρ[0, i, 1 : idx] + ρ[0, i, idx - 1] = ρ[thread_idx, i, j] - I[0, i, idx + 1 : k] = I[0, i, idx : k - 1] - I[0, i, idx] = I[thread_idx, i, j] + I[0, i, : idx - 1] = I[0, i, 1 : idx] + I[0, i, idx - 1] = I[thread_idx, i, j] if ρL[0, i] < ρL[thread_idx, i]: ρL[0, i] = ρL[thread_idx, i] @@ -474,7 +475,7 @@ def _stump( P = np.sqrt(p_norm) - return P, I[0, :, :], IL[0, :], IR[0, :] + return P[:, ::-1], I[0, :, ::-1], IL[0, :], IR[0, :] @core.non_normalized(aamp) From 
47a61b2f202e3f2864460086ccf92100168b8f1e Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 12 May 2022 18:23:08 -0600 Subject: [PATCH 046/416] Revise docstrings --- stumpy/stump.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 56b2118ca..bdf8c85b7 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -326,12 +326,16 @@ def _stump( Returns ------- profile : numpy.ndarray - Matrix profile + Top-k Matrix profile indices : numpy.ndarray - The first column consists of the matrix profile indices, the second - column consists of the left matrix profile indices, and the third - column consists of the right matrix profile indices. + The top-k matrix profile indices + + left indices : numpy.ndarray + The top-1 left matrix profile indices + + right indices : numpy.ndarray + The top-1 right matrix profile indices Notes ----- @@ -520,10 +524,10 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): Returns ------- out : numpy.ndarray - The first column consists of the matrix profile, the second column - consists of the matrix profile indices, the third column consists of - the left matrix profile indices, and the fourth column consists of - the right matrix profile indices. + The first k columns consists of the top-k matrix profile, the next k columns + consists of their corresponding matrix profile indices, the one before + last column consists of the top-1 left matrix profile indices, and the + last column consists of the top-1 right matrix profile indices. See Also -------- From d4dc04a5caea088cd6a9a619830af7c517f5348d Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 12 May 2022 18:37:27 -0600 Subject: [PATCH 047/416] Correct docstrings --- stumpy/stump.py | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index bdf8c85b7..d49296ac5 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -49,9 +49,9 @@ def _compute_diagonal( k, ): """ - Compute (Numba JIT-compiled) and update the Pearson correlation, ρ, and I - sequentially along individual diagonals using a single thread and avoiding race - conditions + Compute (Numba JIT-compiled) and update the (top-k) Pearson correlation, ρ, and I, + and, the left ρ and the left I, the right ρ and the right I sequentially along + individual diagonals using a single thread and avoiding race conditions. Parameters ---------- @@ -121,10 +121,22 @@ def _compute_diagonal( The thread index ρ : numpy.ndarray - The Pearson correlations + The top-k Pearson correlations, sorted in ascending order per row I : numpy.ndarray - The matrix profile indices + The top-k matrix profile indices + + ρL : numpy.ndarray + The top-1 left Pearson correlations + + IL : numpy.ndarray + The top-1 left matrix profile indices + + ρR : numpy.ndarray + The top-1 left Pearson correlations + + IR : numpy.ndarray + The top-1 right matrix profile indices ignore_trivial : bool Set to `True` if this is a self-join. Otherwise, for AB-join, set this to @@ -263,8 +275,8 @@ def _stump( ): """ A Numba JIT-compiled version of STOMPopt with Pearson correlations for parallel - computation of the matrix profile, matrix profile indices, left matrix profile - indices, and right matrix profile indices. + computation of the top-k matrix profile, top-k matrix profile indices, top-1 + left matrix profile indices, and top-1 right matrix profile indices. 
Parameters ---------- @@ -326,16 +338,16 @@ def _stump( Returns ------- profile : numpy.ndarray - Top-k Matrix profile + Top-k matrix profile indices : numpy.ndarray - The top-k matrix profile indices + Top-k matrix profile indices left indices : numpy.ndarray - The top-1 left matrix profile indices + Top-1 left matrix profile indices right indices : numpy.ndarray - The top-1 right matrix profile indices + Top-1 right matrix profile indices Notes ----- @@ -417,7 +429,8 @@ def _stump( cov_d[:] = cov_d - μ_Q_m_1 for thread_idx in prange(n_threads): - # Compute and update cov, I within a single thread to avoiding race conditions + # Compute and update pearson correlations and matrix profile indices + # within a single thread to avoid race conditions _compute_diagonal( T_A, T_B, From a123540664c93cacc5cf1b006422b42fba9c9069 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 12 May 2022 18:38:47 -0600 Subject: [PATCH 048/416] Correct formats --- stumpy/stump.py | 48 ++++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index d49296ac5..f31d0c0f7 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -213,23 +213,19 @@ def _compute_diagonal( if pearson > ρ[thread_idx, i, 0]: idx = np.searchsorted(ρ[thread_idx, i], pearson) - ρ[thread_idx, i, : idx - 1] = ρ[thread_idx, i, 1 : idx] + ρ[thread_idx, i, : idx - 1] = ρ[thread_idx, i, 1:idx] ρ[thread_idx, i, idx - 1] = pearson - I[thread_idx, i, : idx - 1] = I[thread_idx, i, 1 : idx] + I[thread_idx, i, : idx - 1] = I[thread_idx, i, 1:idx] I[thread_idx, i, idx - 1] = i + g if ignore_trivial: # self-joins only if pearson > ρ[thread_idx, i + g, 0]: idx = np.searchsorted(ρ[thread_idx, i + g], pearson) - ρ[thread_idx, i + g, : idx - 1] = ρ[ - thread_idx, i + g, 1 : idx - ] + ρ[thread_idx, i + g, : idx - 1] = ρ[thread_idx, i + g, 1:idx] ρ[thread_idx, i + g, idx - 1] = pearson - I[thread_idx, i + g, : idx - 1] = I[ - thread_idx, i + g, 1 : idx - ] + I[thread_idx, i + g, : idx - 1] = I[thread_idx, i + g, 1:idx] I[thread_idx, i + g, idx - 1] = i # for top-1 case: # ρ[thread_idx, i + g, 0] = pearson @@ -469,10 +465,10 @@ def _stump( j = k - 1 - j if ρ[0, i, 0] < ρ[thread_idx, i, j]: idx = np.searchsorted(ρ[0, i], ρ[thread_idx, i, j]) - ρ[0, i, : idx - 1] = ρ[0, i, 1 : idx] + ρ[0, i, : idx - 1] = ρ[0, i, 1:idx] ρ[0, i, idx - 1] = ρ[thread_idx, i, j] - I[0, i, : idx - 1] = I[0, i, 1 : idx] + I[0, i, : idx - 1] = I[0, i, 1:idx] I[0, i, idx - 1] = I[thread_idx, i, j] if ρL[0, i] < ρL[thread_idx, i]: @@ -661,22 +657,22 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): diags = np.arange(-(n_A - m + 1) + 1, n_B - m + 1, dtype=np.int64) P, I, IL, IR = _stump( - T_A, - T_B, - m, - M_T, - μ_Q, - Σ_T_inverse, - σ_Q_inverse, - M_T_m_1, - μ_Q_m_1, - T_A_subseq_isfinite, - T_B_subseq_isfinite, - T_A_subseq_isconstant, - T_B_subseq_isconstant, - diags, - ignore_trivial, - k, + T_A, + T_B, + m, + M_T, + μ_Q, + Σ_T_inverse, + σ_Q_inverse, + M_T_m_1, + μ_Q_m_1, + T_A_subseq_isfinite, + T_B_subseq_isfinite, + T_A_subseq_isconstant, + T_B_subseq_isconstant, + diags, + ignore_trivial, + k, ) out = np.empty((l, 2 * k + 2), dtype=object) From 1dff66f983346ae23430f76cf5c1f16b2c46ea98 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 12 May 2022 18:40:00 -0600 Subject: [PATCH 049/416] Full coverage of test_stump unit test From cf48b6961eab3c01180a84a476dcd5e8fcd626ee Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 10:37:44 -0600 Subject: [PATCH 
050/416] Change function considering new input/output structure --- stumpy/scrump.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 002847507..75790c70a 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -609,7 +609,7 @@ def update(self): if self._chunk_idx < self._n_chunks: start_idx, stop_idx = self._chunk_diags_ranges[self._chunk_idx] - P, I = _stump( + P, I, IL, IR = _stump( self._T_A, self._T_B, self._m, @@ -625,8 +625,11 @@ def update(self): self._T_B_subseq_isconstant, self._diags[start_idx:stop_idx], self._ignore_trivial, + k=1, ) + I = np.c_[I, IL, IR] + # Update matrix profile and indices for i in range(self._P.shape[0]): if self._P[i, 0] > P[i, 0]: From 7d16ce6a883b38808a7e6f93c41c82755500465a Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 10:45:24 -0600 Subject: [PATCH 051/416] Add two more outputs returned by _stump --- stumpy/stump.py | 51 +++++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index f31d0c0f7..348085a4e 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -481,14 +481,26 @@ def _stump( # Convert pearson correlations to distances p_norm = np.abs(2 * m * (1 - ρ[0, :, :])) + p_norm_L = np.abs(2 * m * (1 - ρL[0, :])) + p_norm_R = np.abs(2 * m * (1 - ρR[0, :])) + for i in prange(p_norm.shape[0]): for j in prange(p_norm.shape[1]): if p_norm[i, j] < config.STUMPY_P_NORM_THRESHOLD: p_norm[i, j] = 0.0 + if p_norm_L[i] < config.STUMPY_P_NORM_THRESHOLD: + p_norm_L[i] = 0.0 + + if p_norm_R[i] < config.STUMPY_P_NORM_THRESHOLD: + p_norm_R[i] = 0.0 + P = np.sqrt(p_norm) + PL = np.sqrt(p_norm_L) + PR = np.sqrt(p_norm_R) + - return P[:, ::-1], I[0, :, ::-1], IL[0, :], IR[0, :] + return P[:, ::-1], I[0, :, ::-1], PL, IL[0, :], PR, IR[0, :] @core.non_normalized(aamp) @@ -656,26 +668,27 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): else: diags = np.arange(-(n_A - m + 1) + 1, n_B - m + 1, dtype=np.int64) - P, I, IL, IR = _stump( - T_A, - T_B, - m, - M_T, - μ_Q, - Σ_T_inverse, - σ_Q_inverse, - M_T_m_1, - μ_Q_m_1, - T_A_subseq_isfinite, - T_B_subseq_isfinite, - T_A_subseq_isconstant, - T_B_subseq_isconstant, - diags, - ignore_trivial, - k, + P, I, PL, IL, PR, IR = _stump( + T_A, + T_B, + m, + M_T, + μ_Q, + Σ_T_inverse, + σ_Q_inverse, + M_T_m_1, + μ_Q_m_1, + T_A_subseq_isfinite, + T_B_subseq_isfinite, + T_A_subseq_isconstant, + T_B_subseq_isconstant, + diags, + ignore_trivial, + k, ) - out = np.empty((l, 2 * k + 2), dtype=object) + out = np.empty((l, 2 * k + 2), dtype=object) # last two columns are to + # store left and right matrix profile indices out[:, :k] = P out[:, k:] = np.c_[I, IL, IR] From 61d38b6b747ff96820140335163b5d02c76f0eaf Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 10:50:48 -0600 Subject: [PATCH 052/416] Update/Correct docstrings --- stumpy/stump.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 348085a4e..b9743613b 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -133,7 +133,7 @@ def _compute_diagonal( The top-1 left matrix profile indices ρR : numpy.ndarray - The top-1 left Pearson correlations + The top-1 right Pearson correlations IR : numpy.ndarray The top-1 right matrix profile indices @@ -272,7 +272,8 @@ def _stump( """ A Numba JIT-compiled version of STOMPopt with Pearson correlations for parallel computation of the top-k matrix profile, top-k 
matrix profile indices, top-1 - left matrix profile indices, and top-1 right matrix profile indices. + left matrix profile and matrix profile indices, and top-1 right matrix profile + and matrix profile indices. Parameters ---------- @@ -339,9 +340,15 @@ def _stump( indices : numpy.ndarray Top-k matrix profile indices + left profile : numpy.ndarray + Top-1 left matrix profile + left indices : numpy.ndarray Top-1 left matrix profile indices + right profile : numpy.ndarray + Top-1 right matrix profile + right indices : numpy.ndarray Top-1 right matrix profile indices @@ -499,7 +506,6 @@ def _stump( PL = np.sqrt(p_norm_L) PR = np.sqrt(p_norm_R) - return P[:, ::-1], I[0, :, ::-1], PL, IL[0, :], PR, IR[0, :] @@ -546,9 +552,9 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): ------- out : numpy.ndarray The first k columns consists of the top-k matrix profile, the next k columns - consists of their corresponding matrix profile indices, the one before - last column consists of the top-1 left matrix profile indices, and the - last column consists of the top-1 right matrix profile indices. + consists of their corresponding matrix profile indices, the column at + numpy indexing 2k contains top-1 left matrix profile indices and the last + column, at numpy indexing 2k+1, contains top-1 right matrix profile indices. See Also -------- From 1a469a5230720bdc4d86287db174c0196fd9cf8d Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 10:53:16 -0600 Subject: [PATCH 053/416] Correct callee function _stump --- stumpy/scrump.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 75790c70a..df53d8244 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -609,7 +609,7 @@ def update(self): if self._chunk_idx < self._n_chunks: start_idx, stop_idx = self._chunk_diags_ranges[self._chunk_idx] - P, I, IL, IR = _stump( + P, I, PL, IL, PR, IR = _stump( self._T_A, self._T_B, self._m, @@ -628,8 +628,9 @@ def update(self): k=1, ) + P = np.c_[P, PL, PR] I = np.c_[I, IL, IR] - + # Update matrix profile and indices for i in range(self._P.shape[0]): if self._P[i, 0] > P[i, 0]: From 2149abf0f4d2b0f109246b1a90d1106fa4d76f89 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 10:53:58 -0600 Subject: [PATCH 054/416] Fix format --- stumpy/stump.py | 34 +++++++++++++++++----------------- stumpy/test_stump.py | 0 2 files changed, 17 insertions(+), 17 deletions(-) create mode 100644 stumpy/test_stump.py diff --git a/stumpy/stump.py b/stumpy/stump.py index b9743613b..cb10e65c4 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -675,25 +675,25 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): diags = np.arange(-(n_A - m + 1) + 1, n_B - m + 1, dtype=np.int64) P, I, PL, IL, PR, IR = _stump( - T_A, - T_B, - m, - M_T, - μ_Q, - Σ_T_inverse, - σ_Q_inverse, - M_T_m_1, - μ_Q_m_1, - T_A_subseq_isfinite, - T_B_subseq_isfinite, - T_A_subseq_isconstant, - T_B_subseq_isconstant, - diags, - ignore_trivial, - k, + T_A, + T_B, + m, + M_T, + μ_Q, + Σ_T_inverse, + σ_Q_inverse, + M_T_m_1, + μ_Q_m_1, + T_A_subseq_isfinite, + T_B_subseq_isfinite, + T_A_subseq_isconstant, + T_B_subseq_isconstant, + diags, + ignore_trivial, + k, ) - out = np.empty((l, 2 * k + 2), dtype=object) # last two columns are to + out = np.empty((l, 2 * k + 2), dtype=object) # last two columns are to # store left and right matrix profile indices out[:, :k] = P out[:, k:] = np.c_[I, IL, IR] diff --git a/stumpy/test_stump.py 
b/stumpy/test_stump.py new file mode 100644 index 000000000..e69de29bb From 364f280d7a4db08ede32151b201e856d344bdef6 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 11:19:02 -0600 Subject: [PATCH 055/416] Fixed number of inputs passed to _stump --- stumpy/stumped.py | 1 + 1 file changed, 1 insertion(+) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 09557e318..7f1f67e51 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -248,6 +248,7 @@ def stumped(dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, T_B_subseq_isconstant_future, diags_futures[i], ignore_trivial, + 1, ) ) From e983e1fbda3ca017d453a2acb97d997314ad9a70 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 11:29:00 -0600 Subject: [PATCH 056/416] Fixed number of outputs returned by the function --- stumpy/stumped.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 7f1f67e51..db30eea59 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -253,7 +253,11 @@ def stumped(dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, ) results = dask_client.gather(futures) - profile, indices = results[0] + profile, indices, profile_L, indices_L, profile_R, indices_R = results[0] + + profile = np.c_[profile, profile_L, profile_R] + indices = np.c_[indices, indices_L, indices_R] + for i in range(1, len(hosts)): P, I = results[i] for col in range(P.shape[1]): # pragma: no cover From ef2bc6578bfb4f7e04c74dcda3563d32fd76497a Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 11:36:56 -0600 Subject: [PATCH 057/416] Fixed number of returned outputs --- stumpy/stumped.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index db30eea59..1fbd7be49 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -259,7 +259,9 @@ def stumped(dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, indices = np.c_[indices, indices_L, indices_R] for i in range(1, len(hosts)): - P, I = results[i] + P, I, PL, IL, PR, IR = results[i] + P = np.c_[P, PL, PR] + I = np.c_[I, IL, IR] for col in range(P.shape[1]): # pragma: no cover cond = P[:, col] < profile[:, col] profile[:, col] = np.where(cond, P[:, col], profile[:, col]) From f7d4a8fcd298600c7a51fe8178020a675b349349 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 11:39:31 -0600 Subject: [PATCH 058/416] Correct format --- stumpy/stumped.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 1fbd7be49..6ca40707c 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -253,7 +253,7 @@ def stumped(dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, ) results = dask_client.gather(futures) - profile, indices, profile_L, indices_L, profile_R, indices_R = results[0] + profile, indices, profile_L, indices_L, profile_R, indices_R = results[0] profile = np.c_[profile, profile_L, profile_R] indices = np.c_[indices, indices_L, indices_R] From 3dccc9a244797c3324cfef54a0b3e1d07c36d6e5 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 12:21:00 -0600 Subject: [PATCH 059/416] Exclude parameter 'k' in non-normalized decorator After updating non-normalized functions to return top-k matrix profile, the parameter "k" will be removed from such exclusion. 
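
For context, here is a minimal sketch of the dispatch pattern involved (hypothetical names and deliberately simplified behavior, not the actual `core.non_normalized` decorator): when `normalize=False`, the wrapper hands the call to the non-normalized fallback and first drops any keyword listed in `exclude` that the fallback does not yet accept. Listing "k" in that list keeps the wrapped call from forwarding `k` to `aamp` before `aamp` supports it.

    import functools

    def non_normalized_sketch(non_norm, exclude=("k",)):
        # Hypothetical, stripped-down stand-in for the real decorator.
        def decorator(norm):
            @functools.wraps(norm)
            def wrapper(*args, normalize=True, **kwargs):
                if normalize:
                    return norm(*args, **kwargs)
                for name in exclude:
                    # Drop keywords the non-normalized fallback cannot take yet
                    kwargs.pop(name, None)
                return non_norm(*args, **kwargs)
            return wrapper
        return decorator
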
--- stumpy/core.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stumpy/core.py b/stumpy/core.py index a2a30c043..9c4296ab9 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -121,7 +121,8 @@ def norm_func(Q, T, A_norm=None, other_norm=None, normalize=True, p=2.0): The desired z-normalized/non-normalized function (or class) """ if exclude is None: - exclude = ["normalize", "p"] + exclude = ["normalize", "p", "k"] # remove "k" after updating + # non-normalized function to accept "k" for top-k matrix profile @functools.wraps(non_norm) def outer_wrapper(norm): From a430364aa2cfc77263f7328386dc5c9ea0048945 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 12:25:17 -0600 Subject: [PATCH 060/416] Correct format --- stumpy/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 9c4296ab9..f9a77a07f 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -121,8 +121,8 @@ def norm_func(Q, T, A_norm=None, other_norm=None, normalize=True, p=2.0): The desired z-normalized/non-normalized function (or class) """ if exclude is None: - exclude = ["normalize", "p", "k"] # remove "k" after updating - # non-normalized function to accept "k" for top-k matrix profile + exclude = ["normalize", "p", "k"] # remove "k" after updating + # non-normalized function to accept "k" for top-k matrix profile @functools.wraps(non_norm) def outer_wrapper(norm): From 4f0194384b38e38a6b76e949d6aac0bd06fa441f Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 15:17:22 -0600 Subject: [PATCH 061/416] Fixed dtype of matrix profile indices --- stumpy/stump.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index cb10e65c4..683194e9b 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -406,10 +406,10 @@ def _stump( I = np.full((n_threads, l, k), -1, dtype=np.int64) ρL = np.full((n_threads, l), -np.inf, dtype=np.float64) - IL = np.full((n_threads, l), -1, dtype=np.float64) + IL = np.full((n_threads, l), -1, dtype=np.int64) ρR = np.full((n_threads, l), -np.inf, dtype=np.float64) - IR = np.full((n_threads, l), -1, dtype=np.float64) + IR = np.full((n_threads, l), -1, dtype=np.int64) ndist_counts = core._count_diagonal_ndist(diags, m, n_A, n_B) diags_ranges = core._get_array_ranges(ndist_counts, n_threads, False) From aebe5a31920fed46be8cac8f46c50cbc58315e0c Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 16:33:19 -0600 Subject: [PATCH 062/416] Add pagam no cover --- tests/naive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/naive.py b/tests/naive.py index 67d1fb27c..b2d8894f7 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -164,7 +164,7 @@ def searchsorted(a, v): if len(indices): return indices.min() else: - return len(a) + return len(a) # pragma: no cover def stump(T_A, m, T_B=None, exclusion_zone=None, row_wise=False, k=1): From de295af807c8b114cdce77ee254e62ed34bcf485 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 13 May 2022 16:37:23 -0600 Subject: [PATCH 063/416] Minor change --- tests/naive.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index b2d8894f7..4a5ed789a 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -163,8 +163,8 @@ def searchsorted(a, v): indices = np.flatnonzero(v < a) if len(indices): return indices.min() - else: - return len(a) # pragma: no cover + else: # pragma: no cover + return len(a) def stump(T_A, m, T_B=None, exclusion_zone=None, 
row_wise=False, k=1): From 1d35aea6326fab28d4d099d1b6e40db7d4fd037c Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 15 May 2022 21:48:13 -0600 Subject: [PATCH 064/416] Use range to move in reverse --- stumpy/stump.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 683194e9b..6f47fe698 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -468,8 +468,7 @@ def _stump( for thread_idx in range(1, n_threads): for i in prange(l): # top-k - for j in range(k): - j = k - 1 - j + for j in range(k - 1, -1, -1): if ρ[0, i, 0] < ρ[thread_idx, i, j]: idx = np.searchsorted(ρ[0, i], ρ[thread_idx, i, j]) ρ[0, i, : idx - 1] = ρ[0, i, 1:idx] From e817e5f0dd1316105b93a96d9be28b659a58367d Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 17 May 2022 21:02:08 -0600 Subject: [PATCH 065/416] Remove a wrongly created file --- stumpy/test_stump.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 stumpy/test_stump.py diff --git a/stumpy/test_stump.py b/stumpy/test_stump.py deleted file mode 100644 index e69de29bb..000000000 From c1e39256972a03f0ee1b014e1b8e20efa2d811ba Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 17 May 2022 21:04:59 -0600 Subject: [PATCH 066/416] Remove parameter k in non normalized decorator --- stumpy/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index f9a77a07f..753b0affa 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -121,9 +121,9 @@ def norm_func(Q, T, A_norm=None, other_norm=None, normalize=True, p=2.0): The desired z-normalized/non-normalized function (or class) """ if exclude is None: - exclude = ["normalize", "p", "k"] # remove "k" after updating - # non-normalized function to accept "k" for top-k matrix profile + exclude = ["normalize", "p"] + @functools.wraps(non_norm) def outer_wrapper(norm): @functools.wraps(norm) From aa08176e4cc1ecd90dc47e3ef851103088136a11 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 17 May 2022 21:09:58 -0600 Subject: [PATCH 067/416] Add parameter k to arguments of non normalized function Temporarily, the parameter k is added to the arguments of non-normalized function `aamp` so that the tests can be passed for now. This will be handled after completing the normalized version `stump`. 
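
A tiny illustration of the failure mode this stop-gap avoids once wrapper code starts forwarding `k` along with the other keywords (toy functions with hypothetical names, not the real `aamp` signature):

    def old_aamp(T, m, p=2.0):  # no `k` parameter yet
        return None

    def new_aamp(T, m, p=2.0, k=1):  # placeholder `k`, currently ignored
        return None

    kwargs = {"p": 2.0, "k": 3}
    # old_aamp([0.0, 1.0, 2.0], 2, **kwargs)  # TypeError: unexpected keyword argument 'k'
    new_aamp([0.0, 1.0, 2.0], 2, **kwargs)  # accepted; `k` is simply unused for now
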
--- stumpy/aamp.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stumpy/aamp.py b/stumpy/aamp.py index 201e4413b..b00c8cbf1 100644 --- a/stumpy/aamp.py +++ b/stumpy/aamp.py @@ -240,7 +240,8 @@ def _aamp( return np.power(P[0, :, :], 1.0 / p), I[0, :, :] -def aamp(T_A, m, T_B=None, ignore_trivial=True, p=2.0): +def aamp(T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): # k=1 is temporary + # and this function needs to be changed to return top-k """ Compute the non-normalized (i.e., without z-normalization) matrix profile From 37a9f2c91979fbd2db35d27d4c946eb1ca31c08f Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 17 May 2022 22:12:34 -0600 Subject: [PATCH 068/416] Replace numpy c_ with column_stack --- stumpy/scrump.py | 4 ++-- stumpy/stump.py | 2 +- stumpy/stumped.py | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index df53d8244..25c4e4e3f 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -628,8 +628,8 @@ def update(self): k=1, ) - P = np.c_[P, PL, PR] - I = np.c_[I, IL, IR] + P = np.column_stack((P, PL, PR)) + I = np.column_stack((I, IL, IR)) # Update matrix profile and indices for i in range(self._P.shape[0]): diff --git a/stumpy/stump.py b/stumpy/stump.py index 6f47fe698..449c35200 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -695,7 +695,7 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): out = np.empty((l, 2 * k + 2), dtype=object) # last two columns are to # store left and right matrix profile indices out[:, :k] = P - out[:, k:] = np.c_[I, IL, IR] + out[:, k:] = np.column_stack((I, IL, IR)) threshold = 10e-6 if core.are_distances_too_small(out[:, 0], threshold=threshold): # pragma: no cover diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 6ca40707c..0c1c34e07 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -255,13 +255,13 @@ def stumped(dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, results = dask_client.gather(futures) profile, indices, profile_L, indices_L, profile_R, indices_R = results[0] - profile = np.c_[profile, profile_L, profile_R] - indices = np.c_[indices, indices_L, indices_R] + profile = np.column_stack((profile, profile_L, profile_R)) + indices = np.column_stack((indices, indices_L, indices_R)) for i in range(1, len(hosts)): P, I, PL, IL, PR, IR = results[i] - P = np.c_[P, PL, PR] - I = np.c_[I, IL, IR] + P = np.column_stack((P, PL, PR)) + I = np.column_stack((I, IL, IR)) for col in range(P.shape[1]): # pragma: no cover cond = P[:, col] < profile[:, col] profile[:, col] = np.where(cond, P[:, col], profile[:, col]) From 8c0e76ecd2eeea875fb7c80eb4cfe5703740a333 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 17 May 2022 23:29:21 -0600 Subject: [PATCH 069/416] Minor changes - Improve docstrings - Reverse rho and I before return - Improve comments --- stumpy/stump.py | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 449c35200..6fe2b7e41 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -49,9 +49,9 @@ def _compute_diagonal( k, ): """ - Compute (Numba JIT-compiled) and update the (top-k) Pearson correlation, ρ, and I, - and, the left ρ and the left I, the right ρ and the right I sequentially along - individual diagonals using a single thread and avoiding race conditions. 
+ Compute (Numba JIT-compiled) and update the (top-k) Pearson correlation (ρ), + ρL, ρR, I, IL, and IR sequentially along individual diagonals using a single + thread and avoiding race conditions. Parameters ---------- @@ -121,10 +121,10 @@ def _compute_diagonal( The thread index ρ : numpy.ndarray - The top-k Pearson correlations, sorted in ascending order per row + The (top-k) Pearson correlations, sorted in ascending order per row I : numpy.ndarray - The top-k matrix profile indices + The (top-k) matrix profile indices ρL : numpy.ndarray The top-1 left Pearson correlations @@ -144,7 +144,7 @@ def _compute_diagonal( k : int The number of smallest elements in distance profile that should be stored - for constructing top-k matrix profile. + for constructing the top-k matrix profile. Returns ------- @@ -227,9 +227,6 @@ def _compute_diagonal( I[thread_idx, i + g, : idx - 1] = I[thread_idx, i + g, 1:idx] I[thread_idx, i + g, idx - 1] = i - # for top-1 case: - # ρ[thread_idx, i + g, 0] = pearson - # I[thread_idx, i + g, 0] = i if i < i + g: # left pearson correlation and left matrix profile index @@ -271,9 +268,9 @@ def _stump( ): """ A Numba JIT-compiled version of STOMPopt with Pearson correlations for parallel - computation of the top-k matrix profile, top-k matrix profile indices, top-1 - left matrix profile and matrix profile indices, and top-1 right matrix profile - and matrix profile indices. + computation of the (top-k) matrix profile, the (top-k) matrix profile indices, + the top-1 left matrix profile and matrix profile indices, and the top-1 right + matrix profile and matrix profile indices. Parameters ---------- @@ -468,7 +465,7 @@ def _stump( for thread_idx in range(1, n_threads): for i in prange(l): # top-k - for j in range(k - 1, -1, -1): + for j in range(k - 1, -1, -1): # reverse iteration to preserve order in ties if ρ[0, i, 0] < ρ[thread_idx, i, j]: idx = np.searchsorted(ρ[0, i], ρ[thread_idx, i, j]) ρ[0, i, : idx - 1] = ρ[0, i, 1:idx] @@ -485,8 +482,12 @@ def _stump( ρR[0, i] = ρR[thread_idx, i] IR[0, i] = IR[thread_idx, i] - # Convert pearson correlations to distances - p_norm = np.abs(2 * m * (1 - ρ[0, :, :])) + # The arrays ρ (and so I) should be reversed since ρ is in ascending order. + ρ = ρ[0, :, ::-1] + I = I[0, :, ::-1] + + # Convert pearson correlations to distances. + p_norm = np.abs(2 * m * (1 - ρ)) p_norm_L = np.abs(2 * m * (1 - ρL[0, :])) p_norm_R = np.abs(2 * m * (1 - ρR[0, :])) @@ -505,7 +506,7 @@ def _stump( PL = np.sqrt(p_norm_L) PR = np.sqrt(p_norm_R) - return P[:, ::-1], I[0, :, ::-1], PL, IL[0, :], PR, IR[0, :] + return P, I, PL, IL[0, :], PR, IR[0, :] @core.non_normalized(aamp) @@ -514,8 +515,8 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): Compute the z-normalized matrix profile This is a convenience wrapper around the Numba JIT-compiled parallelized - `_stump` function which computes the matrix profile according to STOMPopt with - Pearson correlations. + `_stump` function which computes the (top-k) matrix profile according to + STOMPopt with Pearson correlations. Parameters ---------- @@ -545,15 +546,15 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): k : int, default 1 The number of smallest elements in distance profile that should be stored - for constructing top-k matrix profile. + for constructing the top-k matrix profile. 
Returns ------- out : numpy.ndarray - The first k columns consists of the top-k matrix profile, the next k columns - consists of their corresponding matrix profile indices, the column at - numpy indexing 2k contains top-1 left matrix profile indices and the last - column, at numpy indexing 2k+1, contains top-1 right matrix profile indices. + The first k columns contain the top-k matrix profile, the next k columns + contain their corresponding matrix profile indices, the column at + numpy indexing 2k contains the top-1 left matrix profile indices and the last + column, at numpy indexing 2k+1, contains the top-1 right matrix profile indices. See Also -------- From df4c5d1ad8db3109eb8316c99314785cb02f5325 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 17 May 2022 23:47:54 -0600 Subject: [PATCH 070/416] Correct Format --- stumpy/aamp.py | 2 +- stumpy/core.py | 1 - stumpy/stump.py | 6 ++++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/stumpy/aamp.py b/stumpy/aamp.py index b00c8cbf1..807c3164b 100644 --- a/stumpy/aamp.py +++ b/stumpy/aamp.py @@ -240,7 +240,7 @@ def _aamp( return np.power(P[0, :, :], 1.0 / p), I[0, :, :] -def aamp(T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): # k=1 is temporary +def aamp(T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): # k=1 is temporary # and this function needs to be changed to return top-k """ Compute the non-normalized (i.e., without z-normalization) matrix profile diff --git a/stumpy/core.py b/stumpy/core.py index 753b0affa..a2a30c043 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -123,7 +123,6 @@ def norm_func(Q, T, A_norm=None, other_norm=None, normalize=True, p=2.0): if exclude is None: exclude = ["normalize", "p"] - @functools.wraps(non_norm) def outer_wrapper(norm): @functools.wraps(norm) diff --git a/stumpy/stump.py b/stumpy/stump.py index 6fe2b7e41..3e241a11e 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -465,7 +465,9 @@ def _stump( for thread_idx in range(1, n_threads): for i in prange(l): # top-k - for j in range(k - 1, -1, -1): # reverse iteration to preserve order in ties + for j in range( + k - 1, -1, -1 + ): # reverse iteration to preserve order in ties if ρ[0, i, 0] < ρ[thread_idx, i, j]: idx = np.searchsorted(ρ[0, i], ρ[thread_idx, i, j]) ρ[0, i, : idx - 1] = ρ[0, i, 1:idx] @@ -487,7 +489,7 @@ def _stump( I = I[0, :, ::-1] # Convert pearson correlations to distances. - p_norm = np.abs(2 * m * (1 - ρ)) + p_norm = np.abs(2 * m * (1 - ρ)) p_norm_L = np.abs(2 * m * (1 - ρL[0, :])) p_norm_R = np.abs(2 * m * (1 - ρR[0, :])) From c5c881bebc2ebffb9d55a1491ebff6f239b73553 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 00:22:26 -0600 Subject: [PATCH 071/416] minor improvement of docstring --- stumpy/stump.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 3e241a11e..ae6a21a15 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -553,8 +553,8 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): Returns ------- out : numpy.ndarray - The first k columns contain the top-k matrix profile, the next k columns - contain their corresponding matrix profile indices, the column at + The first k columns consist of the top-k matrix profile, the next k columns + consist of their corresponding matrix profile indices, the column at numpy indexing 2k contains the top-1 left matrix profile indices and the last column, at numpy indexing 2k+1, contains the top-1 right matrix profile indices. 
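
An editorial aside between patches: the two building blocks the commits above keep returning to, written out as one self-contained sketch. The helper names, the sample `k`/`m` values, and the small clamping threshold are illustrative assumptions; the real code applies the same steps to per-thread ρ and I arrays inside Numba-compiled loops and uses `config.STUMPY_P_NORM_THRESHOLD`.

    import numpy as np

    def insert_topk_ascending(rho, idx, pearson, j):
        # Keep the k largest Pearson correlations for one subsequence, stored
        # in ascending order: shift the smaller entries one slot to the left
        # and place the new value at its sorted position.
        if pearson > rho[0]:
            pos = np.searchsorted(rho, pearson)
            rho[: pos - 1] = rho[1:pos]
            rho[pos - 1] = pearson
            idx[: pos - 1] = idx[1:pos]
            idx[pos - 1] = j

    def pearson_to_distances(rho, m):
        # z-normalized Euclidean distance from Pearson correlation:
        # d = sqrt(2 * m * (1 - rho)). Ascending rho maps to descending d,
        # so the result is reversed to report the nearest neighbors first.
        p_norm = np.abs(2 * m * (1 - rho))
        p_norm[p_norm < 1e-14] = 0.0  # clamp round-off (stand-in threshold)
        return np.sqrt(p_norm)[::-1]

    k, m = 3, 50
    rho = np.full(k, -np.inf)
    idx = np.full(k, -1, dtype=np.int64)
    for j, r in enumerate([0.2, 0.9, 0.5, 0.95, 0.1]):
        insert_topk_ascending(rho, idx, r, j)
    # rho is now [0.5, 0.9, 0.95]; pearson_to_distances(rho, m) returns the
    # corresponding top-3 distances, smallest first.
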
From d9dcdc037168ef4f7cd4a9ef4cda491a94f24495 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 00:24:49 -0600 Subject: [PATCH 072/416] Add parameter k to the arguments of function the function will be revised to return top-k matrix profile --- stumpy/aamped.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stumpy/aamped.py b/stumpy/aamped.py index d6bf6d97b..c158c9423 100644 --- a/stumpy/aamped.py +++ b/stumpy/aamped.py @@ -12,7 +12,8 @@ logger = logging.getLogger(__name__) -def aamped(dask_client, T_A, m, T_B=None, ignore_trivial=True, p=2.0): +def aamped(dask_client, T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): + # function needs to be revised to return top-k matix profile """ Compute the non-normalized (i.e., without z-normalization) matrix profile From c6b81f0410769cd700cf68dbbb8f473dd50bfabf Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 00:52:31 -0600 Subject: [PATCH 073/416] Add parameter k to arguments Temporarily add parameter k to avoid non-normalized decorator test failure --- stumpy/stumped.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 0c1c34e07..a48f6a957 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -14,7 +14,8 @@ @core.non_normalized(aamped) -def stumped(dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0): +def stumped(dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): + # the function needs to be revisd to return top-k matrix profile """ Compute the z-normalized matrix profile with a distributed dask cluster From 4ffc7fca9733cccb6dddab528a1a5d2ca996089c Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 00:53:55 -0600 Subject: [PATCH 074/416] Correct format --- stumpy/stumped.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index a48f6a957..e922536f3 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -14,7 +14,9 @@ @core.non_normalized(aamped) -def stumped(dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): +def stumped( + dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1 +): # the function needs to be revisd to return top-k matrix profile """ Compute the z-normalized matrix profile with a distributed dask cluster From 102c627f64eb5736f528cc31bba8bb01f8645628 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 02:03:34 -0600 Subject: [PATCH 075/416] Remove parameter k from arguements --- stumpy/aamped.py | 3 +-- stumpy/stumped.py | 4 +--- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/stumpy/aamped.py b/stumpy/aamped.py index c158c9423..d6bf6d97b 100644 --- a/stumpy/aamped.py +++ b/stumpy/aamped.py @@ -12,8 +12,7 @@ logger = logging.getLogger(__name__) -def aamped(dask_client, T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): - # function needs to be revised to return top-k matix profile +def aamped(dask_client, T_A, m, T_B=None, ignore_trivial=True, p=2.0): """ Compute the non-normalized (i.e., without z-normalization) matrix profile diff --git a/stumpy/stumped.py b/stumpy/stumped.py index e922536f3..6cdfc5aed 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -15,9 +15,7 @@ @core.non_normalized(aamped) def stumped( - dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1 -): - # the function needs to be revisd to return top-k matrix profile + dask_client, T_A, m, T_B=None, 
ignore_trivial=True, normalize=True, p=2.0): """ Compute the z-normalized matrix profile with a distributed dask cluster From a37f793306d54123af0660c428ec845a880b3930 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 02:24:53 -0600 Subject: [PATCH 076/416] Add one new unit test --- tests/test_stumped.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/test_stumped.py b/tests/test_stumped.py index ca53829fc..02e914436 100644 --- a/tests/test_stumped.py +++ b/tests/test_stumped.py @@ -608,3 +608,20 @@ def test_stumped_two_subsequences_nan_inf_A_B_join_swap( naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) + + +@pytest.mark.filterwarnings("ignore:numpy.dtype size changed") +@pytest.mark.filterwarnings("ignore:numpy.ufunc size changed") +@pytest.mark.filterwarnings("ignore:numpy.ndarray size changed") +@pytest.mark.filterwarnings("ignore:\\s+Port 8787 is already in use:UserWarning") +@pytest.mark.parametrize("T_A, T_B", test_data) +def test_stumped_self_join_KNN(T_A, T_B, dask_cluster): + with Client(dask_cluster) as dask_client: + k = 3 + m = 3 + zone = int(np.ceil(m / 4)) + ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) + comp_mp = stumped(dask_client, T_B, m, ignore_trivial=True, k=k) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) From 0755af4ddcdf5ad5a331a1b535af53f879dfc160 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 11:31:12 -0600 Subject: [PATCH 077/416] Add parameter k=1 to arguments This is to avoid unit test failure in non-normalized decorator. After finalizing the normalized function, the non normalized functions will be revised to return top-k matrix profile. --- stumpy/aamped.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stumpy/aamped.py b/stumpy/aamped.py index d6bf6d97b..d833ee8b3 100644 --- a/stumpy/aamped.py +++ b/stumpy/aamped.py @@ -12,7 +12,8 @@ logger = logging.getLogger(__name__) -def aamped(dask_client, T_A, m, T_B=None, ignore_trivial=True, p=2.0): +def aamped(dask_client, T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): + # function needs to be revised to return top-k matrix profile """ Compute the non-normalized (i.e., without z-normalization) matrix profile From ca9fdcffcf94d5f0541b74e845fc5e11ee9481ae Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 14:16:06 -0600 Subject: [PATCH 078/416] Revise stumped to return top-k matrix profile --- stumpy/stumped.py | 50 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 6cdfc5aed..2b826ba71 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -15,7 +15,7 @@ @core.non_normalized(aamped) def stumped( - dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0): + dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): """ Compute the z-normalized matrix profile with a distributed dask cluster @@ -55,6 +55,10 @@ def stumped( The p-norm to apply for computing the Minkowski distance. This parameter is ignored when `normalize == True`. + k : int + The number of smallest elements in distance profile that should be stored + for constructing the top-k matrix profile. 
+ Returns ------- out : numpy.ndarray @@ -184,7 +188,6 @@ def stumped( l = n_A - m + 1 excl_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) - out = np.empty((l, 4), dtype=object) hosts = list(dask_client.ncores().keys()) nworkers = len(hosts) @@ -249,27 +252,44 @@ def stumped( T_B_subseq_isconstant_future, diags_futures[i], ignore_trivial, - 1, + k, ) ) results = dask_client.gather(futures) profile, indices, profile_L, indices_L, profile_R, indices_R = results[0] - profile = np.column_stack((profile, profile_L, profile_R)) - indices = np.column_stack((indices, indices_L, indices_R)) - for i in range(1, len(hosts)): P, I, PL, IL, PR, IR = results[i] - P = np.column_stack((P, PL, PR)) - I = np.column_stack((I, IL, IR)) - for col in range(P.shape[1]): # pragma: no cover - cond = P[:, col] < profile[:, col] - profile[:, col] = np.where(cond, P[:, col], profile[:, col]) - indices[:, col] = np.where(cond, I[:, col], indices[:, col]) - - out[:, 0] = profile[:, 0] - out[:, 1:4] = indices + # Update top-k matrix profile, alternative approach: + # np.argsort(np.concatenate(profile, P), kind='mergesort') + prof = profile.copy() + ind = indices.copy() + for j in range(l): + u, w = 0, 0 + for idx in range(k): + if prof[j, u] <= P[j, w]: + profile[j, idx] = prof[j, u] + indices[j, idx] = ind[j, u] + u += 1 + else: + profile[j, idx] = P[j, w] + indices[j, idx] = I[j, w] + w += 1 + + # Update top-1 left matrix profile and matrix profile index + cond = PL < profile_L + profile_L = np.where(cond, PL, profile_L) + indices_L = np.where(cond, IL, indices_L) + + # Update top-1 right matrix profile and matrix profile index + cond = PR < profile_R + profile_R = np.where(cond, PR, profile_R) + indices_R = np.where(cond, IR, indices_R) + + out = np.empty((l, 2 * k + 2), dtype=object) + out[:, :k] = profile + out[:, k:] = np.column_stack((indices, indices_L, indices_R)) # Delete data from Dask cluster dask_client.cancel(T_A_future) From 9408631f397ef6578dc2d21205e44bb5a45c38f6 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 14:16:56 -0600 Subject: [PATCH 079/416] Correct format --- stumpy/stumped.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 2b826ba71..037c4ba52 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -15,7 +15,8 @@ @core.non_normalized(aamped) def stumped( - dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): + dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1 +): """ Compute the z-normalized matrix profile with a distributed dask cluster From 435d9b88ed52bfd8800ff5055375661287b3871d Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 16:19:44 -0600 Subject: [PATCH 080/416] several minor changes --- stumpy/aamp.py | 4 ++-- stumpy/scrump.py | 2 +- stumpy/stump.py | 16 ++++++++-------- stumpy/stumped.py | 12 ++++++------ tests/test_stump.py | 2 +- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/stumpy/aamp.py b/stumpy/aamp.py index 807c3164b..87568f365 100644 --- a/stumpy/aamp.py +++ b/stumpy/aamp.py @@ -240,8 +240,8 @@ def _aamp( return np.power(P[0, :, :], 1.0 / p), I[0, :, :] -def aamp(T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): # k=1 is temporary - # and this function needs to be changed to return top-k +def aamp(T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): + # function needs to be changed to return top-k matrix profile """ Compute the non-normalized (i.e., without z-normalization) matrix profile diff --git 
a/stumpy/scrump.py b/stumpy/scrump.py index 25c4e4e3f..9b26478c2 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -454,6 +454,7 @@ def __init__( s=None, normalize=True, p=2.0, + k=1, # class needs to be revised to return (top-k) matrix profile ): """ Initialize the `scrump` object @@ -625,7 +626,6 @@ def update(self): self._T_B_subseq_isconstant, self._diags[start_idx:stop_idx], self._ignore_trivial, - k=1, ) P = np.column_stack((P, PL, PR)) diff --git a/stumpy/stump.py b/stumpy/stump.py index ae6a21a15..60d965590 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -269,8 +269,8 @@ def _stump( """ A Numba JIT-compiled version of STOMPopt with Pearson correlations for parallel computation of the (top-k) matrix profile, the (top-k) matrix profile indices, - the top-1 left matrix profile and matrix profile indices, and the top-1 right - matrix profile and matrix profile indices. + the top-1 left matrix profile and its matrix profile index, and the top-1 right + matrix profile and its matrix profile index. Parameters ---------- @@ -327,7 +327,7 @@ def _stump( k : int The number of smallest elements in distance profile that should be stored - for constructing top-k matrix profile. + for constructing the top-k matrix profile. Returns ------- @@ -430,7 +430,7 @@ def _stump( for thread_idx in prange(n_threads): # Compute and update pearson correlations and matrix profile indices - # within a single thread to avoid race conditions + # within a single thread and avoiding race conditions _compute_diagonal( T_A, T_B, @@ -484,12 +484,12 @@ def _stump( ρR[0, i] = ρR[thread_idx, i] IR[0, i] = IR[thread_idx, i] - # The arrays ρ (and so I) should be reversed since ρ is in ascending order. - ρ = ρ[0, :, ::-1] + # Convert top-k pearson correlations to distances. The arrays ρ (and so I) should + # be reversed since ρ is in ascending order. + p_norm = np.abs(2 * m * (1 - ρ[0, :, ::-1])) I = I[0, :, ::-1] - # Convert pearson correlations to distances. - p_norm = np.abs(2 * m * (1 - ρ)) + # Convert top-1 left/right pearson correlations to distances. p_norm_L = np.abs(2 * m * (1 - ρL[0, :])) p_norm_R = np.abs(2 * m * (1 - ρR[0, :])) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 037c4ba52..9aa815e6e 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -18,10 +18,10 @@ def stumped( dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1 ): """ - Compute the z-normalized matrix profile with a distributed dask cluster + Compute the z-normalized (top-k) matrix profile with a distributed dask cluster This is a highly distributed implementation around the Numba JIT-compiled - parallelized `_stump` function which computes the matrix profile according + parallelized `_stump` function which computes the (top-k) matrix profile according to STOMPopt with Pearson correlations. Parameters @@ -63,10 +63,10 @@ def stumped( Returns ------- out : numpy.ndarray - The first column consists of the matrix profile, the second column - consists of the matrix profile indices, the third column consists of - the left matrix profile indices, and the fourth column consists of - the right matrix profile indices. + The first k columns consist of the top-k matrix profile, the next k columns + consist of their corresponding matrix profile indices, the column at + numpy indexing 2k contains the top-1 left matrix profile indices and the last + column, at numpy indexing 2k+1, contains the top-1 right matrix profile indices. 
See Also -------- diff --git a/tests/test_stump.py b/tests/test_stump.py index 25b9c5283..af2a2315e 100644 --- a/tests/test_stump.py +++ b/tests/test_stump.py @@ -244,7 +244,7 @@ def test_stump_nan_zero_mean_self_join(): @pytest.mark.parametrize("T_A, T_B", test_data) def test_stump_self_join_KNN(T_A, T_B): - k = 2 + k = 3 m = 3 zone = int(np.ceil(m / 4)) ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) From c6580c8a8dc1d2cdc49ca4724c16d0649ed95028 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 16:20:45 -0600 Subject: [PATCH 081/416] Correct Format --- stumpy/scrump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 9b26478c2..6a4f7b534 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -454,7 +454,7 @@ def __init__( s=None, normalize=True, p=2.0, - k=1, # class needs to be revised to return (top-k) matrix profile + k=1, # class needs to be revised to return (top-k) matrix profile ): """ Initialize the `scrump` object From e4b0473e0fa38f696a84aac3f2da9938eaeb198d Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 16:26:25 -0600 Subject: [PATCH 082/416] Remove k from arguments --- stumpy/scrump.py | 1 - 1 file changed, 1 deletion(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 6a4f7b534..e62658fc9 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -454,7 +454,6 @@ def __init__( s=None, normalize=True, p=2.0, - k=1, # class needs to be revised to return (top-k) matrix profile ): """ Initialize the `scrump` object From 8bf05ee3d7534488f0769ff2b4bf95eb1f818fc7 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 16:33:27 -0600 Subject: [PATCH 083/416] Pass 1 as value of parameter k to a class method to avoid unit test failure --- stumpy/scrump.py | 1 + 1 file changed, 1 insertion(+) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index e62658fc9..ea8808696 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -625,6 +625,7 @@ def update(self): self._T_B_subseq_isconstant, self._diags[start_idx:stop_idx], self._ignore_trivial, + 1 # revise module to accept parameter k for top-k matrix profile ) P = np.column_stack((P, PL, PR)) From f12261cafd9637a1253444d0c321f61c8ee59b23 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 16:34:42 -0600 Subject: [PATCH 084/416] Pass 1 as the value of parameter k to avoid unit test failure --- stumpy/scrump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index ea8808696..9fcb51e4b 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -625,7 +625,7 @@ def update(self): self._T_B_subseq_isconstant, self._diags[start_idx:stop_idx], self._ignore_trivial, - 1 # revise module to accept parameter k for top-k matrix profile + 1, # revise module to accept parameter k for top-k matrix profile ) P = np.column_stack((P, PL, PR)) From 695343e4e7ff927b1793de418bc0b2d3dc45b5df Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 18 May 2022 22:38:25 -0600 Subject: [PATCH 085/416] Use np searchsort to avoid copying arrays into new memory --- stumpy/stumped.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 9aa815e6e..f6932325b 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -262,21 +262,18 @@ def stumped( for i in range(1, len(hosts)): P, I, PL, IL, PR, IR = results[i] - # Update top-k matrix profile, alternative approach: - # np.argsort(np.concatenate(profile, P), kind='mergesort') - 
prof = profile.copy() - ind = indices.copy() for j in range(l): - u, w = 0, 0 - for idx in range(k): - if prof[j, u] <= P[j, w]: - profile[j, idx] = prof[j, u] - indices[j, idx] = ind[j, u] - u += 1 - else: - profile[j, idx] = P[j, w] - indices[j, idx] = I[j, w] - w += 1 + for j in range(l): + # Update profile[j] + for D, ind in zip(P[j], I[j]): + if D >= profile[j, -1]: + break # no need to update profile[j] from this point. + idx = np.searchsorted(profile[j], D, side="right") # might be optimized # with help of checkpoint idx from previous iteration.
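# In isolation, the insert step being built here (searchsorted followed by the
# shifts that follow in this diff) behaves like the standalone sketch below;
# `row_P`, `row_I`, `D`, and `nn_i` are illustrative names, not variables from
# this module:
import numpy as np

row_P = np.array([1.0, 2.0, np.inf])   # one row of the top-k profile, kept ascending
row_I = np.array([7, 11, -1])          # its matrix profile indices
D, nn_i = 1.5, 42                      # candidate distance and its neighbor index
if D < row_P[-1]:
    idx = np.searchsorted(row_P, D, side="right")  # "right" keeps earlier (tied) entries first
    row_P[idx + 1 :] = row_P[idx:-1]   # shift larger entries down (needs .copy() under numba)
    row_I[idx + 1 :] = row_I[idx:-1]
    row_P[idx], row_I[idx] = D, nn_i   # row_P -> [1.0, 1.5, 2.0], row_I -> [7, 42, 11]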
- profile[j, idx + 1 :] = profile[j, idx : k - 1] - profile[j, idx] = D - - indices[j, idx + 1 :] = indices[j, idx : k - 1] - indices[j, idx] = ind + + profile[:, k:] = P + indices[:, k:] = I + idx = np.argsort(profile, axis=1) + profile = np.take_along_axis(profile, idx, axis=1) + indices = np.take_along_axis(indices, idx, axis=1) # Update top-1 left matrix profile and matrix profile index cond = PL < profile_L @@ -286,8 +290,8 @@ def stumped( indices_R = np.where(cond, IR, indices_R) out = np.empty((l, 2 * k + 2), dtype=object) - out[:, :k] = profile - out[:, k:] = np.column_stack((indices, indices_L, indices_R)) + out[:, :k] = profile[:, :k] + out[:, k:] = np.column_stack((indices[:, :k], indices_L, indices_R)) # Delete data from Dask cluster dask_client.cancel(T_A_future) From cc9c0769fde6e270ff903d69459e241207e57da2 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 20 May 2022 18:28:46 -0600 Subject: [PATCH 088/416] Change the order of some variables in inputs and outputs --- stumpy/stump.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 60d965590..9f37edc8b 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -40,10 +40,10 @@ def _compute_diagonal( diags_stop_idx, thread_idx, ρ, - I, ρL, - IL, ρR, + I, + IL, IR, ignore_trivial, k, @@ -123,18 +123,18 @@ def _compute_diagonal( ρ : numpy.ndarray The (top-k) Pearson correlations, sorted in ascending order per row - I : numpy.ndarray - The (top-k) matrix profile indices - ρL : numpy.ndarray The top-1 left Pearson correlations - IL : numpy.ndarray - The top-1 left matrix profile indices - ρR : numpy.ndarray The top-1 right Pearson correlations + I : numpy.ndarray + The (top-k) matrix profile indices + + IL : numpy.ndarray + The top-1 left matrix profile indices + IR : numpy.ndarray The top-1 right matrix profile indices @@ -452,10 +452,10 @@ def _stump( diags_ranges[thread_idx, 1], thread_idx, ρ, - I, ρL, - IL, ρR, + I, + IL, IR, ignore_trivial, k, @@ -508,7 +508,7 @@ def _stump( PL = np.sqrt(p_norm_L) PR = np.sqrt(p_norm_R) - return P, I, PL, IL[0, :], PR, IR[0, :] + return P, PL, PR, I, IL[0, :], IR[0, :] @core.non_normalized(aamp) @@ -676,7 +676,7 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): else: diags = np.arange(-(n_A - m + 1) + 1, n_B - m + 1, dtype=np.int64) - P, I, PL, IL, PR, IR = _stump( + P, PL, PR, I, IL, IR = _stump( T_A, T_B, m, From a4d456691dacf788739db9dfdf3796ddc568f794 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 20 May 2022 18:47:50 -0600 Subject: [PATCH 089/416] Revise docstrings and comments --- stumpy/aamp.py | 4 ++++ stumpy/aamped.py | 4 ++++ stumpy/stump.py | 42 +++++++++++++++++++++++------------------- stumpy/stumped.py | 19 ++++++++++++------- 4 files changed, 43 insertions(+), 26 deletions(-) diff --git a/stumpy/aamp.py b/stumpy/aamp.py index 87568f365..428c3d4bd 100644 --- a/stumpy/aamp.py +++ b/stumpy/aamp.py @@ -268,6 +268,10 @@ def aamp(T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage when k > 1. 
+ Returns ------- out : numpy.ndarray diff --git a/stumpy/aamped.py b/stumpy/aamped.py index d833ee8b3..ad147b42f 100644 --- a/stumpy/aamped.py +++ b/stumpy/aamped.py @@ -47,6 +47,10 @@ def aamped(dask_client, T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage when k > 1. + Returns ------- out : numpy.ndarray diff --git a/stumpy/stump.py b/stumpy/stump.py index 9f37edc8b..bcf0d4103 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -143,8 +143,8 @@ def _compute_diagonal( `False`. Default is `True`. k : int - The number of smallest elements in distance profile that should be stored - for constructing the top-k matrix profile. + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage when k > 1. Returns ------- @@ -326,28 +326,28 @@ def _stump( `False`. Default is `True`. k : int - The number of smallest elements in distance profile that should be stored - for constructing the top-k matrix profile. + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage when k > 1. Returns ------- profile : numpy.ndarray - Top-k matrix profile + The (top-k) matrix profile indices : numpy.ndarray - Top-k matrix profile indices + The (top-k) matrix profile indices left profile : numpy.ndarray - Top-1 left matrix profile + The (top-1) left matrix profile left indices : numpy.ndarray - Top-1 left matrix profile indices + The (top-1) left matrix profile indices right profile : numpy.ndarray - Top-1 right matrix profile + The (top-1) right matrix profile right indices : numpy.ndarray - Top-1 right matrix profile indices + The (top-1) right matrix profile indices Notes ----- @@ -484,12 +484,11 @@ def _stump( ρR[0, i] = ρR[thread_idx, i] IR[0, i] = IR[thread_idx, i] - # Convert top-k pearson correlations to distances. The arrays ρ (and so I) should - # be reversed since ρ is in ascending order. + # Reverse top-k rho (and its associated I) to be in descending order and + # then convert from Pearson correlations to Euclidean distances (ascending order) p_norm = np.abs(2 * m * (1 - ρ[0, :, ::-1])) I = I[0, :, ::-1] - # Convert top-1 left/right pearson correlations to distances. p_norm_L = np.abs(2 * m * (1 - ρL[0, :])) p_norm_R = np.abs(2 * m * (1 - ρR[0, :])) @@ -547,16 +546,21 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): ignored when `normalize == True`. k : int, default 1 - The number of smallest elements in distance profile that should be stored - for constructing the top-k matrix profile. + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage when k > 1. Returns ------- out : numpy.ndarray - The first k columns consist of the top-k matrix profile, the next k columns - consist of their corresponding matrix profile indices, the column at - numpy indexing 2k contains the top-1 left matrix profile indices and the last - column, at numpy indexing 2k+1, contains the top-1 right matrix profile indices. 
+ When k = 1 (default), the first column consists of the matrix profile, + the second column consists of the matrix profile indices, the third column + consists of the left matrix profile indices, and the fourth column consists of + the right matrix profile indices. However, when k > 1, the output array will + contain exactly 2 * k + 2 columns. The first k columns (i.e., out[:, :k]) consists + of the top-k matrix profile, the next set of k columns (i.e., out[:, k:2k]) consists + of the corresponding top-k matrix profile indices, and the last two columns + (i.e., out[:, 2k] and out[:, 2k+1] or, equivalently, out[:, -2] and out[:, -1]) correspond to + the top-1 left matrix profile indices and the top-1 right matrix profile indices, respectively. See Also -------- diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 34d665fc7..99a1ba0b1 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -56,17 +56,22 @@ def stumped( The p-norm to apply for computing the Minkowski distance. This parameter is ignored when `normalize == True`. - k : int - The number of smallest elements in distance profile that should be stored - for constructing the top-k matrix profile. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage when k > 1. Returns ------- out : numpy.ndarray - The first k columns consist of the top-k matrix profile, the next k columns - consist of their corresponding matrix profile indices, the column at - numpy indexing 2k contains the top-1 left matrix profile indices and the last - column, at numpy indexing 2k+1, contains the top-1 right matrix profile indices. + When k = 1 (default), the first column consists of the matrix profile, + the second column consists of the matrix profile indices, the third column + consists of the left matrix profile indices, and the fourth column consists of + the right matrix profile indices. However, when k > 1, the output array will + contain exactly 2 * k + 2 columns. The first k columns (i.e., out[:, :k]) consists + of the top-k matrix profile, the next set of k columns (i.e., out[:, k:2k]) consists + of the corresponding top-k matrix profile indices, and the last two columns + (i.e., out[:, 2k] and out[:, 2k+1] or, equivalently, out[:, -2] and out[:, -1]) correspond to + the top-1 left matrix profile indices and the top-1 right matrix profile indices, respectively. 
See Also -------- From 5ab2978f9c09589e7cbc6279d7c5fb27c07d9723 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 20 May 2022 22:17:51 -0600 Subject: [PATCH 090/416] Fixed order of outputs returned in _stump --- stumpy/scrump.py | 2 +- stumpy/stumped.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 9fcb51e4b..c547ab02b 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -609,7 +609,7 @@ def update(self): if self._chunk_idx < self._n_chunks: start_idx, stop_idx = self._chunk_diags_ranges[self._chunk_idx] - P, I, PL, IL, PR, IR = _stump( + P, PL, PR, I, IL, IR = _stump( self._T_A, self._T_B, self._m, diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 99a1ba0b1..1c8b2cd80 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -268,15 +268,15 @@ def stumped( results = dask_client.gather(futures) ( profile[:, :k], - indices[:, :k], profile_L, - indices_L, profile_R, + indices[:, :k], + indices_L, indices_R, ) = results[0] for i in range(1, len(hosts)): - P, I, PL, IL, PR, IR = results[i] + P, PL, PR, I, IL, IR = results[i] profile[:, k:] = P indices[:, k:] = I From 6460a5bc57a4a6ecc3beff8a45f6262cfd47807b Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 21 May 2022 11:23:00 -0600 Subject: [PATCH 091/416] Add new function to update TopK MatrixProfile --- stumpy/stumped.py | 47 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 1c8b2cd80..e9784e28f 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -5,6 +5,7 @@ import logging import numpy as np +from numba import njit, prange from . import core, config from .stump import _stump @@ -13,6 +14,35 @@ logger = logging.getLogger(__name__) +@njit(parallel=True) +def _merge_topk_profiles_indices(PA, PB, IA, IB): + """ + Merge two top-k matrix profiles while prioritizing values of PA in ties + and update PA (and so IA) + + PA : numpy.ndarray + a (top-k) matrix profile + + PB : numpy.ndarray + a (top-k) matrix profile + + IA : numpy.ndarray + a (top-k) matrix profile indices, corresponding to PA + + IB : numpy.ndarray + a (top-k) matrix profile indices, corresponding to PB + """ + for i in prange(PA.shape[0]): + for j in range(PA.shape[1]): + if PB[i, j] < PA[i, -1]: + idx = np.searchsorted(PA[i], PB[i, j], side="right") + + PA[i, idx + 1 :] = PA[i, idx:-1] + PA[i, idx] = PB[i, j] + IA[i, idx + 1 :] = IA[i, idx:-1] + IA[i, idx] = IB[i, j] + + @core.non_normalized(aamped) def stumped( dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1 @@ -266,23 +296,12 @@ def stumped( indices = np.empty((l, 2 * k)) results = dask_client.gather(futures) - ( - profile[:, :k], - profile_L, - profile_R, - indices[:, :k], - indices_L, - indices_R, - ) = results[0] + profile, profile_L, profile_R, indices, indices_L, indices_R = results[0] for i in range(1, len(hosts)): P, PL, PR, I, IL, IR = results[i] - - profile[:, k:] = P - indices[:, k:] = I - idx = np.argsort(profile, axis=1) - profile = np.take_along_axis(profile, idx, axis=1) - indices = np.take_along_axis(indices, idx, axis=1) + # Update top-k matrix profile and matrix profile indices + _merge_topk_profiles_indices(profile, P, indices, I) # Update top-1 left matrix profile and matrix profile index cond = PL < profile_L From d94db722bb2b3150a38008a323ccd117f4bfc1c2 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 21 May 2022 11:34:38 -0600 Subject: [PATCH 092/416] Add .copy() to update array 
properly --- stumpy/stumped.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index e9784e28f..01606f5bf 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -37,9 +37,9 @@ def _merge_topk_profiles_indices(PA, PB, IA, IB): if PB[i, j] < PA[i, -1]: idx = np.searchsorted(PA[i], PB[i, j], side="right") - PA[i, idx + 1 :] = PA[i, idx:-1] + PA[i, idx + 1 :] = PA[i, idx:-1].copy() PA[i, idx] = PB[i, j] - IA[i, idx + 1 :] = IA[i, idx:-1] + IA[i, idx + 1 :] = IA[i, idx:-1].copy() IA[i, idx] = IB[i, j] From 72c3887b014f6be2fe89030013179bfd182bc1c1 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 21 May 2022 12:31:08 -0600 Subject: [PATCH 093/416] Add new test function for TopK MatrixProfile with gpu_stump --- tests/test_gpu_stump.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index 508b02a56..1a2662647 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -350,3 +350,20 @@ def test_gpu_stump_nan_zero_mean_self_join(): naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) + + +@pytest.mark.filterwarnings("ignore", category=NumbaPerformanceWarning) +@pytest.mark.parametrize("T_A, T_B", test_data) +def test_gpu_stump_self_join_KNN(T_A, T_B): + k = 3 + m = 3 + zone = int(np.ceil(m / 4)) + ref_mp = naive.stump(T_B, m, exclusion_zone=zone, row_wise=True, k=k) + comp_mp = gpu_stump(T_B, m, ignore_trivial=True, k=k) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) + + comp_mp = gpu_stump(pd.Series(T_B), m, ignore_trivial=True, k=k) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) From 0068358c8c771d950090d62779a9fd30336f2bfc Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 00:15:00 -0600 Subject: [PATCH 094/416] Enhance gpu_stump to return TopK MatrixProfile --- stumpy/gpu_stump.py | 247 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 198 insertions(+), 49 deletions(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 667dd8b56..606bf7faf 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -7,7 +7,7 @@ import os import numpy as np -from numba import cuda +from numba import cuda, njit, prange from . 
import core, config from .gpu_aamp import gpu_aamp @@ -15,9 +15,38 @@ logger = logging.getLogger(__name__) +@njit(parallel=True) +def _merge_topk_profiles_indices(PA, PB, IA, IB): + """ + Merge two top-k matrix profiles while prioritizing values of PA in ties + and update PA (and so IA) + + PA : numpy.ndarray + a (top-k) matrix profile + + PB : numpy.ndarray + a (top-k) matrix profile + + IA : numpy.ndarray + a (top-k) matrix profile indices, corresponding to PA + + IB : numpy.ndarray + a (top-k) matrix profile indices, corresponding to PB + """ + for i in range(PA.shape[0]): + for j in range(PA.shape[1]): + if PB[i, j] < PA[i, -1]: + idx = np.searchsorted(PA[i], PB[i, j], side="right") + + PA[i, idx + 1 :] = PA[i, idx:-1].copy() + PA[i, idx] = PB[i, j] + IA[i, idx + 1 :] = IA[i, idx:-1].copy() + IA[i, idx] = IB[i, j] + + @cuda.jit( "(i8, f8[:], f8[:], i8, f8[:], f8[:], f8[:], f8[:], f8[:]," - "f8[:], f8[:], i8, b1, i8, f8[:, :], i8[:, :], b1)" + "f8[:], f8[:], i8, b1, i8, f8[:, :], f8[:], f8[:], i8[:, :], i8[:], i8[:], b1, i2)" ) def _compute_and_update_PI_kernel( i, @@ -31,12 +60,17 @@ def _compute_and_update_PI_kernel( Σ_T, μ_Q, σ_Q, - k, + profile_len, ignore_trivial, excl_zone, profile, + profile_L, + profile_R, indices, + indices_L, + indices_R, compute_QT, + k, ): """ A Numba CUDA kernel to update the matrix profile and matrix profile indices @@ -79,7 +113,7 @@ def _compute_and_update_PI_kernel( σ_Q : numpy.ndarray Standard deviation of the query sequence, `Q` - k : int + profile_len : int The total number of sliding windows to iterate over ignore_trivial : bool @@ -91,18 +125,30 @@ def _compute_and_update_PI_kernel( sliding window profile : numpy.ndarray - Matrix profile. The first column consists of the global matrix profile, - the second column consists of the left matrix profile, and the third - column consists of the right matrix profile. + The (top-k) matrix profile, sorted in ascending order per row + + profile_L : numpy.ndarray + The (top-1) left matrix profile + + profile_R : numpy.ndarray + The (top-1) right matrix profile indices : numpy.ndarray - The first column consists of the matrix profile indices, the second - column consists of the left matrix profile indices, and the third - column consists of the right matrix profile indices. + The (top-k) matrix profile indices + + indices_L : numpy.ndarray + The (top-1) left matrix profile indices + + indices_R : numpy.ndarray + The (top-1) right matrix profile indices compute_QT : bool A boolean flag for whether or not to compute QT + k : int + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage when k > 1. 
+ Returns ------- None @@ -126,7 +172,7 @@ def _compute_and_update_PI_kernel( for j in range(start, QT_out.shape[0], stride): zone_start = max(0, j - excl_zone) - zone_stop = min(k, j + excl_zone) + zone_stop = min(profile_len, j + excl_zone) if compute_QT: QT_out[j] = ( @@ -157,16 +203,22 @@ def _compute_and_update_PI_kernel( if ignore_trivial: if i <= zone_stop and i >= zone_start: p_norm = np.inf - if p_norm < profile[j, 1] and i < j: - profile[j, 1] = p_norm - indices[j, 1] = i - if p_norm < profile[j, 2] and i > j: - profile[j, 2] = p_norm - indices[j, 2] = i - - if p_norm < profile[j, 0]: - profile[j, 0] = p_norm - indices[j, 0] = i + if p_norm < profile_L[j] and i < j: + profile_L[j] = p_norm + indices_L[j] = i + if p_norm < profile_R[j] and i > j: + profile_R[j] = p_norm + indices_R[j] = i + + for idx in range(k, -1, -1): + if (p_norm < profile[j, idx - 1]) and (idx > 0): + profile[j, idx - 1] = profile[j, idx - 2] + indices[j, idx - 1] = indices[j, idx - 2] + else: + break + if idx < k: + profile[j, idx] = p_norm + indices[j, idx] = i def _gpu_stump( @@ -181,10 +233,11 @@ def _gpu_stump( QT_first_fname, μ_Q_fname, σ_Q_fname, - k, + profile_len, ignore_trivial=True, range_start=1, device_id=0, + k=1, ): """ A Numba CUDA version of STOMP for parallel computation of the @@ -235,7 +288,7 @@ def _gpu_stump( The file name for the standard deviation of the query sequence, `Q`, relative to the current sliding window - k : int + profile_len : int The total number of sliding windows to iterate over ignore_trivial : bool @@ -249,6 +302,10 @@ def _gpu_stump( device_id : int The (GPU) device number to use. The default value is `0`. + k : int + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage when k > 1. 
+ Returns ------- profile_fname : str @@ -316,11 +373,22 @@ def _gpu_stump( device_M_T = cuda.to_device(M_T) device_Σ_T = cuda.to_device(Σ_T) - profile = np.full((k, 3), np.inf, dtype=np.float64) - indices = np.full((k, 3), -1, dtype=np.int64) + profile = np.full((profile_len, k), np.inf, dtype=np.float64) + indices = np.full((profile_len, k), -1, dtype=np.int64) + + profile_L = np.full(profile_len, np.inf, dtype=np.float64) + indices_L = np.full(profile_len, -1, dtype=np.int64) + + profile_R = np.full(profile_len, np.inf, dtype=np.float64) + indices_R = np.full(profile_len, -1, dtype=np.int64) device_profile = cuda.to_device(profile) + device_profile_L = cuda.to_device(profile_L) + device_profile_R = cuda.to_device(profile_R) device_indices = cuda.to_device(indices) + device_indices_L = cuda.to_device(indices_L) + device_indices_R = cuda.to_device(indices_R) + _compute_and_update_PI_kernel[blocks_per_grid, threads_per_block]( range_start - 1, device_T_A, @@ -333,12 +401,17 @@ def _gpu_stump( device_Σ_T, device_μ_Q, device_σ_Q, - k, + profile_len, ignore_trivial, excl_zone, device_profile, + device_profile_L, + device_profile_R, device_indices, + device_indices_L, + device_indices_R, False, + k, ) for i in range(range_start, range_stop): @@ -354,27 +427,50 @@ def _gpu_stump( device_Σ_T, device_μ_Q, device_σ_Q, - k, + profile_len, ignore_trivial, excl_zone, device_profile, + device_profile_L, + device_profile_R, device_indices, + device_indices_L, + device_indices_R, True, + k, ) profile = device_profile.copy_to_host() + profile_L = device_profile_L.copy_to_host() + profile_R = device_profile_R.copy_to_host() indices = device_indices.copy_to_host() + indices_L = device_indices_L.copy_to_host() + indices_R = device_indices_R.copy_to_host() + profile = np.sqrt(profile) + profile_L = np.sqrt(profile_L) + profile_R = np.sqrt(profile_R) profile_fname = core.array_to_temp_file(profile) + profile_L_fname = core.array_to_temp_file(profile_L) + profile_R_fname = core.array_to_temp_file(profile_R) indices_fname = core.array_to_temp_file(indices) + indices_L_fname = core.array_to_temp_file(indices_L) + indices_R_fname = core.array_to_temp_file(indices_R) - return profile_fname, indices_fname + return ( + profile_fname, + profile_L_fname, + profile_R_fname, + indices_fname, + indices_L_fname, + indices_R_fname, + ) @core.non_normalized(gpu_aamp) def gpu_stump( - T_A, m, T_B=None, ignore_trivial=True, device_id=0, normalize=True, p=2.0 + T_A, m, T_B=None, ignore_trivial=True, device_id=0, normalize=True, p=2.0, k=1 ): """ Compute the z-normalized matrix profile with one or more GPU devices @@ -417,13 +513,22 @@ def gpu_stump( The p-norm to apply for computing the Minkowski distance. This parameter is ignored when `normalize == True`. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage when k > 1. + Returns ------- out : numpy.ndarray - The first column consists of the matrix profile, the second column - consists of the matrix profile indices, the third column consists of - the left matrix profile indices, and the fourth column consists of - the right matrix profile indices. + When k = 1 (default), the first column consists of the matrix profile, + the second column consists of the matrix profile indices, the third column + consists of the left matrix profile indices, and the fourth column consists of + the right matrix profile indices. 
However, when k > 1, the output array will + contain exactly 2 * k + 2 columns. The first k columns (i.e., out[:, :k]) consists + of the top-k matrix profile, the next set of k columns (i.e., out[:, k:2k]) consists + of the corresponding top-k matrix profile indices, and the last two columns + (i.e., out[:, 2k] and out[:, 2k+1] or, equivalently, out[:, -2] and out[:, -1]) correspond to + the top-1 left matrix profile indices and the top-1 right matrix profile indices, respectively. See Also -------- @@ -505,7 +610,7 @@ def gpu_stump( logger.warning("Try setting `ignore_trivial = False`.") n = T_B.shape[0] - k = T_A.shape[0] - m + 1 + profile_len = T_A.shape[0] - m + 1 l = n - m + 1 excl_zone = int( np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM) @@ -518,8 +623,6 @@ def gpu_stump( μ_Q_fname = core.array_to_temp_file(μ_Q) σ_Q_fname = core.array_to_temp_file(σ_Q) - out = np.empty((k, 4), dtype=object) - if isinstance(device_id, int): device_ids = [device_id] else: @@ -528,6 +631,12 @@ def gpu_stump( profile = [None] * len(device_ids) indices = [None] * len(device_ids) + profile_L = [None] * len(device_ids) + indices_L = [None] * len(device_ids) + + profile_R = [None] * len(device_ids) + indices_R = [None] * len(device_ids) + for _id in device_ids: with cuda.gpus[_id]: if ( @@ -571,16 +680,24 @@ def gpu_stump( QT_first_fname, μ_Q_fname, σ_Q_fname, - k, + profile_len, ignore_trivial, start + 1, device_ids[idx], + k, ), ) else: # Execute last chunk in parent process # Only parent process is executed when a single GPU is requested - profile[idx], indices[idx] = _gpu_stump( + ( + profile[idx], + profile_L[idx], + profile_R[idx], + indices[idx], + indices_L[idx], + indices_R[idx], + ) = _gpu_stump( T_A_fname, T_B_fname, m, @@ -592,10 +709,11 @@ def gpu_stump( QT_first_fname, μ_Q_fname, σ_Q_fname, - k, + profile_len, ignore_trivial, start + 1, device_ids[idx], + k, ) # Clean up process pool for multi-GPU request @@ -606,7 +724,14 @@ def gpu_stump( # Collect results from spawned child processes if they exist for idx, result in enumerate(results): if result is not None: - profile[idx], indices[idx] = result.get() + ( + profile[idx], + profile_L[idx], + profile_R[idx], + indices[idx], + indices_L[idx], + indices_R[idx], + ) = result.get() os.remove(T_A_fname) os.remove(T_B_fname) @@ -621,22 +746,46 @@ def gpu_stump( for idx in range(len(device_ids)): profile_fname = profile[idx] + profile_L_fname = profile_L[idx] + profile_R_fname = profile_R[idx] indices_fname = indices[idx] + indices_L_fname = indices_L[idx] + indices_R_fname = indices_R[idx] + profile[idx] = np.load(profile_fname, allow_pickle=False) + profile_L[idx] = np.load(profile_L_fname, allow_pickle=False) + profile_R[idx] = np.load(profile_R_fname, allow_pickle=False) indices[idx] = np.load(indices_fname, allow_pickle=False) + indices_L[idx] = np.load(indices_L_fname, allow_pickle=False) + indices_R[idx] = np.load(indices_R_fname, allow_pickle=False) + os.remove(profile_fname) + os.remove(profile_L_fname) + os.remove(profile_R_fname) os.remove(indices_fname) + os.remove(indices_L_fname) + os.remove(indices_R_fname) for i in range(1, len(device_ids)): - # Update all matrix profiles and matrix profile indices - # (global, left, right) and store in profile[0] and indices[0] - for col in range(profile[0].shape[1]): # pragma: no cover - cond = profile[0][:, col] < profile[i][:, col] - profile[0][:, col] = np.where(cond, profile[0][:, col], profile[i][:, col]) - indices[0][:, col] = np.where(cond, indices[0][:, col], indices[i][:, col]) - - out[:, 0] = 
profile[0][:, 0] - out[:, 1:4] = indices[0][:, :] + # Update (top-k) matrix profile and matrix profile indices + _merge_topk_profiles_indices(profile[0], profile[i], indices[0], indices[i]) + + # Update (top-1) left matrix profile and matrix profil indices + cond = profile_L[0] < profile_L[i] + profile_L[0] = np.where(cond, profile_L[0], profile_L[i]) + indices_L[0] = np.where(cond, indices_L[0], indices_L[i]) + + # Update (top-1) right matrix profile and matrix profil indices + cond = profile_R[0] < profile_R[i] + profile_R[0] = np.where(cond, profile_R[0], profile_R[i]) + indices_R[0] = np.where(cond, indices_R[0], indices_R[i]) + + out = np.empty( + (profile_len, 2 * k + 2), dtype=object + ) # last two columns are to store + # (top-1) left/right matrix profile indices + out[:, :k] = profile[0] + out[:, k:] = np.column_stack((indices[0], indices_L[0], indices_R[0])) threshold = 10e-6 if core.are_distances_too_small(out[:, 0], threshold=threshold): # pragma: no cover From 1e7c05e0dce914ca2fc8fbc39cb5411b4fd5fb03 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 14:16:40 -0600 Subject: [PATCH 095/416] Refactored function for merging two TopK MatrixProfile --- stumpy/core.py | 37 ++++++++++++++++++++++++++++++++++++- stumpy/gpu_stump.py | 41 +++++++---------------------------------- stumpy/stumped.py | 33 +-------------------------------- 3 files changed, 44 insertions(+), 67 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index a2a30c043..64dee293c 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -7,7 +7,7 @@ import inspect import numpy as np -from numba import njit +from numba import njit, prange from scipy.signal import convolve from scipy.ndimage import maximum_filter1d, minimum_filter1d from scipy import linalg @@ -2494,3 +2494,38 @@ def _select_P_ABBA_value(P_ABBA, k, custom_func=None): MPdist = partition[k] return MPdist + + +@njit(parallel=True) +def _merge_topk_profiles_indices(PA, PB, IA, IB): + """ + Merge two top-k matrix profiles PA and PB, and update PA (in place) while + prioritizing values of PA in ties. Also, update IA accordingly. + + Parameters + ---------- + PA : numpy.ndarray + a (top-k) matrix profile + + PB : numpy.ndarray + a (top-k) matrix profile + + IA : numpy.ndarray + a (top-k) matrix profile indices, corresponding to PA + + IB : numpy.ndarray + a (top-k) matrix profile indices, corresponding to PB + + Returns + ------- + None + """ + for i in prange(PA.shape[0]): + for j in range(PA.shape[1]): + if PB[i, j] < PA[i, -1]: + idx = np.searchsorted(PA[i], PB[i, j], side="right") + + PA[i, idx + 1 :] = PA[i, idx:-1].copy() + PA[i, idx] = PB[i, j] + IA[i, idx + 1 :] = IA[i, idx:-1].copy() + IA[i, idx] = IB[i, j] diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 606bf7faf..2df5b14b1 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -7,7 +7,7 @@ import os import numpy as np -from numba import cuda, njit, prange +from numba import cuda from . 
import core, config from .gpu_aamp import gpu_aamp @@ -15,35 +15,6 @@ logger = logging.getLogger(__name__) -@njit(parallel=True) -def _merge_topk_profiles_indices(PA, PB, IA, IB): - """ - Merge two top-k matrix profiles while prioritizing values of PA in ties - and update PA (and so IA) - - PA : numpy.ndarray - a (top-k) matrix profile - - PB : numpy.ndarray - a (top-k) matrix profile - - IA : numpy.ndarray - a (top-k) matrix profile indices, corresponding to PA - - IB : numpy.ndarray - a (top-k) matrix profile indices, corresponding to PB - """ - for i in range(PA.shape[0]): - for j in range(PA.shape[1]): - if PB[i, j] < PA[i, -1]: - idx = np.searchsorted(PA[i], PB[i, j], side="right") - - PA[i, idx + 1 :] = PA[i, idx:-1].copy() - PA[i, idx] = PB[i, j] - IA[i, idx + 1 :] = IA[i, idx:-1].copy() - IA[i, idx] = IB[i, j] - - @cuda.jit( "(i8, f8[:], f8[:], i8, f8[:], f8[:], f8[:], f8[:], f8[:]," "f8[:], f8[:], i8, b1, i8, f8[:, :], f8[:], f8[:], i8[:, :], i8[:], i8[:], b1, i2)" @@ -209,7 +180,7 @@ def _compute_and_update_PI_kernel( if p_norm < profile_R[j] and i > j: profile_R[j] = p_norm indices_R[j] = i - + for idx in range(k, -1, -1): if (p_norm < profile[j, idx - 1]) and (idx > 0): profile[j, idx - 1] = profile[j, idx - 2] @@ -766,9 +737,11 @@ def gpu_stump( os.remove(indices_L_fname) os.remove(indices_R_fname) + profile_0 = profile[0].copy() + indices_0 = indices[0].copy() for i in range(1, len(device_ids)): # Update (top-k) matrix profile and matrix profile indices - _merge_topk_profiles_indices(profile[0], profile[i], indices[0], indices[i]) + core._merge_topk_profiles_indices(profile_0, profile[i], indices_0, indices[i]) # Update (top-1) left matrix profile and matrix profil indices cond = profile_L[0] < profile_L[i] @@ -784,8 +757,8 @@ def gpu_stump( (profile_len, 2 * k + 2), dtype=object ) # last two columns are to store # (top-1) left/right matrix profile indices - out[:, :k] = profile[0] - out[:, k:] = np.column_stack((indices[0], indices_L[0], indices_R[0])) + out[:, :k] = profile_0 + out[:, k:] = np.column_stack((indices_0, indices_L[0], indices_R[0])) threshold = 10e-6 if core.are_distances_too_small(out[:, 0], threshold=threshold): # pragma: no cover diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 01606f5bf..0667713d3 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -5,7 +5,6 @@ import logging import numpy as np -from numba import njit, prange from . 
import core, config from .stump import _stump @@ -13,36 +12,6 @@ logger = logging.getLogger(__name__) - -@njit(parallel=True) -def _merge_topk_profiles_indices(PA, PB, IA, IB): - """ - Merge two top-k matrix profiles while prioritizing values of PA in ties - and update PA (and so IA) - - PA : numpy.ndarray - a (top-k) matrix profile - - PB : numpy.ndarray - a (top-k) matrix profile - - IA : numpy.ndarray - a (top-k) matrix profile indices, corresponding to PA - - IB : numpy.ndarray - a (top-k) matrix profile indices, corresponding to PB - """ - for i in prange(PA.shape[0]): - for j in range(PA.shape[1]): - if PB[i, j] < PA[i, -1]: - idx = np.searchsorted(PA[i], PB[i, j], side="right") - - PA[i, idx + 1 :] = PA[i, idx:-1].copy() - PA[i, idx] = PB[i, j] - IA[i, idx + 1 :] = IA[i, idx:-1].copy() - IA[i, idx] = IB[i, j] - - @core.non_normalized(aamped) def stumped( dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1 @@ -301,7 +270,7 @@ def stumped( for i in range(1, len(hosts)): P, PL, PR, I, IL, IR = results[i] # Update top-k matrix profile and matrix profile indices - _merge_topk_profiles_indices(profile, P, indices, I) + core._merge_topk_profiles_indices(profile, P, indices, I) # Update top-1 left matrix profile and matrix profile index cond = PL < profile_L From 2ebc276498eab50fb08c3f1f2ecf30db337eb80e Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 14:21:33 -0600 Subject: [PATCH 096/416] Clean up code --- stumpy/gpu_stump.py | 10 +++++----- stumpy/stumped.py | 5 +++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 2df5b14b1..803b020f0 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -737,11 +737,11 @@ def gpu_stump( os.remove(indices_L_fname) os.remove(indices_R_fname) - profile_0 = profile[0].copy() - indices_0 = indices[0].copy() for i in range(1, len(device_ids)): # Update (top-k) matrix profile and matrix profile indices - core._merge_topk_profiles_indices(profile_0, profile[i], indices_0, indices[i]) + core._merge_topk_profiles_indices( + profile[0], profile[i], indices[0], indices[i] + ) # Update (top-1) left matrix profile and matrix profil indices cond = profile_L[0] < profile_L[i] @@ -757,8 +757,8 @@ def gpu_stump( (profile_len, 2 * k + 2), dtype=object ) # last two columns are to store # (top-1) left/right matrix profile indices - out[:, :k] = profile_0 - out[:, k:] = np.column_stack((indices_0, indices_L[0], indices_R[0])) + out[:, :k] = profile[0] + out[:, k:] = np.column_stack((indices[0], indices_L[0], indices_R[0])) threshold = 10e-6 if core.are_distances_too_small(out[:, 0], threshold=threshold): # pragma: no cover diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 0667713d3..17e0d556c 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -12,6 +12,7 @@ logger = logging.getLogger(__name__) + @core.non_normalized(aamped) def stumped( dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1 @@ -283,8 +284,8 @@ def stumped( indices_R = np.where(cond, IR, indices_R) out = np.empty((l, 2 * k + 2), dtype=object) - out[:, :k] = profile[:, :k] - out[:, k:] = np.column_stack((indices[:, :k], indices_L, indices_R)) + out[:, :k] = profile + out[:, k:] = np.column_stack((indices, indices_L, indices_R)) # Delete data from Dask cluster dask_client.cancel(T_A_future) From 1170f2ebd770ed4f70aa3048dd4e6778bb723c53 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 15:44:18 -0600 Subject: [PATCH 097/416] Add naive version of 
merge_topk_matrix_profile function --- tests/test_core.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_core.py b/tests/test_core.py index 6ef78d230..c26dd449d 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -82,6 +82,15 @@ def naive_bsf_indices(n): return np.array(out) +def naive_merge_topk_profiles_indices(PA, PB, IA, IB): + profile = np.column_stack((PA, PB)) + indices = np.column_stack((IA, IB)) + + idx = np.argsort(profile, axis=1) + PA[:, :] = np.take_along_axis(profile, idx, axis=1)[:, : PA.shape[1]] + IA[:, :] = np.take_along_axis(indices, idx, axis=1)[:, : PA.shape[1]] + + test_data = [ (np.array([-1, 1, 2], dtype=np.float64), np.array(range(5), dtype=np.float64)), ( From 2a827b450df582b95d68a0578ca82ced758fe7f1 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 15:48:54 -0600 Subject: [PATCH 098/416] Rename function --- stumpy/core.py | 2 +- stumpy/gpu_stump.py | 2 +- stumpy/stumped.py | 2 +- tests/test_core.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 64dee293c..89b6266fc 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2497,7 +2497,7 @@ def _select_P_ABBA_value(P_ABBA, k, custom_func=None): @njit(parallel=True) -def _merge_topk_profiles_indices(PA, PB, IA, IB): +def _merge_topk_PI(PA, PB, IA, IB): """ Merge two top-k matrix profiles PA and PB, and update PA (in place) while prioritizing values of PA in ties. Also, update IA accordingly. diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 803b020f0..cc4537813 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -739,7 +739,7 @@ def gpu_stump( for i in range(1, len(device_ids)): # Update (top-k) matrix profile and matrix profile indices - core._merge_topk_profiles_indices( + core._merge_topk_PI( profile[0], profile[i], indices[0], indices[i] ) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 17e0d556c..0f6459db5 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -271,7 +271,7 @@ def stumped( for i in range(1, len(hosts)): P, PL, PR, I, IL, IR = results[i] # Update top-k matrix profile and matrix profile indices - core._merge_topk_profiles_indices(profile, P, indices, I) + core._merge_topk_PI(profile, P, indices, I) # Update top-1 left matrix profile and matrix profile index cond = PL < profile_L diff --git a/tests/test_core.py b/tests/test_core.py index c26dd449d..95dc268d3 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -82,7 +82,7 @@ def naive_bsf_indices(n): return np.array(out) -def naive_merge_topk_profiles_indices(PA, PB, IA, IB): +def naive_merge_topk_PI(PA, PB, IA, IB): profile = np.column_stack((PA, PB)) indices = np.column_stack((IA, IB)) From cc62c74f11f229dcc7bd98aabba2759cda91260f Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 16:13:24 -0600 Subject: [PATCH 099/416] Revise naive function to make it more readable --- tests/test_core.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 95dc268d3..4585de1af 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -87,9 +87,11 @@ def naive_merge_topk_PI(PA, PB, IA, IB): indices = np.column_stack((IA, IB)) idx = np.argsort(profile, axis=1) - PA[:, :] = np.take_along_axis(profile, idx, axis=1)[:, : PA.shape[1]] - IA[:, :] = np.take_along_axis(indices, idx, axis=1)[:, : PA.shape[1]] + profile = np.take_along_axis(profile, idx, axis=1) + indices = np.take_along_axis(indices, idx, axis=1) + PA[:, :] = profile[:, : PA.shape[1]] + 
IA[:, :] = indices[:, : PA.shape[1]] test_data = [ (np.array([-1, 1, 2], dtype=np.float64), np.array(range(5), dtype=np.float64)), From b6b74c4edaa2cb2bf5c3a45987b631cf1a76ab9e Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 17:32:44 -0600 Subject: [PATCH 100/416] Add test function for merge_topk_PI --- tests/test_core.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/tests/test_core.py b/tests/test_core.py index 4585de1af..8e29c2f1a 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1039,3 +1039,51 @@ def test_select_P_ABBA_val_inf(): p_abba.sort() ref = p_abba[k - 1] npt.assert_almost_equal(ref, comp) + + +def test_merge_topk_PI(): + PA = np.array([ + [0.0, 0.0, 0.0, 0.0, 0.0], + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.1, 0.1, 0.2, 0.3, 0.4], + [0.1, 0.2, np.inf, np.inf, np.inf], + [np.inf, np.inf, np.inf, np.inf, np.inf] + ]) + + PB = np.array([ + [0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.15, 0.25, 0.35, 0.45], + [0.15, 0.25, 0.35, 0.45, 0.55], + [0.01, 0.02, 0.03, 0.04, 0.05], + [0.6, 0.7, 0.8, 0.9, 1], + [0.1, 0.1, 0.2, 0.3, 0.4], + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.0, 0.3, np.inf, np.inf, np.inf], + [np.inf, np.inf, np.inf, np.inf, np.inf], + ]) + + n, k = PA.shape + + IA = np.arange(n * k).reshape(n, k) + IB = IA.copy() + n * k + IA[7, 2:] = -1 + IA[8, :] = -1 + IB[7, 2:] = -1 + IB[8, :] = -1 + + ref_P = PA.copy() + ref_I = IA.copy() + + comp_P = PA.copy() + comp_I = IA.copy() + + naive_merge_topk_PI(ref_P, PB, ref_I, IB) + core._merge_topk_PI(comp_P, PB, comp_I, IB) + + ref = np.column_stack((ref_P, ref_I)) + comp = np.column_stack((comp_P, comp_I)) + npt.assert_array_equal(ref, comp) From b6d6450850453bfde5c932d129e79e063435b9f8 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 17:37:51 -0600 Subject: [PATCH 101/416] Moved naive function to naive.py --- tests/naive.py | 12 ++++++++++++ tests/test_core.py | 13 +------------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 4a5ed789a..3074c2359 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1760,3 +1760,15 @@ def _total_diagonal_ndists(tile_lower_diag, tile_upper_diag, tile_height, tile_w ) return total_ndists + + +def merge_topk_PI(PA, PB, IA, IB): + profile = np.column_stack((PA, PB)) + indices = np.column_stack((IA, IB)) + + idx = np.argsort(profile, axis=1) + profile = np.take_along_axis(profile, idx, axis=1) + indices = np.take_along_axis(indices, idx, axis=1) + + PA[:, :] = profile[:, : PA.shape[1]] + IA[:, :] = indices[:, : PA.shape[1]] diff --git a/tests/test_core.py b/tests/test_core.py index 8e29c2f1a..e45f8c600 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -82,17 +82,6 @@ def naive_bsf_indices(n): return np.array(out) -def naive_merge_topk_PI(PA, PB, IA, IB): - profile = np.column_stack((PA, PB)) - indices = np.column_stack((IA, IB)) - - idx = np.argsort(profile, axis=1) - profile = np.take_along_axis(profile, idx, axis=1) - indices = np.take_along_axis(indices, idx, axis=1) - - PA[:, :] = profile[:, : PA.shape[1]] - IA[:, :] = indices[:, : PA.shape[1]] - test_data = [ (np.array([-1, 1, 2], dtype=np.float64), np.array(range(5), dtype=np.float64)), ( @@ -1081,7 +1070,7 @@ def test_merge_topk_PI(): comp_P = PA.copy() comp_I = IA.copy() - naive_merge_topk_PI(ref_P, PB, ref_I, IB) + naive.merge_topk_PI(ref_P, PB, ref_I, IB) core._merge_topk_PI(comp_P, PB, comp_I, IB) 
ref = np.column_stack((ref_P, ref_I)) From 97a04f457ca7c7542b768e504652bf2a9b0d7abf Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 17:46:32 -0600 Subject: [PATCH 102/416] Correct Format --- stumpy/gpu_stump.py | 4 +--- tests/test_core.py | 50 ++++++++++++++++++++++++--------------------- 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index cc4537813..26e49cbb2 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -739,9 +739,7 @@ def gpu_stump( for i in range(1, len(device_ids)): # Update (top-k) matrix profile and matrix profile indices - core._merge_topk_PI( - profile[0], profile[i], indices[0], indices[i] - ) + core._merge_topk_PI(profile[0], profile[i], indices[0], indices[i]) # Update (top-1) left matrix profile and matrix profil indices cond = profile_L[0] < profile_L[i] diff --git a/tests/test_core.py b/tests/test_core.py index e45f8c600..707893d14 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1031,29 +1031,33 @@ def test_select_P_ABBA_val_inf(): def test_merge_topk_PI(): - PA = np.array([ - [0.0, 0.0, 0.0, 0.0, 0.0], - [0.1, 0.2, 0.3, 0.4, 0.5], - [0.1, 0.2, 0.3, 0.4, 0.5], - [0.1, 0.2, 0.3, 0.4, 0.5], - [0.1, 0.2, 0.3, 0.4, 0.5], - [0.1, 0.2, 0.3, 0.4, 0.5], - [0.1, 0.1, 0.2, 0.3, 0.4], - [0.1, 0.2, np.inf, np.inf, np.inf], - [np.inf, np.inf, np.inf, np.inf, np.inf] - ]) - - PB = np.array([ - [0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.15, 0.25, 0.35, 0.45], - [0.15, 0.25, 0.35, 0.45, 0.55], - [0.01, 0.02, 0.03, 0.04, 0.05], - [0.6, 0.7, 0.8, 0.9, 1], - [0.1, 0.1, 0.2, 0.3, 0.4], - [0.1, 0.2, 0.3, 0.4, 0.5], - [0.0, 0.3, np.inf, np.inf, np.inf], - [np.inf, np.inf, np.inf, np.inf, np.inf], - ]) + PA = np.array( + [ + [0.0, 0.0, 0.0, 0.0, 0.0], + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.1, 0.1, 0.2, 0.3, 0.4], + [0.1, 0.2, np.inf, np.inf, np.inf], + [np.inf, np.inf, np.inf, np.inf, np.inf], + ] + ) + + PB = np.array( + [ + [0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.15, 0.25, 0.35, 0.45], + [0.15, 0.25, 0.35, 0.45, 0.55], + [0.01, 0.02, 0.03, 0.04, 0.05], + [0.6, 0.7, 0.8, 0.9, 1], + [0.1, 0.1, 0.2, 0.3, 0.4], + [0.1, 0.2, 0.3, 0.4, 0.5], + [0.0, 0.3, np.inf, np.inf, np.inf], + [np.inf, np.inf, np.inf, np.inf, np.inf], + ] + ) n, k = PA.shape From 50f4ee8cf84b6f5958b9691d23d961a26d5f06b5 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 17:58:52 -0600 Subject: [PATCH 103/416] Correct Style --- stumpy/aamp.py | 3 ++- stumpy/aamped.py | 3 ++- stumpy/gpu_stump.py | 24 ++++++++++++++---------- stumpy/stump.py | 24 ++++++++++++++---------- stumpy/stumped.py | 18 ++++++++++-------- 5 files changed, 42 insertions(+), 30 deletions(-) diff --git a/stumpy/aamp.py b/stumpy/aamp.py index 428c3d4bd..82eb41639 100644 --- a/stumpy/aamp.py +++ b/stumpy/aamp.py @@ -270,7 +270,8 @@ def aamp(T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): k : int, default 1 The number of top `k` smallest distances used to construct the matrix profile. - Note that this will increase the total computational time and memory usage when k > 1. + Note that this will increase the total computational time and memory usage + when k > 1. 
Returns ------- diff --git a/stumpy/aamped.py b/stumpy/aamped.py index ad147b42f..4499c58b5 100644 --- a/stumpy/aamped.py +++ b/stumpy/aamped.py @@ -49,7 +49,8 @@ def aamped(dask_client, T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): k : int, default 1 The number of top `k` smallest distances used to construct the matrix profile. - Note that this will increase the total computational time and memory usage when k > 1. + Note that this will increase the total computational time and memory usage + when k > 1. Returns ------- diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 26e49cbb2..15583c58e 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -118,7 +118,8 @@ def _compute_and_update_PI_kernel( k : int The number of top `k` smallest distances used to construct the matrix profile. - Note that this will increase the total computational time and memory usage when k > 1. + Note that this will increase the total computational time and memory usage + when k > 1. Returns ------- @@ -275,7 +276,8 @@ def _gpu_stump( k : int The number of top `k` smallest distances used to construct the matrix profile. - Note that this will increase the total computational time and memory usage when k > 1. + Note that this will increase the total computational time and memory usage + when k > 1. Returns ------- @@ -486,20 +488,22 @@ def gpu_stump( k : int, default 1 The number of top `k` smallest distances used to construct the matrix profile. - Note that this will increase the total computational time and memory usage when k > 1. + Note that this will increase the total computational time and memory usage + when k > 1. Returns ------- out : numpy.ndarray When k = 1 (default), the first column consists of the matrix profile, the second column consists of the matrix profile indices, the third column - consists of the left matrix profile indices, and the fourth column consists of - the right matrix profile indices. However, when k > 1, the output array will - contain exactly 2 * k + 2 columns. The first k columns (i.e., out[:, :k]) consists - of the top-k matrix profile, the next set of k columns (i.e., out[:, k:2k]) consists - of the corresponding top-k matrix profile indices, and the last two columns - (i.e., out[:, 2k] and out[:, 2k+1] or, equivalently, out[:, -2] and out[:, -1]) correspond to - the top-1 left matrix profile indices and the top-1 right matrix profile indices, respectively. + consists of the left matrix profile indices, and the fourth column consists + of the right matrix profile indices. However, when k > 1, the output array + will contain exactly 2 * k + 2 columns. The first k columns (i.e., out[:, :k]) + consists of the top-k matrix profile, the next set of k columns + (i.e., out[:, k:2k]) consists of the corresponding top-k matrix profile + indices, and the last two columns (i.e., out[:, 2k] and out[:, 2k+1] or, + equivalently, out[:, -2] and out[:, -1]) correspond to the top-1 left + matrix profile indices and the top-1 right matrix profile indices, respectively. See Also -------- diff --git a/stumpy/stump.py b/stumpy/stump.py index bcf0d4103..f5a5fe811 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -144,7 +144,8 @@ def _compute_diagonal( k : int The number of top `k` smallest distances used to construct the matrix profile. - Note that this will increase the total computational time and memory usage when k > 1. + Note that this will increase the total computational time and memory usage + when k > 1. 
Returns ------- @@ -327,7 +328,8 @@ def _stump( k : int The number of top `k` smallest distances used to construct the matrix profile. - Note that this will increase the total computational time and memory usage when k > 1. + Note that this will increase the total computational time and memory usage + when k > 1. Returns ------- @@ -547,20 +549,22 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): k : int, default 1 The number of top `k` smallest distances used to construct the matrix profile. - Note that this will increase the total computational time and memory usage when k > 1. + Note that this will increase the total computational time and memory usage + when k > 1. Returns ------- out : numpy.ndarray When k = 1 (default), the first column consists of the matrix profile, the second column consists of the matrix profile indices, the third column - consists of the left matrix profile indices, and the fourth column consists of - the right matrix profile indices. However, when k > 1, the output array will - contain exactly 2 * k + 2 columns. The first k columns (i.e., out[:, :k]) consists - of the top-k matrix profile, the next set of k columns (i.e., out[:, k:2k]) consists - of the corresponding top-k matrix profile indices, and the last two columns - (i.e., out[:, 2k] and out[:, 2k+1] or, equivalently, out[:, -2] and out[:, -1]) correspond to - the top-1 left matrix profile indices and the top-1 right matrix profile indices, respectively. + consists of the left matrix profile indices, and the fourth column consists + of the right matrix profile indices. However, when k > 1, the output array + will contain exactly 2 * k + 2 columns. The first k columns (i.e., out[:, :k]) + consists of the top-k matrix profile, the next set of k columns + (i.e., out[:, k:2k]) consists of the corresponding top-k matrix profile + indices, and the last two columns (i.e., out[:, 2k] and out[:, 2k+1] or, + equivalently, out[:, -2] and out[:, -1]) correspond to the top-1 left + matrix profile indices and the top-1 right matrix profile indices, respectively. See Also -------- diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 0f6459db5..f98338ce9 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -58,20 +58,22 @@ def stumped( k : int, default 1 The number of top `k` smallest distances used to construct the matrix profile. - Note that this will increase the total computational time and memory usage when k > 1. + Note that this will increase the total computational time and memory usage + when k > 1. Returns ------- out : numpy.ndarray When k = 1 (default), the first column consists of the matrix profile, the second column consists of the matrix profile indices, the third column - consists of the left matrix profile indices, and the fourth column consists of - the right matrix profile indices. However, when k > 1, the output array will - contain exactly 2 * k + 2 columns. The first k columns (i.e., out[:, :k]) consists - of the top-k matrix profile, the next set of k columns (i.e., out[:, k:2k]) consists - of the corresponding top-k matrix profile indices, and the last two columns - (i.e., out[:, 2k] and out[:, 2k+1] or, equivalently, out[:, -2] and out[:, -1]) correspond to - the top-1 left matrix profile indices and the top-1 right matrix profile indices, respectively. + consists of the left matrix profile indices, and the fourth column consists + of the right matrix profile indices. However, when k > 1, the output array + will contain exactly 2 * k + 2 columns. 
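To make the 2 * k + 2 column layout concrete, a caller could unpack the output as below. This is only a sketch with toy inputs and it assumes a STUMPY build that already includes the `k` parameter added in this patch series:

import numpy as np
import stumpy

T = np.random.rand(64)   # toy time series
m, k = 8, 3
out = stumpy.stump(T, m, k=k)

P = out[:, :k].astype(np.float64)             # top-k matrix profile
I = out[:, k : 2 * k].astype(np.int64)        # top-k matrix profile indices
I_left = out[:, 2 * k].astype(np.int64)       # top-1 left matrix profile indices
I_right = out[:, 2 * k + 1].astype(np.int64)  # top-1 right matrix profile indices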
The first k columns (i.e., out[:, :k]) + consists of the top-k matrix profile, the next set of k columns + (i.e., out[:, k:2k]) consists of the corresponding top-k matrix profile + indices, and the last two columns (i.e., out[:, 2k] and out[:, 2k+1] or, + equivalently, out[:, -2] and out[:, -1]) correspond to the top-1 left + matrix profile indices and the top-1 right matrix profile indices, respectively. See Also -------- From 5b7da52bf1a936a147d47321e06653a67da1db29 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 18:02:04 -0600 Subject: [PATCH 104/416] Add parameter k to avoid failure in non-normalized decorater unit test --- stumpy/gpu_aamp.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/stumpy/gpu_aamp.py b/stumpy/gpu_aamp.py index e62be7b02..0c9a21a85 100644 --- a/stumpy/gpu_aamp.py +++ b/stumpy/gpu_aamp.py @@ -339,7 +339,9 @@ def _gpu_aamp( return profile_fname, indices_fname -def gpu_aamp(T_A, m, T_B=None, ignore_trivial=True, device_id=0, p=2.0): +def gpu_aamp(T_A, m, T_B=None, ignore_trivial=True, device_id=0, p=2.0, k=1): + # function needs to be revised to return (top-k) matrix profile and + # matrix profile indices """ Compute the non-normalized (i.e., without z-normalization) matrix profile with one or more GPU devices @@ -375,6 +377,11 @@ def gpu_aamp(T_A, m, T_B=None, ignore_trivial=True, device_id=0, p=2.0): p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. + Returns ------- out : numpy.ndarray From e983ef0997ac2e4bcf1c14387be5ec617ec66a4d Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 20:34:58 -0600 Subject: [PATCH 105/416] Skip a for-loop in unit test coverage --- stumpy/gpu_stump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 15583c58e..0d76e19b6 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -741,7 +741,7 @@ def gpu_stump( os.remove(indices_L_fname) os.remove(indices_R_fname) - for i in range(1, len(device_ids)): + for i in range(1, len(device_ids)): # pragma: no cover # Update (top-k) matrix profile and matrix profile indices core._merge_topk_PI(profile[0], profile[i], indices[0], indices[i]) From b0c5cace4951f97b201f7b42ca0d9627c22bf890 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 20:37:07 -0600 Subject: [PATCH 106/416] All tests pass From 787e3f761162475e556c7fb4bbc252796fa2f9a6 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 21:20:50 -0600 Subject: [PATCH 107/416] Use randomly generated arrays for test function --- tests/test_core.py | 77 ++++++++++++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 33 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 707893d14..21e08fd76 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1031,42 +1031,53 @@ def test_select_P_ABBA_val_inf(): def test_merge_topk_PI(): - PA = np.array( - [ - [0.0, 0.0, 0.0, 0.0, 0.0], - [0.1, 0.2, 0.3, 0.4, 0.5], - [0.1, 0.2, 0.3, 0.4, 0.5], - [0.1, 0.2, 0.3, 0.4, 0.5], - [0.1, 0.2, 0.3, 0.4, 0.5], - [0.1, 0.2, 0.3, 0.4, 0.5], - [0.1, 0.1, 0.2, 0.3, 0.4], - [0.1, 0.2, np.inf, np.inf, np.inf], - [np.inf, np.inf, np.inf, np.inf, np.inf], - ] - ) - - PB = np.array( - [ - [0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.15, 0.25, 0.35, 0.45], - [0.15, 0.25, 0.35, 0.45, 
0.55], - [0.01, 0.02, 0.03, 0.04, 0.05], - [0.6, 0.7, 0.8, 0.9, 1], - [0.1, 0.1, 0.2, 0.3, 0.4], - [0.1, 0.2, 0.3, 0.4, 0.5], - [0.0, 0.3, np.inf, np.inf, np.inf], - [np.inf, np.inf, np.inf, np.inf, np.inf], - ] - ) - - n, k = PA.shape + n=50 + k=5 + + PA = np.random.randint(0, 5, size=(n, k)) + PA = np.sort(PA) + + PB = np.random.randint(0, 5, size=(n, k)) + PB = np.sort(PB) + + #PA = np.array( + # [ + # [0.0, 0.0, 0.0, 0.0, 0.0], + # [0.1, 0.2, 0.3, 0.4, 0.5], + # [0.1, 0.2, 0.3, 0.4, 0.5], + # [0.1, 0.2, 0.3, 0.4, 0.5], + # [0.1, 0.2, 0.3, 0.4, 0.5], + # [0.1, 0.2, 0.3, 0.4, 0.5], + # [0.1, 0.1, 0.2, 0.3, 0.4], + # [0.1, 0.2, np.inf, np.inf, np.inf], + # [np.inf, np.inf, np.inf, np.inf, np.inf], + # ] + #) + + #PB = np.array( + # [ + # [0.0, 0.0, 0.0, 0.0, 0.0], + # [0.0, 0.15, 0.25, 0.35, 0.45], + # [0.15, 0.25, 0.35, 0.45, 0.55], + # [0.01, 0.02, 0.03, 0.04, 0.05], + # [0.6, 0.7, 0.8, 0.9, 1], + # [0.1, 0.1, 0.2, 0.3, 0.4], + # [0.1, 0.2, 0.3, 0.4, 0.5], + # [0.0, 0.3, np.inf, np.inf, np.inf], + # [np.inf, np.inf, np.inf, np.inf, np.inf], + # ] + #) IA = np.arange(n * k).reshape(n, k) IB = IA.copy() + n * k - IA[7, 2:] = -1 - IA[8, :] = -1 - IB[7, 2:] = -1 - IB[8, :] = -1 + + #n, k = PA.shape + #IA = np.arange(n * k).reshape(n, k) + #IB = IA.copy() + n * k + #IA[7, 2:] = -1 + #IA[8, :] = -1 + #IB[7, 2:] = -1 + #IB[8, :] = -1 ref_P = PA.copy() ref_I = IA.copy() From 2ff2b85d7a4ec6f3bbea3dd21c05681a15e62dc7 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 23:01:32 -0600 Subject: [PATCH 108/416] Add minor comment --- stumpy/core.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/stumpy/core.py b/stumpy/core.py index 89b6266fc..7528d5f85 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2525,6 +2525,8 @@ def _merge_topk_PI(PA, PB, IA, IB): if PB[i, j] < PA[i, -1]: idx = np.searchsorted(PA[i], PB[i, j], side="right") + # .copy() operation is needed to resolve wrong result that is + # caused by "prange" PA[i, idx + 1 :] = PA[i, idx:-1].copy() PA[i, idx] = PB[i, j] IA[i, idx + 1 :] = IA[i, idx:-1].copy() From c3060278426d583fa4a35c41b0c8758f8aa857a8 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 22 May 2022 23:04:40 -0600 Subject: [PATCH 109/416] Erase unnecessary comments --- tests/test_core.py | 40 ++-------------------------------------- 1 file changed, 2 insertions(+), 38 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 21e08fd76..a1efbf681 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1031,8 +1031,8 @@ def test_select_P_ABBA_val_inf(): def test_merge_topk_PI(): - n=50 - k=5 + n = 50 + k = 5 PA = np.random.randint(0, 5, size=(n, k)) PA = np.sort(PA) @@ -1040,45 +1040,9 @@ def test_merge_topk_PI(): PB = np.random.randint(0, 5, size=(n, k)) PB = np.sort(PB) - #PA = np.array( - # [ - # [0.0, 0.0, 0.0, 0.0, 0.0], - # [0.1, 0.2, 0.3, 0.4, 0.5], - # [0.1, 0.2, 0.3, 0.4, 0.5], - # [0.1, 0.2, 0.3, 0.4, 0.5], - # [0.1, 0.2, 0.3, 0.4, 0.5], - # [0.1, 0.2, 0.3, 0.4, 0.5], - # [0.1, 0.1, 0.2, 0.3, 0.4], - # [0.1, 0.2, np.inf, np.inf, np.inf], - # [np.inf, np.inf, np.inf, np.inf, np.inf], - # ] - #) - - #PB = np.array( - # [ - # [0.0, 0.0, 0.0, 0.0, 0.0], - # [0.0, 0.15, 0.25, 0.35, 0.45], - # [0.15, 0.25, 0.35, 0.45, 0.55], - # [0.01, 0.02, 0.03, 0.04, 0.05], - # [0.6, 0.7, 0.8, 0.9, 1], - # [0.1, 0.1, 0.2, 0.3, 0.4], - # [0.1, 0.2, 0.3, 0.4, 0.5], - # [0.0, 0.3, np.inf, np.inf, np.inf], - # [np.inf, np.inf, np.inf, np.inf, np.inf], - # ] - #) - IA = np.arange(n * k).reshape(n, k) IB = IA.copy() + n * k - #n, k = PA.shape - #IA = 
np.arange(n * k).reshape(n, k) - #IB = IA.copy() + n * k - #IA[7, 2:] = -1 - #IA[8, :] = -1 - #IB[7, 2:] = -1 - #IB[8, :] = -1 - ref_P = PA.copy() ref_I = IA.copy() From 898e9f366d3d4a0cebc1bfdddd20c722a2594f26 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 23 May 2022 11:56:42 -0600 Subject: [PATCH 110/416] Remove unnecessary copy operation --- tests/test_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_core.py b/tests/test_core.py index a1efbf681..3fa1447bd 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1041,7 +1041,7 @@ def test_merge_topk_PI(): PB = np.sort(PB) IA = np.arange(n * k).reshape(n, k) - IB = IA.copy() + n * k + IB = IA + n * k ref_P = PA.copy() ref_I = IA.copy() From 3541faec462fc0869af9bcb3b6eafc93469ebc21 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 24 May 2022 11:49:05 -0600 Subject: [PATCH 111/416] Major revision in function _merge_topk_PI - use PB to get number of iterations for the two most outer for-loops - improve Docstring - use start and stop to narrow down the search space - use for-loop instead of .copy() operation. --- stumpy/core.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index bce98964d..0cc858f93 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2505,10 +2505,12 @@ def _merge_topk_PI(PA, PB, IA, IB): Parameters ---------- PA : numpy.ndarray - a (top-k) matrix profile + a (top-k) matrix profile, with ndim of 2, where values in each row are + sorted in ascending order. Also, it needs to be the same shape as PB. PB : numpy.ndarray - a (top-k) matrix profile + a (top-k) matrix profile, with ndim of 2, where values in each row are + sorted in ascending order. Also, it needs to be the same shape as PA. IA : numpy.ndarray a (top-k) matrix profile indices, corresponding to PA @@ -2520,14 +2522,20 @@ def _merge_topk_PI(PA, PB, IA, IB): ------- None """ - for i in prange(PA.shape[0]): - for j in range(PA.shape[1]): + for i in prange(PB.shape[0]): + start = 0 + stop = np.searchsorted(PA[i], PB[i, -1], side="right") + + for j in range(PB.shape[1]): if PB[i, j] < PA[i, -1]: - idx = np.searchsorted(PA[i], PB[i, j], side="right") + idx = np.searchsorted(PA[i, start:stop], PB[i, j], side="right") + start + + for g in range(PB.shape[1] - 1, idx, -1): + PA[i, g] = PA[i, g - 1] + IA[i, g] = IA[i, g - 1] - # .copy() operation is needed to resolve wrong result that is - # caused by "prange" - PA[i, idx + 1 :] = PA[i, idx:-1].copy() PA[i, idx] = PB[i, j] - IA[i, idx + 1 :] = IA[i, idx:-1].copy() IA[i, idx] = IB[i, j] + + start = idx + stop += 1 # because of shifting elements to the right by one From ce8cd4c599b8763519b483a4c9c3f695dc445350 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 28 May 2022 00:54:00 -0600 Subject: [PATCH 112/416] Add device function to find insertion index into sorted array --- stumpy/core.py | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/stumpy/core.py b/stumpy/core.py index 535471761..200980648 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2604,3 +2604,48 @@ def _merge_topk_PI(PA, PB, IA, IB): start = idx stop += 1 # because of shifting elements to the right by one + + +@cuda.jit("i8(f8[:], f8, i8[:], i8)", device=True) +def _gpu_searchsorted_right(a, v, bfs, nlevel): + """ + a device function in replace of numpy.searchsorted(a, v, side='right') + + Parameters + ---------- + a : numpy.ndarray + 1-dim array sorted in ascending order. 
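Keeping every row of the profile sorted is what makes each update cheap: a new distance only has to be spliced into place and the current largest value dropped. In plain NumPy, with a toy row and a hypothetical new distance and index, the update pattern behind `_merge_topk_PI` and the kernel looks roughly like this:

import numpy as np

P = np.array([0.2, 0.4, 0.6, np.inf])  # one top-k row, k = 4
I = np.array([7, 3, 9, -1], dtype=np.int64)
d, j = 0.5, 11                         # hypothetical new distance and its index

if d < P[-1]:                          # only insert if it beats the current k-th value
    idx = np.searchsorted(P, d, side="right")
    P[idx + 1 :] = P[idx:-1].copy()    # shift the tail right by one
    I[idx + 1 :] = I[idx:-1].copy()
    P[idx], I[idx] = d, j

print(P)  # [0.2 0.4 0.5 0.6]
print(I)  # [ 7  3 11  9]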
+ + v : float + value to insert into array `a` + + bfs : numpy.ndarray + the level order indices from the implicit construction of a binary + search tree followed by a breadth first (level order) search. + + nlevel : int + the number of levels in the binary search tree based from which the array + `bfs` is obtained. + + Returns + ------- + idx : int + the index of the insertion point + """ + n = a.shape[0] + idx = 0 + for level in range(nlevel): + if v < a[bfs[idx]]: + next_idx = 2 * idx + 1 + else: + next_idx = 2 * idx + 2 + + if level == nlevel-1 or bfs[next_idx]<0: + if v < a[bfs[idx]]: + idx = max(bfs[idx], 0) + else: + idx = min(bfs[idx] + 1, n) + break + idx = next_idx + + return idx From 09bbe7fb689e47330aacac6737e56d5d0d416356 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 28 May 2022 01:11:01 -0600 Subject: [PATCH 113/416] Add test function for gpu_searchsorted --- tests/test_core.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/test_core.py b/tests/test_core.py index 4437149d8..7423718ab 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1086,3 +1086,37 @@ def test_merge_topk_PI(): ref = np.column_stack((ref_P, ref_I)) comp = np.column_stack((comp_P, comp_I)) npt.assert_array_equal(ref, comp) + + +def test_gpu_searchsorted(): + # define a function the same as `core._gpu_searchsorted_right` but + # without cuda.jit decorator. + def gpu_searchsorted_right(a, v, bfs, nlevel): + n = a.shape[0] + idx = 0 + for level in range(nlevel): + if v < a[bfs[idx]]: + next_idx = 2 * idx + 1 + else: + next_idx = 2 * idx + 2 + + if level == nlevel-1 or bfs[next_idx]<0: + if v < a[bfs[idx]]: + idx = max(bfs[idx], 0) + else: + idx = min(bfs[idx] + 1, n) + break + idx = next_idx + + return idx + + for n in range(1, 100): + a = np.sort(np.random.rand(n)) + bfs = core._bfs_indices(n, fill_value=-1) + nlevel = np.floor(np.log2(n) + 1).astype(np.int64) + for i in range(n): + v = a[i] + npt.assert_almost_equal(gpu_searchsorted_right(a, v, bfs, nlevel), np.searchsorted(a, v, side="right")) + + v = a[i] + 0.001 + npt.assert_almost_equal(gpu_searchsorted_right(a, v, bfs, nlevel), np.searchsorted(a, v, side="right")) From 4948667e38c3b76c1421f4ccf0aedf05c9d82f96 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 28 May 2022 01:13:04 -0600 Subject: [PATCH 114/416] Correct format --- stumpy/core.py | 2 +- tests/test_core.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 200980648..3245bd216 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2640,7 +2640,7 @@ def _gpu_searchsorted_right(a, v, bfs, nlevel): else: next_idx = 2 * idx + 2 - if level == nlevel-1 or bfs[next_idx]<0: + if level == nlevel - 1 or bfs[next_idx] < 0: if v < a[bfs[idx]]: idx = max(bfs[idx], 0) else: diff --git a/tests/test_core.py b/tests/test_core.py index 7423718ab..152a58a01 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1100,7 +1100,7 @@ def gpu_searchsorted_right(a, v, bfs, nlevel): else: next_idx = 2 * idx + 2 - if level == nlevel-1 or bfs[next_idx]<0: + if level == nlevel - 1 or bfs[next_idx] < 0: if v < a[bfs[idx]]: idx = max(bfs[idx], 0) else: @@ -1116,7 +1116,13 @@ def gpu_searchsorted_right(a, v, bfs, nlevel): nlevel = np.floor(np.log2(n) + 1).astype(np.int64) for i in range(n): v = a[i] - npt.assert_almost_equal(gpu_searchsorted_right(a, v, bfs, nlevel), np.searchsorted(a, v, side="right")) + npt.assert_almost_equal( + gpu_searchsorted_right(a, v, bfs, nlevel), + np.searchsorted(a, 
v, side="right"), + ) v = a[i] + 0.001 - npt.assert_almost_equal(gpu_searchsorted_right(a, v, bfs, nlevel), np.searchsorted(a, v, side="right")) + npt.assert_almost_equal( + gpu_searchsorted_right(a, v, bfs, nlevel), + np.searchsorted(a, v, side="right"), + ) From cdd7a334ac69408a2ba6810f521b5419afc9ed02 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 28 May 2022 01:25:55 -0600 Subject: [PATCH 115/416] Fixed minor bug --- stumpy/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/core.py b/stumpy/core.py index 3245bd216..e2688459e 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -7,7 +7,7 @@ import inspect import numpy as np -from numba import njit, prange +from numba import cuda, njit, prange from scipy.signal import convolve from scipy.ndimage import maximum_filter1d, minimum_filter1d from scipy import linalg From 71ade4772dce47a9765c8f5081a02b523d8501fd Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 28 May 2022 11:52:57 -0600 Subject: [PATCH 116/416] Fixed the name of a variable --- stumpy/gpu_stump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 0d76e19b6..1b82707fb 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -319,7 +319,7 @@ def _gpu_stump( Note that left and right matrix profiles are only available for self-joins. """ threads_per_block = config.STUMPY_THREADS_PER_BLOCK - blocks_per_grid = math.ceil(k / threads_per_block) + blocks_per_grid = math.ceil(profile_len / threads_per_block) T_A = np.load(T_A_fname, allow_pickle=False) T_B = np.load(T_B_fname, allow_pickle=False) From ac472fc2331f6ef03e2fb5b08fb0c05090d15341 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 28 May 2022 15:02:19 -0600 Subject: [PATCH 117/416] Fixed grammatical error in docstring --- stumpy/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/core.py b/stumpy/core.py index e2688459e..9a7b1012b 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2624,7 +2624,7 @@ def _gpu_searchsorted_right(a, v, bfs, nlevel): search tree followed by a breadth first (level order) search. nlevel : int - the number of levels in the binary search tree based from which the array + the number of levels in the binary search tree from which the array `bfs` is obtained. Returns From fc149e688ce2f1bdac409ec66b7376a881edcd21 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 28 May 2022 15:09:01 -0600 Subject: [PATCH 118/416] Use device function for searchsorting --- stumpy/gpu_stump.py | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 1b82707fb..d8ad43fe8 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -17,7 +17,7 @@ @cuda.jit( "(i8, f8[:], f8[:], i8, f8[:], f8[:], f8[:], f8[:], f8[:]," - "f8[:], f8[:], i8, b1, i8, f8[:, :], f8[:], f8[:], i8[:, :], i8[:], i8[:], b1, i2)" + "f8[:], f8[:], i8, b1, i8, f8[:, :], f8[:], f8[:], i8[:, :], i8[:], i8[:], b1, i8[:], i8, i2)" ) def _compute_and_update_PI_kernel( i, @@ -41,6 +41,8 @@ def _compute_and_update_PI_kernel( indices_L, indices_R, compute_QT, + bfs, + nlevel, k, ): """ @@ -116,6 +118,14 @@ def _compute_and_update_PI_kernel( compute_QT : bool A boolean flag for whether or not to compute QT + bfs : numpy.ndarray + the level order indices from the implicit construction of a binary + search tree followed by a breadth first (level order) search. 
+ + nlevel : int + the number of levels in the binary search tree from which the array + `bfs` is obtained. + k : int The number of top `k` smallest distances used to construct the matrix profile. Note that this will increase the total computational time and memory usage @@ -182,13 +192,12 @@ def _compute_and_update_PI_kernel( profile_R[j] = p_norm indices_R[j] = i - for idx in range(k, -1, -1): - if (p_norm < profile[j, idx - 1]) and (idx > 0): - profile[j, idx - 1] = profile[j, idx - 2] - indices[j, idx - 1] = indices[j, idx - 2] - else: - break - if idx < k: + if p_norm < profile[j, -1]: + idx = core._gpu_searchsorted_right(profile[j], p_norm, bfs, nlevel) + for g in range(k - 1, idx, -1): + profile[j, g] = profile[j, g - 1] + indices[j, g] = indices[j, g - 1] + profile[j, idx] = p_norm indices[j, idx] = i @@ -318,6 +327,10 @@ def _gpu_stump( Note that left and right matrix profiles are only available for self-joins. """ + bfs = core._bfs_indices(k, fill_value=-1) + nlevel = np.floor(np.log2(k) + 1).astype(np.int64) # number of levels in + # binary seearch tree from which `bfs` is constructed. + threads_per_block = config.STUMPY_THREADS_PER_BLOCK blocks_per_grid = math.ceil(profile_len / threads_per_block) @@ -384,6 +397,8 @@ def _gpu_stump( device_indices_L, device_indices_R, False, + bfs, + nlevel, k, ) @@ -410,6 +425,8 @@ def _gpu_stump( device_indices_L, device_indices_R, True, + bfs, + nlevel, k, ) From 7ac67a8302ddcbd0d3affc0538b891fe19a92b17 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 28 May 2022 15:16:31 -0600 Subject: [PATCH 119/416] Correct style --- stumpy/core.py | 2 +- stumpy/gpu_stump.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 9a7b1012b..101813759 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2609,7 +2609,7 @@ def _merge_topk_PI(PA, PB, IA, IB): @cuda.jit("i8(f8[:], f8, i8[:], i8)", device=True) def _gpu_searchsorted_right(a, v, bfs, nlevel): """ - a device function in replace of numpy.searchsorted(a, v, side='right') + Device function to replace numpy.searchsorted(a, v, side='right') Parameters ---------- diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index d8ad43fe8..1a379eda0 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -17,7 +17,8 @@ @cuda.jit( "(i8, f8[:], f8[:], i8, f8[:], f8[:], f8[:], f8[:], f8[:]," - "f8[:], f8[:], i8, b1, i8, f8[:, :], f8[:], f8[:], i8[:, :], i8[:], i8[:], b1, i8[:], i8, i2)" + "f8[:], f8[:], i8, b1, i8, f8[:, :], f8[:], f8[:], i8[:, :], i8[:], i8[:]," + "b1, i8[:], i8, i2)" ) def _compute_and_update_PI_kernel( i, From 92467e24387e490b0289a37738261904ce3148d7 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 28 May 2022 17:27:06 -0600 Subject: [PATCH 120/416] Remove signature from cuda device function --- stumpy/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/core.py b/stumpy/core.py index 101813759..f342d888e 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2606,7 +2606,7 @@ def _merge_topk_PI(PA, PB, IA, IB): stop += 1 # because of shifting elements to the right by one -@cuda.jit("i8(f8[:], f8, i8[:], i8)", device=True) +@cuda.jit(device=True) def _gpu_searchsorted_right(a, v, bfs, nlevel): """ Device function to replace numpy.searchsorted(a, v, side='right') From bdfb258ea5e516f1c064141fe3d1d15dc895b858 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 28 May 2022 20:37:23 -0600 Subject: [PATCH 121/416] Full Coverage confirmed From bb5de99711bd580b77cc407cc5091ace97839c5c Mon Sep 17 00:00:00 2001 
From: ninimama Date: Sat, 28 May 2022 20:45:12 -0600 Subject: [PATCH 122/416] revising the definiton of parameter bfs in docstring --- stumpy/core.py | 4 ++-- stumpy/gpu_stump.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index f342d888e..54eb29e4c 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2620,8 +2620,8 @@ def _gpu_searchsorted_right(a, v, bfs, nlevel): value to insert into array `a` bfs : numpy.ndarray - the level order indices from the implicit construction of a binary - search tree followed by a breadth first (level order) search. + The breadth-first-search indices where the missing leaves of its corresponding + binary search tree are filled with -1. nlevel : int the number of levels in the binary search tree from which the array diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 1a379eda0..d8d877078 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -120,8 +120,8 @@ def _compute_and_update_PI_kernel( A boolean flag for whether or not to compute QT bfs : numpy.ndarray - the level order indices from the implicit construction of a binary - search tree followed by a breadth first (level order) search. + The breadth-first-search indices where the missing leaves of its corresponding + binary search tree are filled with -1. nlevel : int the number of levels in the binary search tree from which the array From a005a415482dbce75a6030a5e0a3e98118cad333 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 30 May 2022 01:56:24 -0600 Subject: [PATCH 123/416] Copy array into device memory before passing it to kernel function --- stumpy/gpu_stump.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index d8d877078..a7682f52f 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -328,10 +328,6 @@ def _gpu_stump( Note that left and right matrix profiles are only available for self-joins. """ - bfs = core._bfs_indices(k, fill_value=-1) - nlevel = np.floor(np.log2(k) + 1).astype(np.int64) # number of levels in - # binary seearch tree from which `bfs` is constructed. - threads_per_block = config.STUMPY_THREADS_PER_BLOCK blocks_per_grid = math.ceil(profile_len / threads_per_block) @@ -344,6 +340,11 @@ def _gpu_stump( μ_Q = np.load(μ_Q_fname, allow_pickle=False) σ_Q = np.load(σ_Q_fname, allow_pickle=False) + + device_bfs = cuda.to_device(core._bfs_indices(k, fill_value=-1)) + nlevel = np.floor(np.log2(k) + 1).astype(np.int64) + # number of levels in # binary seearch tree from which `bfs` is constructed. 
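The change above follows the usual Numba pattern of shipping a small read-only lookup array to the GPU once and reusing the device handle across kernel launches, rather than letting each launch re-copy it. A minimal sketch with hypothetical array contents, guarded so it only runs where CUDA is available:

import numpy as np
from numba import cuda

bfs_host = np.array([3, 1, 5, 0, 2, 4, 6], dtype=np.int64)  # BFS order of a 7-node BST
if cuda.is_available():
    device_bfs = cuda.to_device(bfs_host)  # pass this handle to every kernel launch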
+ with cuda.gpus[device_id]: device_T_A = cuda.to_device(T_A) device_QT_odd = cuda.to_device(QT) @@ -398,7 +399,7 @@ def _gpu_stump( device_indices_L, device_indices_R, False, - bfs, + device_bfs, nlevel, k, ) @@ -426,7 +427,7 @@ def _gpu_stump( device_indices_L, device_indices_R, True, - bfs, + device_bfs, nlevel, k, ) From ade9bb4f37295d1c0ae831672356a27ce625ff31 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 30 May 2022 16:58:35 -0600 Subject: [PATCH 124/416] use float values for generating arrays --- tests/test_core.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 152a58a01..e25d6a664 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1065,11 +1065,14 @@ def test_merge_topk_PI(): n = 50 k = 5 - PA = np.random.randint(0, 5, size=(n, k)) - PA = np.sort(PA) + PA = np.random.rand(n * k).reshape(n, k) + PA = np.sort(PA, axis=1) - PB = np.random.randint(0, 5, size=(n, k)) - PB = np.sort(PB) + PB = np.random.rand(n * k).reshape(n, k) + col_idx = np.random.randint(0, k, size=n) + for i in range(n): + PB[i, col_idx[i]] = np.random.choice(PA[i], size=1, replace=False) + PB = np.sort(PB, axis=1) IA = np.arange(n * k).reshape(n, k) IB = IA + n * k @@ -1083,9 +1086,8 @@ def test_merge_topk_PI(): naive.merge_topk_PI(ref_P, PB, ref_I, IB) core._merge_topk_PI(comp_P, PB, comp_I, IB) - ref = np.column_stack((ref_P, ref_I)) - comp = np.column_stack((comp_P, comp_I)) - npt.assert_array_equal(ref, comp) + npt.assert_array_equal(ref_P, comp_P) + npt.assert_array_equal(ref_I, comp_I) def test_gpu_searchsorted(): From 853c2ec805e37b7839983856f9ba0e882da3730a Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 31 May 2022 06:27:54 -0600 Subject: [PATCH 125/416] move device function to gpu_stump module --- stumpy/core.py | 45 --------------------------------------------- stumpy/gpu_stump.py | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 45 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 54eb29e4c..0ebb5ae50 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2604,48 +2604,3 @@ def _merge_topk_PI(PA, PB, IA, IB): start = idx stop += 1 # because of shifting elements to the right by one - - -@cuda.jit(device=True) -def _gpu_searchsorted_right(a, v, bfs, nlevel): - """ - Device function to replace numpy.searchsorted(a, v, side='right') - - Parameters - ---------- - a : numpy.ndarray - 1-dim array sorted in ascending order. - - v : float - value to insert into array `a` - - bfs : numpy.ndarray - The breadth-first-search indices where the missing leaves of its corresponding - binary search tree are filled with -1. - - nlevel : int - the number of levels in the binary search tree from which the array - `bfs` is obtained. 
- - Returns - ------- - idx : int - the index of the insertion point - """ - n = a.shape[0] - idx = 0 - for level in range(nlevel): - if v < a[bfs[idx]]: - next_idx = 2 * idx + 1 - else: - next_idx = 2 * idx + 2 - - if level == nlevel - 1 or bfs[next_idx] < 0: - if v < a[bfs[idx]]: - idx = max(bfs[idx], 0) - else: - idx = min(bfs[idx] + 1, n) - break - idx = next_idx - - return idx diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index a7682f52f..ec6db99d3 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -15,6 +15,51 @@ logger = logging.getLogger(__name__) +@cuda.jit(device=True) +def _gpu_searchsorted_right(a, v, bfs, nlevel): + """ + Device function to replace numpy.searchsorted(a, v, side='right') + + Parameters + ---------- + a : numpy.ndarray + 1-dim array sorted in ascending order. + + v : float + value to insert into array `a` + + bfs : numpy.ndarray + The breadth-first-search indices where the missing leaves of its corresponding + binary search tree are filled with -1. + + nlevel : int + the number of levels in the binary search tree from which the array + `bfs` is obtained. + + Returns + ------- + idx : int + the index of the insertion point + """ + n = a.shape[0] + idx = 0 + for level in range(nlevel): + if v < a[bfs[idx]]: + next_idx = 2 * idx + 1 + else: + next_idx = 2 * idx + 2 + + if level == nlevel - 1 or bfs[next_idx] < 0: + if v < a[bfs[idx]]: + idx = max(bfs[idx], 0) + else: + idx = min(bfs[idx] + 1, n) + break + idx = next_idx + + return idx + + @cuda.jit( "(i8, f8[:], f8[:], i8, f8[:], f8[:], f8[:], f8[:], f8[:]," "f8[:], f8[:], i8, b1, i8, f8[:, :], f8[:], f8[:], i8[:, :], i8[:], i8[:]," From e3b5119246a964bb46e560a525832fe68b397bf4 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 31 May 2022 06:29:33 -0600 Subject: [PATCH 126/416] Add gpu_searchsorted_left for the sake completeness --- stumpy/gpu_stump.py | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index ec6db99d3..c7f7aec16 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -15,6 +15,51 @@ logger = logging.getLogger(__name__) +@cuda.jit(device=True) +def _gpu_searchsorted_left(a, v, bfs, nlevel): + """ + Device function to replace numpy.searchsorted(a, v, side='left') + + Parameters + ---------- + a : numpy.ndarray + 1-dim array sorted in ascending order. + + v : float + value to insert into array `a` + + bfs : numpy.ndarray + The breadth-first-search indices where the missing leaves of its corresponding + binary search tree are filled with -1. + + nlevel : int + the number of levels in the binary search tree from which the array + `bfs` is obtained. 
+ + Returns + ------- + idx : int + the index of the insertion point + """ + n = a.shape[0] + idx = 0 + for level in range(nlevel): + if v <= a[bfs[idx]]: + next_idx = 2 * idx + 1 + else: + next_idx = 2 * idx + 2 + + if level == nlevel - 1 or bfs[next_idx] < 0: + if v <= a[bfs[idx]]: + idx = max(bfs[idx], 0) + else: + idx = min(bfs[idx] + 1, n) + break + idx = next_idx + + return idx + + @cuda.jit(device=True) def _gpu_searchsorted_right(a, v, bfs, nlevel): """ From c5779e551e2288f3db60ea93d9293cf60a70c2bd Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 31 May 2022 06:46:04 -0600 Subject: [PATCH 127/416] Move test function to test_gpu_stump --- tests/test_core.py | 40 ---------------------------------------- tests/test_gpu_stump.py | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 40 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index e25d6a664..528286061 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1088,43 +1088,3 @@ def test_merge_topk_PI(): npt.assert_array_equal(ref_P, comp_P) npt.assert_array_equal(ref_I, comp_I) - - -def test_gpu_searchsorted(): - # define a function the same as `core._gpu_searchsorted_right` but - # without cuda.jit decorator. - def gpu_searchsorted_right(a, v, bfs, nlevel): - n = a.shape[0] - idx = 0 - for level in range(nlevel): - if v < a[bfs[idx]]: - next_idx = 2 * idx + 1 - else: - next_idx = 2 * idx + 2 - - if level == nlevel - 1 or bfs[next_idx] < 0: - if v < a[bfs[idx]]: - idx = max(bfs[idx], 0) - else: - idx = min(bfs[idx] + 1, n) - break - idx = next_idx - - return idx - - for n in range(1, 100): - a = np.sort(np.random.rand(n)) - bfs = core._bfs_indices(n, fill_value=-1) - nlevel = np.floor(np.log2(n) + 1).astype(np.int64) - for i in range(n): - v = a[i] - npt.assert_almost_equal( - gpu_searchsorted_right(a, v, bfs, nlevel), - np.searchsorted(a, v, side="right"), - ) - - v = a[i] + 0.001 - npt.assert_almost_equal( - gpu_searchsorted_right(a, v, bfs, nlevel), - np.searchsorted(a, v, side="right"), - ) diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index 1a2662647..dfbf5e405 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -38,6 +38,24 @@ def test_gpu_stump_int_input(): with pytest.raises(TypeError): gpu_stump(np.arange(10), 5, ignore_trivial=True) +def test_gpu_searchsorted(): + for n in range(1, 100): + a = np.sort(np.random.rand(n)) + bfs = core._bfs_indices(n, fill_value=-1) + nlevel = np.floor(np.log2(n) + 1).astype(np.int64) + for i in range(n): + v = a[i] - 0.001 + npt.assert_almost_equal(gpu_stump._gpu_searchsorted_left(a, v, bfs, nlevel), np.searchsorted(a, v, side="left")) + npt.assert_almost_equal(gpu_stump._gpu_searchsorted_right(a, v, bfs, nlevel), np.searchsorted(a, v, side="right")) + + v = a[i] + npt.assert_almost_equal(gpu_stump._gpu_searchsorted_left(a, v, bfs, nlevel), np.searchsorted(a, v, side="left")) + npt.assert_almost_equal(gpu_stump._gpu_searchsorted_right(a, v, bfs, nlevel), np.searchsorted(a, v, side="right")) + + v = a[i] + 0.001 + npt.assert_almost_equal(gpu_stump._gpu_searchsorted_left(a, v, bfs, nlevel), np.searchsorted(a, v, side="left")) + npt.assert_almost_equal(gpu_stump._gpu_searchsorted_right(a, v, bfs, nlevel), np.searchsorted(a, v, side="right")) + @pytest.mark.filterwarnings("ignore", category=NumbaPerformanceWarning) @pytest.mark.parametrize("T_A, T_B", test_data) From 38e531c34a63e3f4a98f476c9501b705de2a2b29 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 31 May 2022 06:50:47 -0600 Subject: [PATCH 128/416] correct format --- 
stumpy/core.py | 2 +- stumpy/gpu_stump.py | 1 - tests/test_gpu_stump.py | 35 +++++++++++++++++++++++++++-------- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 0ebb5ae50..535471761 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -7,7 +7,7 @@ import inspect import numpy as np -from numba import cuda, njit, prange +from numba import njit, prange from scipy.signal import convolve from scipy.ndimage import maximum_filter1d, minimum_filter1d from scipy import linalg diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index c7f7aec16..22748e089 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -430,7 +430,6 @@ def _gpu_stump( μ_Q = np.load(μ_Q_fname, allow_pickle=False) σ_Q = np.load(σ_Q_fname, allow_pickle=False) - device_bfs = cuda.to_device(core._bfs_indices(k, fill_value=-1)) nlevel = np.floor(np.log2(k) + 1).astype(np.int64) # number of levels in # binary seearch tree from which `bfs` is constructed. diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index dfbf5e405..1e79fb577 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -1,7 +1,7 @@ import numpy as np import numpy.testing as npt import pandas as pd -from stumpy import gpu_stump +from stumpy import core, gpu_stump from stumpy import config from numba import cuda @@ -38,23 +38,42 @@ def test_gpu_stump_int_input(): with pytest.raises(TypeError): gpu_stump(np.arange(10), 5, ignore_trivial=True) + def test_gpu_searchsorted(): for n in range(1, 100): a = np.sort(np.random.rand(n)) bfs = core._bfs_indices(n, fill_value=-1) nlevel = np.floor(np.log2(n) + 1).astype(np.int64) for i in range(n): - v = a[i] - 0.001 - npt.assert_almost_equal(gpu_stump._gpu_searchsorted_left(a, v, bfs, nlevel), np.searchsorted(a, v, side="left")) - npt.assert_almost_equal(gpu_stump._gpu_searchsorted_right(a, v, bfs, nlevel), np.searchsorted(a, v, side="right")) + v = a[i] - 0.001 + npt.assert_almost_equal( + gpu_stump._gpu_searchsorted_left(a, v, bfs, nlevel), + np.searchsorted(a, v, side="left"), + ) + npt.assert_almost_equal( + gpu_stump._gpu_searchsorted_right(a, v, bfs, nlevel), + np.searchsorted(a, v, side="right"), + ) v = a[i] - npt.assert_almost_equal(gpu_stump._gpu_searchsorted_left(a, v, bfs, nlevel), np.searchsorted(a, v, side="left")) - npt.assert_almost_equal(gpu_stump._gpu_searchsorted_right(a, v, bfs, nlevel), np.searchsorted(a, v, side="right")) + npt.assert_almost_equal( + gpu_stump._gpu_searchsorted_left(a, v, bfs, nlevel), + np.searchsorted(a, v, side="left"), + ) + npt.assert_almost_equal( + gpu_stump._gpu_searchsorted_right(a, v, bfs, nlevel), + np.searchsorted(a, v, side="right"), + ) v = a[i] + 0.001 - npt.assert_almost_equal(gpu_stump._gpu_searchsorted_left(a, v, bfs, nlevel), np.searchsorted(a, v, side="left")) - npt.assert_almost_equal(gpu_stump._gpu_searchsorted_right(a, v, bfs, nlevel), np.searchsorted(a, v, side="right")) + npt.assert_almost_equal( + gpu_stump._gpu_searchsorted_left(a, v, bfs, nlevel), + np.searchsorted(a, v, side="left"), + ) + npt.assert_almost_equal( + gpu_stump._gpu_searchsorted_right(a, v, bfs, nlevel), + np.searchsorted(a, v, side="right"), + ) @pytest.mark.filterwarnings("ignore", category=NumbaPerformanceWarning) From 5a7b3c099419de1a09368f2930eabce410730693 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 31 May 2022 08:26:58 -0600 Subject: [PATCH 129/416] Fixed calling function --- stumpy/gpu_stump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/gpu_stump.py 
b/stumpy/gpu_stump.py index 22748e089..bf7e3b57d 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -284,7 +284,7 @@ def _compute_and_update_PI_kernel( indices_R[j] = i if p_norm < profile[j, -1]: - idx = core._gpu_searchsorted_right(profile[j], p_norm, bfs, nlevel) + idx = _gpu_searchsorted_right(profile[j], p_norm, bfs, nlevel) for g in range(k - 1, idx, -1): profile[j, g] = profile[j, g - 1] indices[j, g] = indices[j, g - 1] From e1b0d205e463fd2e02a906ab349ca492d303be27 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 31 May 2022 08:32:25 -0600 Subject: [PATCH 130/416] Make function callable from both CPU and GPU to avoid duplication for unit testing. --- stumpy/gpu_stump.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index bf7e3b57d..99a3ba839 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -7,7 +7,7 @@ import os import numpy as np -from numba import cuda +from numba import cuda, jit from . import core, config from .gpu_aamp import gpu_aamp @@ -15,10 +15,11 @@ logger = logging.getLogger(__name__) -@cuda.jit(device=True) +@jit # equivalent to `__host__ __device__` in C++ CUDA def _gpu_searchsorted_left(a, v, bfs, nlevel): """ - Device function to replace numpy.searchsorted(a, v, side='left') + A function equivalent to numpy.searchsorted(a, v, side='left'), designed + to be used mainly as device function Parameters ---------- @@ -60,7 +61,7 @@ def _gpu_searchsorted_left(a, v, bfs, nlevel): return idx -@cuda.jit(device=True) +@jit # equivalent to `__host__ __device__` in C++ CUDA def _gpu_searchsorted_right(a, v, bfs, nlevel): """ Device function to replace numpy.searchsorted(a, v, side='right') From 922544c3ae21d018db7600d4b466a2ae40d107fd Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 31 May 2022 08:46:07 -0600 Subject: [PATCH 131/416] Fixed calling function --- tests/test_gpu_stump.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index 1e79fb577..108ac0d91 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -2,6 +2,7 @@ import numpy.testing as npt import pandas as pd from stumpy import core, gpu_stump +from stumpy.gpu_stump import _gpu_searchsorted_left, _gpu_searchsorted_right from stumpy import config from numba import cuda @@ -47,31 +48,31 @@ def test_gpu_searchsorted(): for i in range(n): v = a[i] - 0.001 npt.assert_almost_equal( - gpu_stump._gpu_searchsorted_left(a, v, bfs, nlevel), + _gpu_searchsorted_left(a, v, bfs, nlevel), np.searchsorted(a, v, side="left"), ) npt.assert_almost_equal( - gpu_stump._gpu_searchsorted_right(a, v, bfs, nlevel), + _gpu_searchsorted_right(a, v, bfs, nlevel), np.searchsorted(a, v, side="right"), ) v = a[i] npt.assert_almost_equal( - gpu_stump._gpu_searchsorted_left(a, v, bfs, nlevel), + _gpu_searchsorted_left(a, v, bfs, nlevel), np.searchsorted(a, v, side="left"), ) npt.assert_almost_equal( - gpu_stump._gpu_searchsorted_right(a, v, bfs, nlevel), + _gpu_searchsorted_right(a, v, bfs, nlevel), np.searchsorted(a, v, side="right"), ) v = a[i] + 0.001 npt.assert_almost_equal( - gpu_stump._gpu_searchsorted_left(a, v, bfs, nlevel), + _gpu_searchsorted_left(a, v, bfs, nlevel), np.searchsorted(a, v, side="left"), ) npt.assert_almost_equal( - gpu_stump._gpu_searchsorted_right(a, v, bfs, nlevel), + _gpu_searchsorted_right(a, v, bfs, nlevel), np.searchsorted(a, v, side="right"), ) From 102979b1235e00744567484b97a774658d3b2e1d Mon Sep 17 00:00:00 2001 From: 
ninimama Date: Tue, 31 May 2022 08:51:31 -0600 Subject: [PATCH 132/416] Revised the test function for merge_topk_PI --- tests/test_core.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 528286061..a297dd3fa 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1066,11 +1066,12 @@ def test_merge_topk_PI(): k = 5 PA = np.random.rand(n * k).reshape(n, k) - PA = np.sort(PA, axis=1) + PA = np.sort(PA, axis=1) # sorting each row separately PB = np.random.rand(n * k).reshape(n, k) + col_idx = np.random.randint(0, k, size=n) - for i in range(n): + for i in range(n): # creating ties between values of PA and PB PB[i, col_idx[i]] = np.random.choice(PA[i], size=1, replace=False) PB = np.sort(PB, axis=1) From a8aecf6679a9dbb02be80cdf75cf55ce99ae6aae Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 31 May 2022 09:18:04 -0600 Subject: [PATCH 133/416] Revise docstrings --- stumpy/gpu_stump.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 99a3ba839..35bf3f12f 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -18,7 +18,7 @@ @jit # equivalent to `__host__ __device__` in C++ CUDA def _gpu_searchsorted_left(a, v, bfs, nlevel): """ - A function equivalent to numpy.searchsorted(a, v, side='left'), designed + Equivalent to numpy.searchsorted(a, v, side='left'), designed to be used mainly as device function Parameters @@ -64,7 +64,8 @@ def _gpu_searchsorted_left(a, v, bfs, nlevel): @jit # equivalent to `__host__ __device__` in C++ CUDA def _gpu_searchsorted_right(a, v, bfs, nlevel): """ - Device function to replace numpy.searchsorted(a, v, side='right') + Equivalent to numpy.searchsorted(a, v, side='left'), designed + to be used mainly as device function Parameters ---------- From 38318ecdb8ab602d8ceb9d8afe4d1abc1b6ed9ed Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 31 May 2022 09:19:31 -0600 Subject: [PATCH 134/416] Rename variable --- stumpy/gpu_stump.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 35bf3f12f..9fb657668 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -124,7 +124,7 @@ def _compute_and_update_PI_kernel( Σ_T, μ_Q, σ_Q, - profile_len, + w, ignore_trivial, excl_zone, profile, @@ -179,7 +179,7 @@ def _compute_and_update_PI_kernel( σ_Q : numpy.ndarray Standard deviation of the query sequence, `Q` - profile_len : int + w : int The total number of sliding windows to iterate over ignore_trivial : bool @@ -247,7 +247,7 @@ def _compute_and_update_PI_kernel( for j in range(start, QT_out.shape[0], stride): zone_start = max(0, j - excl_zone) - zone_stop = min(profile_len, j + excl_zone) + zone_stop = min(w, j + excl_zone) if compute_QT: QT_out[j] = ( @@ -307,7 +307,7 @@ def _gpu_stump( QT_first_fname, μ_Q_fname, σ_Q_fname, - profile_len, + w, ignore_trivial=True, range_start=1, device_id=0, @@ -362,7 +362,7 @@ def _gpu_stump( The file name for the standard deviation of the query sequence, `Q`, relative to the current sliding window - profile_len : int + w : int The total number of sliding windows to iterate over ignore_trivial : bool @@ -421,7 +421,7 @@ def _gpu_stump( Note that left and right matrix profiles are only available for self-joins. 
""" threads_per_block = config.STUMPY_THREADS_PER_BLOCK - blocks_per_grid = math.ceil(profile_len / threads_per_block) + blocks_per_grid = math.ceil(w / threads_per_block) T_A = np.load(T_A_fname, allow_pickle=False) T_B = np.load(T_B_fname, allow_pickle=False) @@ -452,14 +452,14 @@ def _gpu_stump( device_M_T = cuda.to_device(M_T) device_Σ_T = cuda.to_device(Σ_T) - profile = np.full((profile_len, k), np.inf, dtype=np.float64) - indices = np.full((profile_len, k), -1, dtype=np.int64) + profile = np.full((w, k), np.inf, dtype=np.float64) + indices = np.full((w, k), -1, dtype=np.int64) - profile_L = np.full(profile_len, np.inf, dtype=np.float64) - indices_L = np.full(profile_len, -1, dtype=np.int64) + profile_L = np.full(w, np.inf, dtype=np.float64) + indices_L = np.full(w, -1, dtype=np.int64) - profile_R = np.full(profile_len, np.inf, dtype=np.float64) - indices_R = np.full(profile_len, -1, dtype=np.int64) + profile_R = np.full(w, np.inf, dtype=np.float64) + indices_R = np.full(w, -1, dtype=np.int64) device_profile = cuda.to_device(profile) device_profile_L = cuda.to_device(profile_L) @@ -480,7 +480,7 @@ def _gpu_stump( device_Σ_T, device_μ_Q, device_σ_Q, - profile_len, + w, ignore_trivial, excl_zone, device_profile, @@ -508,7 +508,7 @@ def _gpu_stump( device_Σ_T, device_μ_Q, device_σ_Q, - profile_len, + w, ignore_trivial, excl_zone, device_profile, @@ -695,7 +695,7 @@ def gpu_stump( logger.warning("Try setting `ignore_trivial = False`.") n = T_B.shape[0] - profile_len = T_A.shape[0] - m + 1 + w = T_A.shape[0] - m + 1 l = n - m + 1 excl_zone = int( np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM) @@ -765,7 +765,7 @@ def gpu_stump( QT_first_fname, μ_Q_fname, σ_Q_fname, - profile_len, + w, ignore_trivial, start + 1, device_ids[idx], @@ -794,7 +794,7 @@ def gpu_stump( QT_first_fname, μ_Q_fname, σ_Q_fname, - profile_len, + w, ignore_trivial, start + 1, device_ids[idx], @@ -866,7 +866,7 @@ def gpu_stump( indices_R[0] = np.where(cond, indices_R[0], indices_R[i]) out = np.empty( - (profile_len, 2 * k + 2), dtype=object + (w, 2 * k + 2), dtype=object ) # last two columns are to store # (top-1) left/right matrix profile indices out[:, :k] = profile[0] From 76f97cbb896f0d66819022cb0acfc43e011d67c0 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 31 May 2022 14:38:11 -0600 Subject: [PATCH 135/416] Corrected format --- stumpy/gpu_stump.py | 8 +++----- tests/test_core.py | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 9fb657668..371bbeaa4 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -15,7 +15,7 @@ logger = logging.getLogger(__name__) -@jit # equivalent to `__host__ __device__` in C++ CUDA +@jit # equivalent to `__host__ __device__` in C++ CUDA def _gpu_searchsorted_left(a, v, bfs, nlevel): """ Equivalent to numpy.searchsorted(a, v, side='left'), designed @@ -61,7 +61,7 @@ def _gpu_searchsorted_left(a, v, bfs, nlevel): return idx -@jit # equivalent to `__host__ __device__` in C++ CUDA +@jit # equivalent to `__host__ __device__` in C++ CUDA def _gpu_searchsorted_right(a, v, bfs, nlevel): """ Equivalent to numpy.searchsorted(a, v, side='left'), designed @@ -865,9 +865,7 @@ def gpu_stump( profile_R[0] = np.where(cond, profile_R[0], profile_R[i]) indices_R[0] = np.where(cond, indices_R[0], indices_R[i]) - out = np.empty( - (w, 2 * k + 2), dtype=object - ) # last two columns are to store + out = np.empty((w, 2 * k + 2), dtype=object) # last two columns are to store # (top-1) left/right matrix profile indices out[:, :k] = 
profile[0] out[:, k:] = np.column_stack((indices[0], indices_L[0], indices_R[0])) diff --git a/tests/test_core.py b/tests/test_core.py index a297dd3fa..63a33d1d0 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1071,7 +1071,7 @@ def test_merge_topk_PI(): PB = np.random.rand(n * k).reshape(n, k) col_idx = np.random.randint(0, k, size=n) - for i in range(n): # creating ties between values of PA and PB + for i in range(n): # creating ties between values of PA and PB PB[i, col_idx[i]] = np.random.choice(PA[i], size=1, replace=False) PB = np.sort(PB, axis=1) From 157944d358c3452b25cfdbf6ac79d38795fd478f Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 31 May 2022 15:58:19 -0600 Subject: [PATCH 136/416] All test passed and full coverage From 8a31eff92cbcd3f67f4943820bad957291006e6f Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 31 May 2022 16:10:26 -0600 Subject: [PATCH 137/416] Fixed typo --- stumpy/gpu_stump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 371bbeaa4..d6e02f669 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -64,7 +64,7 @@ def _gpu_searchsorted_left(a, v, bfs, nlevel): @jit # equivalent to `__host__ __device__` in C++ CUDA def _gpu_searchsorted_right(a, v, bfs, nlevel): """ - Equivalent to numpy.searchsorted(a, v, side='left'), designed + Equivalent to numpy.searchsorted(a, v, side='right'), designed to be used mainly as device function Parameters From 9610d7445fa9f715dd6b5f8e7c69c178950774f0 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 1 Jun 2022 19:50:25 -0600 Subject: [PATCH 138/416] Change decorator to create device function --- stumpy/gpu_stump.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index d6e02f669..3a37db6ee 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -7,7 +7,7 @@ import os import numpy as np -from numba import cuda, jit +from numba import cuda from . import core, config from .gpu_aamp import gpu_aamp @@ -15,11 +15,10 @@ logger = logging.getLogger(__name__) -@jit # equivalent to `__host__ __device__` in C++ CUDA +@cuda.jit(device=True) def _gpu_searchsorted_left(a, v, bfs, nlevel): """ - Equivalent to numpy.searchsorted(a, v, side='left'), designed - to be used mainly as device function + A device function, equivalent to numpy.searchsorted(a, v, side='left') Parameters ---------- @@ -61,11 +60,10 @@ def _gpu_searchsorted_left(a, v, bfs, nlevel): return idx -@jit # equivalent to `__host__ __device__` in C++ CUDA +@cuda.jit(device=True) def _gpu_searchsorted_right(a, v, bfs, nlevel): """ - Equivalent to numpy.searchsorted(a, v, side='right'), designed - to be used mainly as device function + A device function, equivalent to numpy.searchsorted(a, v, side='right') Parameters ---------- From 9ce12a3127fd1a646302f1d98c97da3666f8e149 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 1 Jun 2022 19:54:32 -0600 Subject: [PATCH 139/416] Fixed typos --- stumpy/gpu_stump.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 3a37db6ee..63366a183 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -432,7 +432,7 @@ def _gpu_stump( device_bfs = cuda.to_device(core._bfs_indices(k, fill_value=-1)) nlevel = np.floor(np.log2(k) + 1).astype(np.int64) - # number of levels in # binary seearch tree from which `bfs` is constructed. + # number of levels in binary seearch tree from which `bfs` is constructed. 
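# [Editor's note] A minimal host-side sketch, assuming plain NumPy only, of the behavior
# that the `_gpu_searchsorted_left`/`_gpu_searchsorted_right` device functions above are
# meant to reproduce for one sorted row of the top-k profile. The names `k`, `a`, and `v`
# are illustrative and not part of this patch; the actual device-side traversal that uses
# `bfs`/`nlevel` is not shown here.
import numpy as np

k = 8
a = np.sort(np.random.rand(k))                   # one sorted top-k row
v = np.random.rand()                             # value whose insertion point is needed
nlevel = int(np.floor(np.log2(k) + 1))           # levels of the binary search tree over `a`
idx_left = np.searchsorted(a, v, side="left")    # reference for _gpu_searchsorted_left
idx_right = np.searchsorted(a, v, side="right")  # reference for _gpu_searchsorted_right
assert 0 <= idx_left <= idx_right <= k and nlevel >= 1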
with cuda.gpus[device_id]: device_T_A = cuda.to_device(T_A) @@ -853,12 +853,12 @@ def gpu_stump( # Update (top-k) matrix profile and matrix profile indices core._merge_topk_PI(profile[0], profile[i], indices[0], indices[i]) - # Update (top-1) left matrix profile and matrix profil indices + # Update (top-1) left matrix profile and matrix profile indices cond = profile_L[0] < profile_L[i] profile_L[0] = np.where(cond, profile_L[0], profile_L[i]) indices_L[0] = np.where(cond, indices_L[0], indices_L[i]) - # Update (top-1) right matrix profile and matrix profil indices + # Update (top-1) right matrix profile and matrix profile indices cond = profile_R[0] < profile_R[i] profile_R[0] = np.where(cond, profile_R[0], profile_R[i]) indices_R[0] = np.where(cond, indices_R[0], indices_R[i]) From 4f2ea6c321bf40a33afd5e72ea7b9553a2e55c3f Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 1 Jun 2022 20:01:17 -0600 Subject: [PATCH 140/416] Rename function to improve readability --- tests/naive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/naive.py b/tests/naive.py index 3074c2359..101b6857a 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -156,7 +156,7 @@ def stamp(T_A, m, T_B=None, exclusion_zone=None): # pragma: no cover return result -def searchsorted(a, v): +def searchsorted_right(a, v): """ Naive version of numpy.searchsorted(..., side='right') """ From e4ae016a777791a0b32bc7a09029006e2179f211 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 1 Jun 2022 20:32:25 -0600 Subject: [PATCH 141/416] Enhance test function to test with different values of k --- tests/test_gpu_stump.py | 20 ++++++++++---------- tests/test_stump.py | 22 +++++++++++----------- tests/test_stumped.py | 16 ++++++++-------- 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index 108ac0d91..99b6af68d 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -393,15 +393,15 @@ def test_gpu_stump_nan_zero_mean_self_join(): @pytest.mark.filterwarnings("ignore", category=NumbaPerformanceWarning) @pytest.mark.parametrize("T_A, T_B", test_data) def test_gpu_stump_self_join_KNN(T_A, T_B): - k = 3 m = 3 - zone = int(np.ceil(m / 4)) - ref_mp = naive.stump(T_B, m, exclusion_zone=zone, row_wise=True, k=k) - comp_mp = gpu_stump(T_B, m, ignore_trivial=True, k=k) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) + for k in range(1, 4): + zone = int(np.ceil(m / 4)) + ref_mp = naive.stump(T_B, m, exclusion_zone=zone, row_wise=True, k=k) + comp_mp = gpu_stump(T_B, m, ignore_trivial=True, k=k) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) - comp_mp = gpu_stump(pd.Series(T_B), m, ignore_trivial=True, k=k) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) + comp_mp = gpu_stump(pd.Series(T_B), m, ignore_trivial=True, k=k) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) diff --git a/tests/test_stump.py b/tests/test_stump.py index af2a2315e..fcfccdea6 100644 --- a/tests/test_stump.py +++ b/tests/test_stump.py @@ -244,15 +244,15 @@ def test_stump_nan_zero_mean_self_join(): @pytest.mark.parametrize("T_A, T_B", test_data) def test_stump_self_join_KNN(T_A, T_B): - k = 3 - m = 3 - zone = int(np.ceil(m / 4)) - ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) - comp_mp = stump(T_B, m, ignore_trivial=True, k=k) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, 
comp_mp) + for k in range(4): + m = 3 + zone = int(np.ceil(m / 4)) + ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) + comp_mp = stump(T_B, m, ignore_trivial=True, k=k) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) - comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True, k=k) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) + comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True, k=k) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) diff --git a/tests/test_stumped.py b/tests/test_stumped.py index 02e914436..bcf6f26c1 100644 --- a/tests/test_stumped.py +++ b/tests/test_stumped.py @@ -617,11 +617,11 @@ def test_stumped_two_subsequences_nan_inf_A_B_join_swap( @pytest.mark.parametrize("T_A, T_B", test_data) def test_stumped_self_join_KNN(T_A, T_B, dask_cluster): with Client(dask_cluster) as dask_client: - k = 3 - m = 3 - zone = int(np.ceil(m / 4)) - ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) - comp_mp = stumped(dask_client, T_B, m, ignore_trivial=True, k=k) - naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) - npt.assert_almost_equal(ref_mp, comp_mp) + for k in range(4): + m = 3 + zone = int(np.ceil(m / 4)) + ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) + comp_mp = stumped(dask_client, T_B, m, ignore_trivial=True, k=k) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) From 5565904e5c101d7548da9e413a8d6191e5741ede Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 1 Jun 2022 20:40:24 -0600 Subject: [PATCH 142/416] Add test for k>1 for AB-join --- tests/test_gpu_stump.py | 12 ++++++++++++ tests/test_stump.py | 17 ++++++++++++++++- tests/test_stumped.py | 18 +++++++++++++++++- 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index 99b6af68d..7c984164a 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -405,3 +405,15 @@ def test_gpu_stump_self_join_KNN(T_A, T_B): comp_mp = gpu_stump(pd.Series(T_B), m, ignore_trivial=True, k=k) naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) + + +@pytest.mark.filterwarnings("ignore", category=NumbaPerformanceWarning) +@pytest.mark.parametrize("T_A, T_B", test_data) +def test_gpu_stump_A_B_join_KNN(T_A, T_B): + for k in range(1, 4): + m = 3 + ref_mp = naive.stump(T_B, m, T_B=T_A, row_wise=True, k=k) + comp_mp = gpu_stump(T_B, m, T_A, ignore_trivial=False, k=k) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) diff --git a/tests/test_stump.py b/tests/test_stump.py index fcfccdea6..029fc2696 100644 --- a/tests/test_stump.py +++ b/tests/test_stump.py @@ -244,7 +244,7 @@ def test_stump_nan_zero_mean_self_join(): @pytest.mark.parametrize("T_A, T_B", test_data) def test_stump_self_join_KNN(T_A, T_B): - for k in range(4): + for k in range(1, 4): m = 3 zone = int(np.ceil(m / 4)) ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) @@ -256,3 +256,18 @@ def test_stump_self_join_KNN(T_A, T_B): comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True, k=k) naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) + + +@pytest.mark.parametrize("T_A, T_B", test_data) +def test_stump_A_B_join_KNN(T_A, T_B): + for k in range(1, 4): + m = 3 + ref_mp = naive.stump(T_A, m, T_B=T_B, k=k) + comp_mp = stump(T_A, m, T_B, ignore_trivial=False, k=k) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) 
+ + comp_mp = stump(pd.Series(T_A), m, pd.Series(T_B), ignore_trivial=False, k=k) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) diff --git a/tests/test_stumped.py b/tests/test_stumped.py index bcf6f26c1..363a58432 100644 --- a/tests/test_stumped.py +++ b/tests/test_stumped.py @@ -617,7 +617,7 @@ def test_stumped_two_subsequences_nan_inf_A_B_join_swap( @pytest.mark.parametrize("T_A, T_B", test_data) def test_stumped_self_join_KNN(T_A, T_B, dask_cluster): with Client(dask_cluster) as dask_client: - for k in range(4): + for k in range(1, 4): m = 3 zone = int(np.ceil(m / 4)) ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) @@ -625,3 +625,19 @@ def test_stumped_self_join_KNN(T_A, T_B, dask_cluster): naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) + + +@pytest.mark.filterwarnings("ignore:numpy.dtype size changed") +@pytest.mark.filterwarnings("ignore:numpy.ufunc size changed") +@pytest.mark.filterwarnings("ignore:numpy.ndarray size changed") +@pytest.mark.filterwarnings("ignore:\\s+Port 8787 is already in use:UserWarning") +@pytest.mark.parametrize("T_A, T_B", test_data) +def test_stumped_A_B_join_KNN(T_A, T_B, dask_cluster): + with Client(dask_cluster) as dask_client: + for k in range(1, 4): + m = 3 + ref_mp = naive.stump(T_A, m, T_B=T_B, k=k) + comp_mp = stumped(dask_client, T_A, m, T_B, ignore_trivial=False, k=k) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) From bf6edcc51a8ce6f919e0724fdc36846e2a023cf6 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 1 Jun 2022 22:40:57 -0600 Subject: [PATCH 143/416] Add wrapper kernel for device function and change design of test function --- tests/test_gpu_stump.py | 78 ++++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 32 deletions(-) diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index 7c984164a..9edea0dcd 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -40,41 +40,55 @@ def test_gpu_stump_int_input(): gpu_stump(np.arange(10), 5, ignore_trivial=True) -def test_gpu_searchsorted(): - for n in range(1, 100): - a = np.sort(np.random.rand(n)) - bfs = core._bfs_indices(n, fill_value=-1) - nlevel = np.floor(np.log2(n) + 1).astype(np.int64) - for i in range(n): - v = a[i] - 0.001 - npt.assert_almost_equal( - _gpu_searchsorted_left(a, v, bfs, nlevel), - np.searchsorted(a, v, side="left"), - ) - npt.assert_almost_equal( - _gpu_searchsorted_right(a, v, bfs, nlevel), - np.searchsorted(a, v, side="right"), - ) +@cuda.jit("(f8[:, :], f8[:], i8[:], i8, b1, i8[:])") +def _gpu_searchsorted_kernel(A, V, bfs, nlevel, is_left, IDX): + # A wrapper kernel for calling device function _gpu_searchsorted_left/right. 
+ i = cuda.grid(1) + if i < A.shape[0]: + if is_left: + IDX[i] = _gpu_searchsorted_left(A[i], V[i], bfs, nlevel) + else: + IDX[i] = _gpu_searchsorted_right(A[i], V[i], bfs, nlevel) - v = a[i] - npt.assert_almost_equal( - _gpu_searchsorted_left(a, v, bfs, nlevel), - np.searchsorted(a, v, side="left"), - ) - npt.assert_almost_equal( - _gpu_searchsorted_right(a, v, bfs, nlevel), - np.searchsorted(a, v, side="right"), - ) - v = a[i] + 0.001 - npt.assert_almost_equal( - _gpu_searchsorted_left(a, v, bfs, nlevel), - np.searchsorted(a, v, side="left"), - ) - npt.assert_almost_equal( - _gpu_searchsorted_right(a, v, bfs, nlevel), - np.searchsorted(a, v, side="right"), +def test_gpu_searchsorted(): + n = 5000 + for k in range(1, 21): + bfs = core._bfs_indices(k, fill_value=-1) + nlevel = np.floor(np.log2(k) + 1).astype(np.int64) + + A = np.sort(np.random.rand(n, k), axis=1) + V = np.empty(n) + col_idx = np.random.randint(0, k, size=n) + diff = [-0.001, 0, 0.001] + for i in range(n): # creating ties between values of PA and PB + V[i] = np.random.choice(A[i, col_idx[i]], size=1, replace=False) + V[i] += diff[i % 3] + + device_A = cuda.to_device(A) + device_V = cuda.to_device(V) + device_bfs = cuda.to_device(bfs) + for is_left in [True, False]: + if is_left: + side = 'left' + else: + side = 'right' + + ref_IDX = np.full(n, -1, dtype=np.int64) + for i in range(n): + ref_IDX[i] = np.searchsorted(A[i], V[i], side=side) + + comp_IDX = np.full(n, -1, dtype=np.int64) + device_comp_IDX = cuda.to_device(comp_IDX) + + threads_per_block = config.STUMPY_THREADS_PER_BLOCK + blocks_per_grid = math.ceil(n / threads_per_block) + _gpu_searchsorted_kernel[blocks_per_grid, threads_per_block]( + device_A, device_V, device_bfs, nlevel, is_left, device_comp_IDX ) + comp_IDX = device_comp_IDX.copy_to_host() + + npt.assert_array_equal(ref_IDX, comp_IDX) @pytest.mark.filterwarnings("ignore", category=NumbaPerformanceWarning) From 1b7d971865ab24b7fc423c7a131e1c74c61378c9 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 1 Jun 2022 22:45:51 -0600 Subject: [PATCH 144/416] minor corrections --- tests/naive.py | 4 ++-- tests/test_gpu_stump.py | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 101b6857a..712bfee1b 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -239,14 +239,14 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, row_wise=False, k=1): for i in iter_range: D = distance_matrix[i, i + g] # D: a single element if D < P[i, k - 1]: - idx = searchsorted(P[i], D) + idx = searchsorted_right(P[i], D) # to keep the top-k, we must get rid of the last element. 
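# [Editor's note] A self-contained sketch of the insertion pattern used on the surrounding
# lines: `np.searchsorted(..., side="right")` finds the slot, `np.insert` places the new
# distance/index, and `[:-1]` drops the largest entry so each row keeps exactly k values.
# `P_row`, `I_row`, `d`, and `j` below are illustrative names only, not part of this patch.
import numpy as np

k = 4
P_row = np.array([0.5, 1.0, 2.0, np.inf])   # sorted top-k distances for one subsequence
I_row = np.array([7, 3, 9, -1])             # matching nearest-neighbor indices
d, j = 0.8, 11                              # a newly computed distance and its index
if d < P_row[k - 1]:
    pos = np.searchsorted(P_row, d, side="right")
    P_row = np.insert(P_row, pos, d)[:-1]   # [0.5, 0.8, 1.0, 2.0]
    I_row = np.insert(I_row, pos, j)[:-1]   # [7, 11, 3, 9]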
P[i, :k] = np.insert(P[i, :k], idx, D)[:-1] I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] if ignore_trivial: # Self-joins only if D < P[i + g, k - 1]: - idx = searchsorted(P[i + g], D) + idx = searchsorted_right(P[i + g], D) P[i + g, :k] = np.insert(P[i + g, :k], idx, D)[:-1] I[i + g, :k] = np.insert(I[i + g, :k], idx, i)[:-1] diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index 9edea0dcd..908537833 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -1,3 +1,4 @@ +import math import numpy as np import numpy.testing as npt import pandas as pd @@ -70,11 +71,11 @@ def test_gpu_searchsorted(): device_bfs = cuda.to_device(bfs) for is_left in [True, False]: if is_left: - side = 'left' + side = "left" else: - side = 'right' + side = "right" - ref_IDX = np.full(n, -1, dtype=np.int64) + ref_IDX = np.full(n, -1, dtype=np.int64) for i in range(n): ref_IDX[i] = np.searchsorted(A[i], V[i], side=side) @@ -84,7 +85,7 @@ def test_gpu_searchsorted(): threads_per_block = config.STUMPY_THREADS_PER_BLOCK blocks_per_grid = math.ceil(n / threads_per_block) _gpu_searchsorted_kernel[blocks_per_grid, threads_per_block]( - device_A, device_V, device_bfs, nlevel, is_left, device_comp_IDX + device_A, device_V, device_bfs, nlevel, is_left, device_comp_IDX ) comp_IDX = device_comp_IDX.copy_to_host() From 7f65b946c592007b75d881f79064f9261a5e4f9c Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 1 Jun 2022 23:41:49 -0600 Subject: [PATCH 145/416] Fixed minor bug --- tests/test_gpu_stump.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index 908537833..3b24f0e9d 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -54,7 +54,7 @@ def _gpu_searchsorted_kernel(A, V, bfs, nlevel, is_left, IDX): def test_gpu_searchsorted(): n = 5000 - for k in range(1, 21): + for k in range(1, 100): bfs = core._bfs_indices(k, fill_value=-1) nlevel = np.floor(np.log2(k) + 1).astype(np.int64) @@ -62,9 +62,8 @@ def test_gpu_searchsorted(): V = np.empty(n) col_idx = np.random.randint(0, k, size=n) diff = [-0.001, 0, 0.001] - for i in range(n): # creating ties between values of PA and PB - V[i] = np.random.choice(A[i, col_idx[i]], size=1, replace=False) - V[i] += diff[i % 3] + for i in range(n): + V[i] = A[i, col_idx[i]] + diff[i % 3] device_A = cuda.to_device(A) device_V = cuda.to_device(V) From 10878fdd997492eb85e51f5f6be574f76ba22ef3 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 1 Jun 2022 23:59:23 -0600 Subject: [PATCH 146/416] Swap TA and TB to allow k to not be bigger than length of distance profile --- tests/test_gpu_stump.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index 3b24f0e9d..f73bd389c 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -426,8 +426,8 @@ def test_gpu_stump_self_join_KNN(T_A, T_B): def test_gpu_stump_A_B_join_KNN(T_A, T_B): for k in range(1, 4): m = 3 - ref_mp = naive.stump(T_B, m, T_B=T_A, row_wise=True, k=k) - comp_mp = gpu_stump(T_B, m, T_A, ignore_trivial=False, k=k) + ref_mp = naive.stump(T_A, m, T_B=T_B, row_wise=True, k=k) + comp_mp = gpu_stump(T_A, m, T_B, ignore_trivial=False, k=k) naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) From d282dfdb0b7e985d0d15db4bdc64a8905565b0c2 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 2 Jun 2022 10:18:54 -0600 Subject: [PATCH 147/416] Redesign test function --- tests/test_gpu_stump.py | 53 
++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index f73bd389c..67c9ec0f9 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -54,41 +54,46 @@ def _gpu_searchsorted_kernel(A, V, bfs, nlevel, is_left, IDX): def test_gpu_searchsorted(): n = 5000 - for k in range(1, 100): + threads_per_block = config.STUMPY_THREADS_PER_BLOCK + blocks_per_grid = math.ceil(n / threads_per_block) + + for k in range(1, 32): bfs = core._bfs_indices(k, fill_value=-1) nlevel = np.floor(np.log2(k) + 1).astype(np.int64) A = np.sort(np.random.rand(n, k), axis=1) V = np.empty(n) - col_idx = np.random.randint(0, k, size=n) diff = [-0.001, 0, 0.001] for i in range(n): - V[i] = A[i, col_idx[i]] + diff[i % 3] + V[i] = A[i, i % k] + diff[i % 3] device_A = cuda.to_device(A) device_V = cuda.to_device(V) device_bfs = cuda.to_device(bfs) - for is_left in [True, False]: - if is_left: - side = "left" - else: - side = "right" - - ref_IDX = np.full(n, -1, dtype=np.int64) - for i in range(n): - ref_IDX[i] = np.searchsorted(A[i], V[i], side=side) - - comp_IDX = np.full(n, -1, dtype=np.int64) - device_comp_IDX = cuda.to_device(comp_IDX) - - threads_per_block = config.STUMPY_THREADS_PER_BLOCK - blocks_per_grid = math.ceil(n / threads_per_block) - _gpu_searchsorted_kernel[blocks_per_grid, threads_per_block]( - device_A, device_V, device_bfs, nlevel, is_left, device_comp_IDX - ) - comp_IDX = device_comp_IDX.copy_to_host() - - npt.assert_array_equal(ref_IDX, comp_IDX) + + side = "left" # is_left = True + ref_IDX = np.full(n, -1, dtype=np.int64) + for i in range(n): + ref_IDX[i] = np.searchsorted(A[i], V[i], side=side) + comp_IDX = np.full(n, -1, dtype=np.int64) + device_comp_IDX = cuda.to_device(comp_IDX) + _gpu_searchsorted_kernel[blocks_per_grid, threads_per_block]( + device_A, device_V, device_bfs, nlevel, True, device_comp_IDX + ) + comp_IDX = device_comp_IDX.copy_to_host() + npt.assert_array_equal(ref_IDX, comp_IDX) + + side = "right" # is_left = False + ref_IDX = np.full(n, -1, dtype=np.int64) + for i in range(n): + ref_IDX[i] = np.searchsorted(A[i], V[i], side=side) + comp_IDX = np.full(n, -1, dtype=np.int64) + device_comp_IDX = cuda.to_device(comp_IDX) + _gpu_searchsorted_kernel[blocks_per_grid, threads_per_block]( + device_A, device_V, device_bfs, nlevel, False, device_comp_IDX + ) + comp_IDX = device_comp_IDX.copy_to_host() + npt.assert_array_equal(ref_IDX, comp_IDX) @pytest.mark.filterwarnings("ignore", category=NumbaPerformanceWarning) From 9de9dd2e5242026bb42dcdf31d317f1d4d89a37e Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 2 Jun 2022 10:29:16 -0600 Subject: [PATCH 148/416] minor refactoring --- tests/test_gpu_stump.py | 4 ++-- tests/test_stump.py | 6 +++--- tests/test_stumped.py | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index 67c9ec0f9..aa70cd114 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -413,8 +413,8 @@ def test_gpu_stump_nan_zero_mean_self_join(): @pytest.mark.parametrize("T_A, T_B", test_data) def test_gpu_stump_self_join_KNN(T_A, T_B): m = 3 + zone = int(np.ceil(m / 4)) for k in range(1, 4): - zone = int(np.ceil(m / 4)) ref_mp = naive.stump(T_B, m, exclusion_zone=zone, row_wise=True, k=k) comp_mp = gpu_stump(T_B, m, ignore_trivial=True, k=k) naive.replace_inf(ref_mp) @@ -429,8 +429,8 @@ def test_gpu_stump_self_join_KNN(T_A, T_B): @pytest.mark.filterwarnings("ignore", 
category=NumbaPerformanceWarning) @pytest.mark.parametrize("T_A, T_B", test_data) def test_gpu_stump_A_B_join_KNN(T_A, T_B): + m = 3 for k in range(1, 4): - m = 3 ref_mp = naive.stump(T_A, m, T_B=T_B, row_wise=True, k=k) comp_mp = gpu_stump(T_A, m, T_B, ignore_trivial=False, k=k) naive.replace_inf(ref_mp) diff --git a/tests/test_stump.py b/tests/test_stump.py index 029fc2696..3e0b34299 100644 --- a/tests/test_stump.py +++ b/tests/test_stump.py @@ -244,9 +244,9 @@ def test_stump_nan_zero_mean_self_join(): @pytest.mark.parametrize("T_A, T_B", test_data) def test_stump_self_join_KNN(T_A, T_B): + m = 3 + zone = int(np.ceil(m / 4)) for k in range(1, 4): - m = 3 - zone = int(np.ceil(m / 4)) ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) comp_mp = stump(T_B, m, ignore_trivial=True, k=k) naive.replace_inf(ref_mp) @@ -260,8 +260,8 @@ def test_stump_self_join_KNN(T_A, T_B): @pytest.mark.parametrize("T_A, T_B", test_data) def test_stump_A_B_join_KNN(T_A, T_B): + m = 3 for k in range(1, 4): - m = 3 ref_mp = naive.stump(T_A, m, T_B=T_B, k=k) comp_mp = stump(T_A, m, T_B, ignore_trivial=False, k=k) naive.replace_inf(ref_mp) diff --git a/tests/test_stumped.py b/tests/test_stumped.py index 363a58432..7e8b053d3 100644 --- a/tests/test_stumped.py +++ b/tests/test_stumped.py @@ -617,9 +617,9 @@ def test_stumped_two_subsequences_nan_inf_A_B_join_swap( @pytest.mark.parametrize("T_A, T_B", test_data) def test_stumped_self_join_KNN(T_A, T_B, dask_cluster): with Client(dask_cluster) as dask_client: + m = 3 + zone = int(np.ceil(m / 4)) for k in range(1, 4): - m = 3 - zone = int(np.ceil(m / 4)) ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) comp_mp = stumped(dask_client, T_B, m, ignore_trivial=True, k=k) naive.replace_inf(ref_mp) @@ -634,8 +634,8 @@ def test_stumped_self_join_KNN(T_A, T_B, dask_cluster): @pytest.mark.parametrize("T_A, T_B", test_data) def test_stumped_A_B_join_KNN(T_A, T_B, dask_cluster): with Client(dask_cluster) as dask_client: + m = 3 for k in range(1, 4): - m = 3 ref_mp = naive.stump(T_A, m, T_B=T_B, k=k) comp_mp = stumped(dask_client, T_A, m, T_B, ignore_trivial=False, k=k) naive.replace_inf(ref_mp) From 9789cd9ff98483a74e25bdc15d3d31d79769ce36 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 4 Jun 2022 03:00:30 -0600 Subject: [PATCH 149/416] Extend test function to test with different values of parameter k --- tests/test_core.py | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 63a33d1d0..f83d09504 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1063,29 +1063,27 @@ def test_select_P_ABBA_val_inf(): def test_merge_topk_PI(): n = 50 - k = 5 + for k in range(1, 6): + PA = np.random.rand(n * k).reshape(n, k) + PA = np.sort(PA, axis=1) # sorting each row separately - PA = np.random.rand(n * k).reshape(n, k) - PA = np.sort(PA, axis=1) # sorting each row separately + PB = np.random.rand(n * k).reshape(n, k) + col_idx = np.random.randint(0, k, size=n) + for i in range(n): # creating ties between values of PA and PB + PB[i, col_idx[i]] = np.random.choice(PA[i], size=1, replace=False) + PB = np.sort(PB, axis=1) # sorting each row separately - PB = np.random.rand(n * k).reshape(n, k) + IA = np.arange(n * k).reshape(n, k) + IB = IA + n * k - col_idx = np.random.randint(0, k, size=n) - for i in range(n): # creating ties between values of PA and PB - PB[i, col_idx[i]] = np.random.choice(PA[i], size=1, replace=False) - PB = np.sort(PB, axis=1) + ref_P = PA.copy() + ref_I = 
IA.copy() - IA = np.arange(n * k).reshape(n, k) - IB = IA + n * k + comp_P = PA.copy() + comp_I = IA.copy() - ref_P = PA.copy() - ref_I = IA.copy() + naive.merge_topk_PI(ref_P, PB, ref_I, IB) + core._merge_topk_PI(comp_P, PB, comp_I, IB) - comp_P = PA.copy() - comp_I = IA.copy() - - naive.merge_topk_PI(ref_P, PB, ref_I, IB) - core._merge_topk_PI(comp_P, PB, comp_I, IB) - - npt.assert_array_equal(ref_P, comp_P) - npt.assert_array_equal(ref_I, comp_I) + npt.assert_array_equal(ref_P, comp_P) + npt.assert_array_equal(ref_I, comp_I) From 6faa6453ef20d7f291bbb56cd3a3168ac0853214 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 4 Jun 2022 04:42:28 -0600 Subject: [PATCH 150/416] Minor changes in test function --- tests/test_gpu_stump.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index aa70cd114..071337cd5 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -58,39 +58,37 @@ def test_gpu_searchsorted(): blocks_per_grid = math.ceil(n / threads_per_block) for k in range(1, 32): - bfs = core._bfs_indices(k, fill_value=-1) + device_bfs = cuda.to_device(core._bfs_indices(k, fill_value=-1)) nlevel = np.floor(np.log2(k) + 1).astype(np.int64) A = np.sort(np.random.rand(n, k), axis=1) - V = np.empty(n) - diff = [-0.001, 0, 0.001] - for i in range(n): - V[i] = A[i, i % k] + diff[i % 3] - device_A = cuda.to_device(A) + + V = np.random.rand(n) + for i, idx in enumerate(np.random.choice(np.arange(n), size=k, replace=False)): + V[idx] = A[idx, i] # create ties device_V = cuda.to_device(V) - device_bfs = cuda.to_device(bfs) - side = "left" # is_left = True - ref_IDX = np.full(n, -1, dtype=np.int64) - for i in range(n): - ref_IDX[i] = np.searchsorted(A[i], V[i], side=side) + is_left = True # test case + ref_IDX = [np.searchsorted(A[i], V[i], side="left") for i in range(n)] + ref_IDX = np.asarray(ref_IDX, dtype=np.int64) + comp_IDX = np.full(n, -1, dtype=np.int64) device_comp_IDX = cuda.to_device(comp_IDX) _gpu_searchsorted_kernel[blocks_per_grid, threads_per_block]( - device_A, device_V, device_bfs, nlevel, True, device_comp_IDX + device_A, device_V, device_bfs, nlevel, is_left, device_comp_IDX ) comp_IDX = device_comp_IDX.copy_to_host() npt.assert_array_equal(ref_IDX, comp_IDX) - side = "right" # is_left = False - ref_IDX = np.full(n, -1, dtype=np.int64) - for i in range(n): - ref_IDX[i] = np.searchsorted(A[i], V[i], side=side) + is_left = False # test case + ref_IDX = [np.searchsorted(A[i], V[i], side="right") for i in range(n)] + ref_IDX = np.asarray(ref_IDX, dtype=np.int64) + comp_IDX = np.full(n, -1, dtype=np.int64) device_comp_IDX = cuda.to_device(comp_IDX) _gpu_searchsorted_kernel[blocks_per_grid, threads_per_block]( - device_A, device_V, device_bfs, nlevel, False, device_comp_IDX + device_A, device_V, device_bfs, nlevel, is_left, device_comp_IDX ) comp_IDX = device_comp_IDX.copy_to_host() npt.assert_array_equal(ref_IDX, comp_IDX) From caee6994451d12154ab980cc5342d7ebc2ba214f Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 7 Jun 2022 10:58:40 -0600 Subject: [PATCH 151/416] Exclude test for k=1 to avoid redundancy --- tests/test_gpu_stump.py | 4 ++-- tests/test_stump.py | 6 +++--- tests/test_stumped.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index 071337cd5..14f435dda 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -412,7 +412,7 @@ def test_gpu_stump_nan_zero_mean_self_join(): 
def test_gpu_stump_self_join_KNN(T_A, T_B): m = 3 zone = int(np.ceil(m / 4)) - for k in range(1, 4): + for k in range(2, 4): ref_mp = naive.stump(T_B, m, exclusion_zone=zone, row_wise=True, k=k) comp_mp = gpu_stump(T_B, m, ignore_trivial=True, k=k) naive.replace_inf(ref_mp) @@ -428,7 +428,7 @@ def test_gpu_stump_self_join_KNN(T_A, T_B): @pytest.mark.parametrize("T_A, T_B", test_data) def test_gpu_stump_A_B_join_KNN(T_A, T_B): m = 3 - for k in range(1, 4): + for k in range(2, 4): ref_mp = naive.stump(T_A, m, T_B=T_B, row_wise=True, k=k) comp_mp = gpu_stump(T_A, m, T_B, ignore_trivial=False, k=k) naive.replace_inf(ref_mp) diff --git a/tests/test_stump.py b/tests/test_stump.py index 3e0b34299..df8912829 100644 --- a/tests/test_stump.py +++ b/tests/test_stump.py @@ -246,11 +246,11 @@ def test_stump_nan_zero_mean_self_join(): def test_stump_self_join_KNN(T_A, T_B): m = 3 zone = int(np.ceil(m / 4)) - for k in range(1, 4): + for k in range(2, 4): ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) comp_mp = stump(T_B, m, ignore_trivial=True, k=k) naive.replace_inf(ref_mp) - naive.replace_inf(comp_mp) + naive.replace_insf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True, k=k) @@ -261,7 +261,7 @@ def test_stump_self_join_KNN(T_A, T_B): @pytest.mark.parametrize("T_A, T_B", test_data) def test_stump_A_B_join_KNN(T_A, T_B): m = 3 - for k in range(1, 4): + for k in range(2, 4): ref_mp = naive.stump(T_A, m, T_B=T_B, k=k) comp_mp = stump(T_A, m, T_B, ignore_trivial=False, k=k) naive.replace_inf(ref_mp) diff --git a/tests/test_stumped.py b/tests/test_stumped.py index 7e8b053d3..9181d81c8 100644 --- a/tests/test_stumped.py +++ b/tests/test_stumped.py @@ -619,7 +619,7 @@ def test_stumped_self_join_KNN(T_A, T_B, dask_cluster): with Client(dask_cluster) as dask_client: m = 3 zone = int(np.ceil(m / 4)) - for k in range(1, 4): + for k in range(2, 4): ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) comp_mp = stumped(dask_client, T_B, m, ignore_trivial=True, k=k) naive.replace_inf(ref_mp) @@ -635,7 +635,7 @@ def test_stumped_self_join_KNN(T_A, T_B, dask_cluster): def test_stumped_A_B_join_KNN(T_A, T_B, dask_cluster): with Client(dask_cluster) as dask_client: m = 3 - for k in range(1, 4): + for k in range(2, 4): ref_mp = naive.stump(T_A, m, T_B=T_B, k=k) comp_mp = stumped(dask_client, T_A, m, T_B, ignore_trivial=False, k=k) naive.replace_inf(ref_mp) From 493f6cbcd714c822e3cfb61173e3e7bc892765cb Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 7 Jun 2022 11:02:07 -0600 Subject: [PATCH 152/416] Revise test function - Make parameter `n` a function of config setting - Add filterwarning --- tests/test_gpu_stump.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index 14f435dda..ef8c03c1d 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -52,8 +52,10 @@ def _gpu_searchsorted_kernel(A, V, bfs, nlevel, is_left, IDX): IDX[i] = _gpu_searchsorted_right(A[i], V[i], bfs, nlevel) +@pytest.mark.filterwarnings("ignore", category=NumbaPerformanceWarning) def test_gpu_searchsorted(): - n = 5000 + n = 3 * config.STUMPY_THREADS_PER_BLOCK + 1 + threads_per_block = config.STUMPY_THREADS_PER_BLOCK blocks_per_grid = math.ceil(n / threads_per_block) From 986f4697966aa344f31f14ab5a09501fe923805b Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 7 Jun 2022 11:38:58 -0600 Subject: [PATCH 153/416] Fixed typo --- tests/test_stump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/tests/test_stump.py b/tests/test_stump.py index df8912829..e08746758 100644 --- a/tests/test_stump.py +++ b/tests/test_stump.py @@ -250,7 +250,7 @@ def test_stump_self_join_KNN(T_A, T_B): ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k) comp_mp = stump(T_B, m, ignore_trivial=True, k=k) naive.replace_inf(ref_mp) - naive.replace_insf(comp_mp) + naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True, k=k) From b438c9c9e9fe167a7821e1fd0703030104e08012 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 7 Jun 2022 12:08:21 -0600 Subject: [PATCH 154/416] Avoided creating new array in memory --- tests/test_core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index f83d09504..7998c042d 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1065,13 +1065,13 @@ def test_merge_topk_PI(): n = 50 for k in range(1, 6): PA = np.random.rand(n * k).reshape(n, k) - PA = np.sort(PA, axis=1) # sorting each row separately + PA[:, :] = np.sort(PA, axis=1) # sorting each row separately PB = np.random.rand(n * k).reshape(n, k) col_idx = np.random.randint(0, k, size=n) for i in range(n): # creating ties between values of PA and PB PB[i, col_idx[i]] = np.random.choice(PA[i], size=1, replace=False) - PB = np.sort(PB, axis=1) # sorting each row separately + PB[:, :] = np.sort(PB, axis=1) # sorting each row separately IA = np.arange(n * k).reshape(n, k) IB = IA + n * k From a54c8789b4488568ce465bb32b4fb73e31fb5c3b Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 11 Jun 2022 02:06:10 -0600 Subject: [PATCH 155/416] Improve naive.prescrump to return TopK matrix profile --- tests/naive.py | 71 ++++++++++++++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 31 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index fabe3d922..b742c86dd 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1403,16 +1403,14 @@ def aampdist_snippets( ) -def prescrump(T_A, m, T_B, s, exclusion_zone=None): +def prescrump(T_A, m, T_B, s, exclusion_zone=None, k=1): dist_matrix = distance_matrix(T_A, T_B, m) n_A = T_A.shape[0] l = n_A - m + 1 - P = np.empty(l) - I = np.empty(l, dtype=np.int64) - P[:] = np.inf - I[:] = -1 + P = np.full((l, k), np.inf, dtype=np.float64) + I = np.full((l, k), -1, dtype=np.int64) for i in np.random.permutation(range(0, l, s)): distance_profile = dist_matrix[i] @@ -1420,33 +1418,44 @@ def prescrump(T_A, m, T_B, s, exclusion_zone=None): apply_exclusion_zone(distance_profile, i, exclusion_zone, np.inf) # only for self-join - mask = distance_profile < P - P[mask] = distance_profile[mask] - I[mask] = i - - I[i] = np.argmin(distance_profile) - P[i] = distance_profile[I[i]] - if P[i] == np.inf: - I[i] = -1 + for idx in np.flatnonzero(distance_profile < P[:, -1]): + pos = np.searchsorted(P[idx], distance_profile[idx], side="right") + P[idx] = np.insert(P[idx], pos, distance_profile[idx])[:-1] + I[idx] = np.insert(I[idx], pos, i)[:-1] + + I[i, 1:] = I[i, :-1] + I[i, 0] = np.argmin(distance_profile) + P[i, 1:] = P[i, :-1] + P[i, 0] = distance_profile[I[i, 0]] + if P[i, 0] == np.inf: + I[i, 0] = -1 else: - j = I[i] - for k in range(1, min(s, l - max(i, j))): - d = dist_matrix[i + k, j + k] - if d < P[i + k]: - P[i + k] = d - I[i + k] = j + k - if d < P[j + k]: - P[j + k] = d - I[j + k] = i + k - - for k in range(1, min(s, i + 1, j + 1)): - d = dist_matrix[i - k, j - k] - if d < P[i - k]: - P[i - k] = d - I[i - k] = j - k - if d < P[j - k]: - P[j 
- k] = d - I[j - k] = i - k + j = I[i, 0] # index of 1st NN + for g in range(1, min(s, l - max(i, j))): + d = dist_matrix[i + g, j + g] + if d < P[i + g, -1]: + pos = np.searchsorted(P[i + g], d, side="right") + P[i + g] = np.insert(P[i + g], pos, d)[:-1] + I[i + g] = np.insert(I[i + g], pos, j + g)[:-1] + if d < P[j + g]: + pos = np.searchsorted(P[j + g], d, side="right") + P[j + g] = np.insert(P[j + g], pos, d)[:-1] + I[j + g] = np.insert(I[j + g], pos, i + g)[:-1] + + for g in range(1, min(s, i + 1, j + 1)): + d = dist_matrix[i - g, j - g] + if d < P[i - g, -1]: + pos = np.searchsorted(P[i - g], d, side="right") + P[i - g] = np.insert(P[i - g], pos, d)[:-1] + I[i - g] = np.insert(I[i - g], pos, j - g)[:-1] + if d < P[j - g]: + pos = np.searchsorted(P[j - g], d, side="right") + P[j - g] = np.insert(P[j - g], pos, d)[:-1] + I[j - g] = np.insert(I[j - g], pos, i - g)[:-1] + + if k == 1: + P = P.ravel() + I = I.ravel() return P, I From 1bf2fc29b0913855507e7945ce00f215ab6d8a74 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 11 Jun 2022 02:06:16 -0600 Subject: [PATCH 156/416] test_scrump passed From 647ec3ebe3f35cc5d353a1f9eae1f12dc9321ad8 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 11 Jun 2022 02:14:04 -0600 Subject: [PATCH 157/416] Add new test function for prescrump TopK matrix profile --- tests/test_scrump.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/test_scrump.py b/tests/test_scrump.py index ff96d9eee..8ba48a024 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -690,3 +690,21 @@ def test_scrump_nan_zero_mean_self_join(percentages): npt.assert_almost_equal(ref_I, comp_I) npt.assert_almost_equal(ref_left_I, comp_left_I) npt.assert_almost_equal(ref_right_I, comp_right_I) + + +@pytest.mark.parametrize("T_A, T_B", test_data) +def test_prescrump_self_join_KNN(T_A, T_B): + m = 3 + zone = int(np.ceil(m / 4)) + for k in range(2, 4): + for s in range(1, zone + 1): + seed = np.random.randint(100000) + + np.random.seed(seed) + ref_P, ref_I = naive.prescrump(T_B, m, T_B, s=s, exclusion_zone=zone, k=k) + + np.random.seed(seed) + comp_P, comp_I = prescrump(T_B, m, s=s, k=k) + + npt.assert_almost_equal(ref_P, comp_P) + npt.assert_almost_equal(ref_I, comp_I) From 6e35c7192cc35f9dbc2db08cd85d9aee0386e5f2 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 11 Jun 2022 03:49:29 -0600 Subject: [PATCH 158/416] Enhance performance prescrump to return top-k matrix profile --- stumpy/scrump.py | 191 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 140 insertions(+), 51 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 53d10b612..1c3e15c14 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -31,9 +31,10 @@ def _compute_PI( P_squared, I, excl_zone=None, + k=1, ): """ - Compute (Numba JIT-compiled) and update the squared matrix profile distance + Compute (Numba JIT-compiled) and update the squared (top-k) matrix profile distance and matrix profile indces according to the preSCRIMP algorithm Parameters @@ -78,14 +79,19 @@ def _compute_PI( `int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM))` P_squared : numpy.ndarray - The squared matrix profile + The squared (top-k) matrix profile I : numpy.ndarray - The matrix profile indices + The (top-k) matrix profile indices excl_zone : int The half width for the exclusion zone relative to the `i`. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. 
+ Returns ------- None @@ -112,58 +118,119 @@ def _compute_PI( squared_distance_profile[zone_start : zone_stop + 1] = np.inf # only for self-join - mask = squared_distance_profile < P_squared[thread_idx] - P_squared[thread_idx][mask] = squared_distance_profile[mask] - I[thread_idx][mask] = i - - I[thread_idx, i] = np.argmin(squared_distance_profile) - P_squared[thread_idx, i] = squared_distance_profile[I[thread_idx, i]] - if P_squared[thread_idx, i] == np.inf: # pragma: no cover - I[thread_idx, i] = -1 + IDX = np.flatnonzero( + squared_distance_profile < P_squared[thread_idx, :, -1] + ) + for idx in IDX: + pos = np.searchsorted( + P_squared[thread_idx, idx], + squared_distance_profile[idx], + side="right", + ) + # shifting to the right + for loc in range(k - 1, pos, -1): + P_squared[thread_idx, idx, loc] = P_squared[ + thread_idx, idx, loc - 1 + ] + I[thread_idx, idx, loc] = I[thread_idx, idx, loc - 1] + + P_squared[thread_idx, idx, pos] = squared_distance_profile[idx] + I[thread_idx, idx, pos] = i + + # shifting to the right + for loc in range(k - 1, 0, -1): + P_squared[thread_idx, i, loc] = P_squared[thread_idx, i, loc - 1] + I[thread_idx, i, loc] = I[thread_idx, i, loc - 1] + + I[thread_idx, i, 0] = np.argmin(squared_distance_profile) + P_squared[thread_idx, i, 0] = squared_distance_profile[I[thread_idx, i, 0]] + + if P_squared[thread_idx, i, 0] == np.inf: # pragma: no cover + I[thread_idx, i, 0] = -1 else: - j = I[thread_idx, i] + j = I[thread_idx, i, 0] # Given the squared distance, work backwards and compute QT - QT_j = (m - P_squared[thread_idx, i] / 2.0) * (Σ_T[j] * σ_Q[i]) + ( + QT_j = (m - P_squared[thread_idx, i, 0] / 2.0) * (Σ_T[j] * σ_Q[i]) + ( m * M_T[j] * μ_Q[i] ) QT_j_prime = QT_j - for k in range(1, min(s, l - max(i, j))): + for g in range(1, min(s, l - max(i, j))): QT_j = ( QT_j - - T_B[i + k - 1] * T_A[j + k - 1] - + T_B[i + k + m - 1] * T_A[j + k + m - 1] + - T_B[i + g - 1] * T_A[j + g - 1] + + T_B[i + g + m - 1] * T_A[j + g + m - 1] ) D_squared = core._calculate_squared_distance( m, QT_j, - M_T[i + k], - Σ_T[i + k], - μ_Q[j + k], - σ_Q[j + k], + M_T[i + g], + Σ_T[i + g], + μ_Q[j + g], + σ_Q[j + g], ) - if D_squared < P_squared[thread_idx, i + k]: - P_squared[thread_idx, i + k] = D_squared - I[thread_idx, i + k] = j + k - if D_squared < P_squared[thread_idx, j + k]: - P_squared[thread_idx, j + k] = D_squared - I[thread_idx, j + k] = i + k + if D_squared < P_squared[thread_idx, i + g, -1]: + pos = np.searchsorted( + P_squared[thread_idx, i + g], D_squared, side="right" + ) + # shifting to the right + for loc in range(k - 1, pos, -1): + P_squared[thread_idx, i + g, loc] = P_squared[ + thread_idx, i + g, loc - 1 + ] + I[thread_idx, i + g, loc] = I[thread_idx, i + g, loc - 1] + + P_squared[thread_idx, i + g, pos] = D_squared + I[thread_idx, i + g, pos] = j + g + if D_squared < P_squared[thread_idx, j + g, -1]: + pos = np.searchsorted( + P_squared[thread_idx, j + g], D_squared, side="right" + ) + # shifting to the right + for loc in range(k - 1, pos, -1): + P_squared[thread_idx, j + g, loc] = P_squared[ + thread_idx, j + g, loc - 1 + ] + I[thread_idx, j + g, loc] = I[thread_idx, j + g, loc - 1] + + P_squared[thread_idx, j + g, pos] = D_squared + I[thread_idx, j + g, pos] = i + g QT_j = QT_j_prime - for k in range(1, min(s, i + 1, j + 1)): - QT_j = QT_j - T_B[i - k + m] * T_A[j - k + m] + T_B[i - k] * T_A[j - k] + for g in range(1, min(s, i + 1, j + 1)): + QT_j = QT_j - T_B[i - g + m] * T_A[j - g + m] + T_B[i - g] * T_A[j - g] D_squared = core._calculate_squared_distance( m, 
QT_j, - M_T[i - k], - Σ_T[i - k], - μ_Q[j - k], - σ_Q[j - k], + M_T[i - g], + Σ_T[i - g], + μ_Q[j - g], + σ_Q[j - g], ) - if D_squared < P_squared[thread_idx, i - k]: - P_squared[thread_idx, i - k] = D_squared - I[thread_idx, i - k] = j - k - if D_squared < P_squared[thread_idx, j - k]: - P_squared[thread_idx, j - k] = D_squared - I[thread_idx, j - k] = i - k + if D_squared < P_squared[thread_idx, i - g, -1]: + pos = np.searchsorted( + P_squared[thread_idx, i - g], D_squared, side="right" + ) + # shifting to the right + for loc in range(k - 1, pos, -1): + P_squared[thread_idx, i - g, loc] = P_squared[ + thread_idx, i - g, loc - 1 + ] + I[thread_idx, i - g, loc] = I[thread_idx, i - g, loc - 1] + + P_squared[thread_idx, i - g, pos] = D_squared + I[thread_idx, i - g, pos] = j - g + if D_squared < P_squared[thread_idx, j - g, -1]: + pos = np.searchsorted( + P_squared[thread_idx, j - g], D_squared, side="right" + ) + # shifting to the right + for loc in range(k - 1, pos, -1): + P_squared[thread_idx, j - g, loc] = P_squared[ + thread_idx, j - g, loc - 1 + ] + I[thread_idx, j - g, loc] = I[thread_idx, j - g, loc - 1] + + P_squared[thread_idx, j - g, pos] = D_squared + I[thread_idx, j - g, pos] = i - g @njit( @@ -183,6 +250,7 @@ def _prescrump( indices, s, excl_zone=None, + k=1, ): """ A Numba JIT-compiled implementation of the preSCRIMP algorithm. @@ -232,13 +300,22 @@ def _prescrump( excl_zone : int The half width for the exclusion zone relative to the `i`. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. + Returns ------- out1 : numpy.ndarray - Matrix profile + The (top-k) Matrix profile. When k = 1 (default), the first and only column + consists of the matrix profile. However, when k > 1, the output has exacly + k columns consist of the top-k matrix profile. out2 : numpy.ndarray - Matrix profile indices + The (top-k) Matrix profile. When k = 1 (default), the first and only column + consists of the matrix profile indices. However, when k > 1, the output has + exacly k columns consist of the top-k matrix profile indices. 
Notes ----- @@ -249,8 +326,8 @@ def _prescrump( """ n_threads = numba.config.NUMBA_NUM_THREADS l = T_A.shape[0] - m + 1 - P_squared = np.full((n_threads, l), np.inf, dtype=np.float64) - I = np.full((n_threads, l), -1, dtype=np.int64) + P_squared = np.full((n_threads, l, k), np.inf, dtype=np.float64) + I = np.full((n_threads, l, k), -1, dtype=np.int64) idx_ranges = core._get_ranges(len(indices), n_threads, truncate=False) for thread_idx in prange(n_threads): @@ -270,23 +347,21 @@ def _prescrump( P_squared, I, excl_zone, + k, ) for thread_idx in range(1, n_threads): - for i in range(l): - if P_squared[thread_idx, i] < P_squared[0, i]: - P_squared[0, i] = P_squared[thread_idx, i] - I[0, i] = I[thread_idx, i] + core._merge_topk_PI(P_squared[0], P_squared[thread_idx], I[0], I[thread_idx]) return np.sqrt(P_squared[0]), I[0] @core.non_normalized(scraamp.prescraamp) -def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0): +def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0, k=1): """ A convenience wrapper around the Numba JIT-compiled parallelized `_prescrump` - function which computes the approximate matrix profile according to the preSCRIMP - algorithm + function which computes the approximate (top-k) matrix profile according to + the preSCRIMP algorithm Parameters ---------- @@ -313,13 +388,22 @@ def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0): The p-norm to apply for computing the Minkowski distance. This parameter is ignored when `normalize == True`. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. + Returns ------- P : numpy.ndarray - Matrix profile + The (top-k) Matrix profile. When k = 1 (default), it is a 1d array. However, + when k > 1, it is a 2d array with exacly `k` columns consist of the top-k + matrix profile. I : numpy.ndarray - Matrix profile indices + The (top-k) Matrix profile indices. When k = 1 (default), it is a 1d array. + However, when k > 1, it is a 2d array with exacly `k` columns consist of + the top-k matrix profile indices. 
Notes ----- @@ -355,8 +439,13 @@ def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0): indices, s, excl_zone, + k, ) + if k == 1: + P = P.ravel() + I = I.ravel() + return P, I From 5f9ce865ecc91a99562994d9ecc45787cbc754d7 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 11 Jun 2022 03:54:18 -0600 Subject: [PATCH 159/416] Add test function for top-k feature of prescrump AB join --- tests/test_scrump.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/test_scrump.py b/tests/test_scrump.py index 8ba48a024..0989010ce 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -708,3 +708,21 @@ def test_prescrump_self_join_KNN(T_A, T_B): npt.assert_almost_equal(ref_P, comp_P) npt.assert_almost_equal(ref_I, comp_I) + + +@pytest.mark.parametrize("T_A, T_B", test_data) +def test_prescrump_A_B_join_KNN(T_A, T_B): + m = 3 + zone = int(np.ceil(m / 4)) + for k in range(2, 4): + for s in range(1, zone + 1): + seed = np.random.randint(100000) + + np.random.seed(seed) + ref_P, ref_I = naive.prescrump(T_A, m, T_B, s=s) + + np.random.seed(seed) + comp_P, comp_I = prescrump(T_A, m, T_B=T_B, s=s) + + npt.assert_almost_equal(ref_P, comp_P) + npt.assert_almost_equal(ref_I, comp_I) From 4096b261cfc7bb139e107ee028ea69d8d889b920 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 11 Jun 2022 16:12:20 -0600 Subject: [PATCH 160/416] Temporarily added parameter k to prescraamp to pass non normalized decorator test --- stumpy/scraamp.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/stumpy/scraamp.py b/stumpy/scraamp.py index 4bcccd508..44a8b7c83 100644 --- a/stumpy/scraamp.py +++ b/stumpy/scraamp.py @@ -270,7 +270,8 @@ def _prescraamp( return np.power(P_NORM[0], 1.0 / p), I[0] -def prescraamp(T_A, m, T_B=None, s=None, p=2.0): +def prescraamp(T_A, m, T_B=None, s=None, p=2.0, k=1): + # this function should be modified so that it can return top-k matrix profile """ A convenience wrapper around the Numba JIT-compiled parallelized `_prescraamp` function which computes the approximate matrix profile according to the @@ -295,6 +296,11 @@ def prescraamp(T_A, m, T_B=None, s=None, p=2.0): p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. + Returns ------- P : numpy.ndarray From c85357e9ec5b957fd98029c7d23c298b37cee1db Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 11 Jun 2022 17:20:04 -0600 Subject: [PATCH 161/416] Refactored --- stumpy/scrump.py | 95 ++++++++++++++++++++++-------------------------- 1 file changed, 44 insertions(+), 51 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 1c3e15c14..5b86b02e6 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -14,6 +14,32 @@ logger = logging.getLogger(__name__) +@njit +def _insert(a, idx, v): + """ + Insert value `v` into array `a` at index `idx` (in place) and throw away + the last element (i.e. 
not changing the length of original array) + + Parameters + ---------- + a: numpy.ndarray + a 1d array + + idx: int + the index at which the value `v` should be inserted + + v: float + the value that should be inserted into array `a` at index `idx` + + Returns + ------- + None + """ + for i in range(a.shape[0] - 1, idx, -1): + a[i] = a[i - 1] + a[idx] = v + + @njit(fastmath=True) def _compute_PI( T_A, @@ -128,22 +154,14 @@ def _compute_PI( side="right", ) # shifting to the right - for loc in range(k - 1, pos, -1): - P_squared[thread_idx, idx, loc] = P_squared[ - thread_idx, idx, loc - 1 - ] - I[thread_idx, idx, loc] = I[thread_idx, idx, loc - 1] - - P_squared[thread_idx, idx, pos] = squared_distance_profile[idx] - I[thread_idx, idx, pos] = i - - # shifting to the right - for loc in range(k - 1, 0, -1): - P_squared[thread_idx, i, loc] = P_squared[thread_idx, i, loc - 1] - I[thread_idx, i, loc] = I[thread_idx, i, loc - 1] + _insert( + P_squared[thread_idx, idx, :], pos, squared_distance_profile[idx] + ) + _insert(I[thread_idx, idx, :], pos, i) - I[thread_idx, i, 0] = np.argmin(squared_distance_profile) - P_squared[thread_idx, i, 0] = squared_distance_profile[I[thread_idx, i, 0]] + idx = np.argmin(squared_distance_profile) + _insert(P_squared[thread_idx, i, :], 0, squared_distance_profile[idx]) + _insert(I[thread_idx, i, :], 0, idx) if P_squared[thread_idx, i, 0] == np.inf: # pragma: no cover I[thread_idx, i, 0] = -1 @@ -172,28 +190,16 @@ def _compute_PI( pos = np.searchsorted( P_squared[thread_idx, i + g], D_squared, side="right" ) - # shifting to the right - for loc in range(k - 1, pos, -1): - P_squared[thread_idx, i + g, loc] = P_squared[ - thread_idx, i + g, loc - 1 - ] - I[thread_idx, i + g, loc] = I[thread_idx, i + g, loc - 1] - - P_squared[thread_idx, i + g, pos] = D_squared - I[thread_idx, i + g, pos] = j + g + _insert(P_squared[thread_idx, i + g, :], pos, D_squared) + _insert(I[thread_idx, i + g, :], pos, j + g) + if D_squared < P_squared[thread_idx, j + g, -1]: pos = np.searchsorted( P_squared[thread_idx, j + g], D_squared, side="right" ) - # shifting to the right - for loc in range(k - 1, pos, -1): - P_squared[thread_idx, j + g, loc] = P_squared[ - thread_idx, j + g, loc - 1 - ] - I[thread_idx, j + g, loc] = I[thread_idx, j + g, loc - 1] - - P_squared[thread_idx, j + g, pos] = D_squared - I[thread_idx, j + g, pos] = i + g + _insert(P_squared[thread_idx, j + g, :], pos, D_squared) + _insert(I[thread_idx, j + g, :], pos, i + g) + QT_j = QT_j_prime for g in range(1, min(s, i + 1, j + 1)): QT_j = QT_j - T_B[i - g + m] * T_A[j - g + m] + T_B[i - g] * T_A[j - g] @@ -209,28 +215,15 @@ def _compute_PI( pos = np.searchsorted( P_squared[thread_idx, i - g], D_squared, side="right" ) - # shifting to the right - for loc in range(k - 1, pos, -1): - P_squared[thread_idx, i - g, loc] = P_squared[ - thread_idx, i - g, loc - 1 - ] - I[thread_idx, i - g, loc] = I[thread_idx, i - g, loc - 1] - - P_squared[thread_idx, i - g, pos] = D_squared - I[thread_idx, i - g, pos] = j - g + _insert(P_squared[thread_idx, i - g, :], pos, D_squared) + _insert(I[thread_idx, i - g, :], pos, j - g) + if D_squared < P_squared[thread_idx, j - g, -1]: pos = np.searchsorted( P_squared[thread_idx, j - g], D_squared, side="right" ) - # shifting to the right - for loc in range(k - 1, pos, -1): - P_squared[thread_idx, j - g, loc] = P_squared[ - thread_idx, j - g, loc - 1 - ] - I[thread_idx, j - g, loc] = I[thread_idx, j - g, loc - 1] - - P_squared[thread_idx, j - g, pos] = D_squared - I[thread_idx, j - g, pos] = i - g + 
_insert(P_squared[thread_idx, j - g, :], pos, D_squared) + _insert(I[thread_idx, j - g, :], pos, i - g) @njit( From e9a61bea21c1f2cc20781d78ba3110f11ca727e7 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 11 Jun 2022 18:47:09 -0600 Subject: [PATCH 162/416] Confirmed Full test and coverage passing From c0b05ed91b46d09910d29fe03fb1ac72aa4d3435 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 11 Jun 2022 18:49:11 -0600 Subject: [PATCH 163/416] Removed wrong comment --- stumpy/scrump.py | 1 - 1 file changed, 1 deletion(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 5b86b02e6..997e4552c 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -153,7 +153,6 @@ def _compute_PI( squared_distance_profile[idx], side="right", ) - # shifting to the right _insert( P_squared[thread_idx, idx, :], pos, squared_distance_profile[idx] ) From 820408d22f477d863b23b64bd5a4bb8456abf4d8 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 12 Jun 2022 16:45:27 -0600 Subject: [PATCH 164/416] Move function to stumpy.core --- stumpy/core.py | 26 +++++++++++++++++++++++++ stumpy/scrump.py | 50 ++++++++++++------------------------------------ 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 535471761..f9ad2a06c 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2604,3 +2604,29 @@ def _merge_topk_PI(PA, PB, IA, IB): start = idx stop += 1 # because of shifting elements to the right by one + + +@njit +def _insert(a, idx, v): + """ + Insert value `v` into array `a` at index `idx` (in place) and throw away + the last element (i.e. not changing the length of original array) + + Parameters + ---------- + a: numpy.ndarray + a 1d array + + idx: int + the index at which the value `v` should be inserted + + v: float + the value that should be inserted into array `a` at index `idx` + + Returns + ------- + None + """ + for i in range(a.shape[0] - 1, idx, -1): + a[i] = a[i - 1] + a[idx] = v diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 997e4552c..fe4ea09da 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -14,32 +14,6 @@ logger = logging.getLogger(__name__) -@njit -def _insert(a, idx, v): - """ - Insert value `v` into array `a` at index `idx` (in place) and throw away - the last element (i.e. 
not changing the length of original array) - - Parameters - ---------- - a: numpy.ndarray - a 1d array - - idx: int - the index at which the value `v` should be inserted - - v: float - the value that should be inserted into array `a` at index `idx` - - Returns - ------- - None - """ - for i in range(a.shape[0] - 1, idx, -1): - a[i] = a[i - 1] - a[idx] = v - - @njit(fastmath=True) def _compute_PI( T_A, @@ -153,14 +127,14 @@ def _compute_PI( squared_distance_profile[idx], side="right", ) - _insert( + core._insert( P_squared[thread_idx, idx, :], pos, squared_distance_profile[idx] ) - _insert(I[thread_idx, idx, :], pos, i) + core._insert(I[thread_idx, idx, :], pos, i) idx = np.argmin(squared_distance_profile) - _insert(P_squared[thread_idx, i, :], 0, squared_distance_profile[idx]) - _insert(I[thread_idx, i, :], 0, idx) + core._insert(P_squared[thread_idx, i, :], 0, squared_distance_profile[idx]) + core._insert(I[thread_idx, i, :], 0, idx) if P_squared[thread_idx, i, 0] == np.inf: # pragma: no cover I[thread_idx, i, 0] = -1 @@ -189,15 +163,15 @@ def _compute_PI( pos = np.searchsorted( P_squared[thread_idx, i + g], D_squared, side="right" ) - _insert(P_squared[thread_idx, i + g, :], pos, D_squared) - _insert(I[thread_idx, i + g, :], pos, j + g) + core._insert(P_squared[thread_idx, i + g, :], pos, D_squared) + core._insert(I[thread_idx, i + g, :], pos, j + g) if D_squared < P_squared[thread_idx, j + g, -1]: pos = np.searchsorted( P_squared[thread_idx, j + g], D_squared, side="right" ) - _insert(P_squared[thread_idx, j + g, :], pos, D_squared) - _insert(I[thread_idx, j + g, :], pos, i + g) + core._insert(P_squared[thread_idx, j + g, :], pos, D_squared) + core._insert(I[thread_idx, j + g, :], pos, i + g) QT_j = QT_j_prime for g in range(1, min(s, i + 1, j + 1)): @@ -214,15 +188,15 @@ def _compute_PI( pos = np.searchsorted( P_squared[thread_idx, i - g], D_squared, side="right" ) - _insert(P_squared[thread_idx, i - g, :], pos, D_squared) - _insert(I[thread_idx, i - g, :], pos, j - g) + core._insert(P_squared[thread_idx, i - g, :], pos, D_squared) + core._insert(I[thread_idx, i - g, :], pos, j - g) if D_squared < P_squared[thread_idx, j - g, -1]: pos = np.searchsorted( P_squared[thread_idx, j - g], D_squared, side="right" ) - _insert(P_squared[thread_idx, j - g, :], pos, D_squared) - _insert(I[thread_idx, j - g, :], pos, i - g) + core._insert(P_squared[thread_idx, j - g, :], pos, D_squared) + core._insert(I[thread_idx, j - g, :], pos, i - g) @njit( From 9f1fc8a7609a438105dde6a8bd3c998ec17d54a5 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 12 Jun 2022 16:55:00 -0600 Subject: [PATCH 165/416] replace for-loop with Advanced indexing --- stumpy/core.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index f9ad2a06c..35f9c8f92 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2627,6 +2627,5 @@ def _insert(a, idx, v): ------- None """ - for i in range(a.shape[0] - 1, idx, -1): - a[i] = a[i - 1] + a[idx + 1 :] = a[idx:-1] a[idx] = v From 0744378e4889ccd0af137bb61f9cf9fe063c182b Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 12 Jun 2022 16:56:49 -0600 Subject: [PATCH 166/416] Improved Docstring --- stumpy/core.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stumpy/core.py b/stumpy/core.py index 35f9c8f92..651cc3e35 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2618,7 +2618,8 @@ def _insert(a, idx, v): a 1d array idx: int - the index at which the value `v` should be inserted + the index at which the value `v` should be 
inserted. This can be any + integer number from `0` to `len(a) - 1` v: float the value that should be inserted into array `a` at index `idx` From e13fb7ab2a03b429efbaec1ada03bd80acdeb939 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 12 Jun 2022 17:20:02 -0600 Subject: [PATCH 167/416] Added test function --- tests/test_core.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/test_core.py b/tests/test_core.py index 7998c042d..4934a684d 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1087,3 +1087,17 @@ def test_merge_topk_PI(): npt.assert_array_equal(ref_P, comp_P) npt.assert_array_equal(ref_I, comp_I) + + +def test_insert(): + for k in range(1, 6): + ref_A = np.random.rand(k) + comp_A = ref_A.copy() + + insert_idx = np.arange(k) + values = np.random.rand(k) + for (idx, v) in zip(insert_idx, values): + ref_A = np.insert(ref_A, idx, v)[:-1] + core._insert(comp_A, idx, v) # updating comp_A + + npt.assert_array_equal(ref_A, comp_A) From 522b5ec9bb07f3e3f3bea362bad688ae3fb0177d Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 12 Jun 2022 18:36:13 -0600 Subject: [PATCH 168/416] minor change in test function --- tests/test_core.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 4934a684d..d9eb3c33e 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1091,13 +1091,15 @@ def test_merge_topk_PI(): def test_insert(): for k in range(1, 6): - ref_A = np.random.rand(k) - comp_A = ref_A.copy() + a = np.random.rand(k) - insert_idx = np.arange(k) + indices = np.arange(k) values = np.random.rand(k) - for (idx, v) in zip(insert_idx, values): - ref_A = np.insert(ref_A, idx, v)[:-1] - core._insert(comp_A, idx, v) # updating comp_A + for (idx, v) in zip(indices, values): + ref = a.copy() + comp = a.copy() - npt.assert_array_equal(ref_A, comp_A) + ref = np.insert(ref, idx, v)[:-1] + core._insert(comp, idx, v) # updating comp_A + + npt.assert_array_equal(ref, comp) From 736bf6befa23fceef1a811d0ed55f5f9058c8258 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 13 Jun 2022 18:13:27 -0600 Subject: [PATCH 169/416] Revised docstrings --- stumpy/core.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 651cc3e35..860c6ec41 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2565,7 +2565,8 @@ def _select_P_ABBA_value(P_ABBA, k, custom_func=None): def _merge_topk_PI(PA, PB, IA, IB): """ Merge two top-k matrix profiles PA and PB, and update PA (in place) while - prioritizing values of PA in ties. Also, update IA accordingly. + always choosing values of PA over values of PB in case of ties. Also, update + IA accordingly. Parameters ---------- @@ -2609,8 +2610,8 @@ def _merge_topk_PI(PA, PB, IA, IB): @njit def _insert(a, idx, v): """ - Insert value `v` into array `a` at index `idx` (in place) and throw away - the last element (i.e. not changing the length of original array) + Insert value `v` into array `a` at index `idx` (in place) and discard + the last element (i.e. 
without changing the length of `a`) Parameters ---------- From 5d265daaead86941c6b4c1757915075ae32ee4c9 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 13 Jun 2022 18:27:00 -0600 Subject: [PATCH 170/416] Renamed function to make it more specific --- stumpy/core.py | 2 +- stumpy/scrump.py | 38 ++++++++++++++++++++++++++------------ 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 860c6ec41..c4bc4cd15 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2608,7 +2608,7 @@ def _merge_topk_PI(PA, PB, IA, IB): @njit -def _insert(a, idx, v): +def _shift_at_index_and_insert(a, idx, v): """ Insert value `v` into array `a` at index `idx` (in place) and discard the last element (i.e. without changing the length of `a`) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index fe4ea09da..908607ee4 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -127,14 +127,16 @@ def _compute_PI( squared_distance_profile[idx], side="right", ) - core._insert( + core._shift_at_index_and_insert( P_squared[thread_idx, idx, :], pos, squared_distance_profile[idx] ) - core._insert(I[thread_idx, idx, :], pos, i) + core._shift_at_index_and_insert(I[thread_idx, idx, :], pos, i) idx = np.argmin(squared_distance_profile) - core._insert(P_squared[thread_idx, i, :], 0, squared_distance_profile[idx]) - core._insert(I[thread_idx, i, :], 0, idx) + core._shift_at_index_and_insert( + P_squared[thread_idx, i, :], 0, squared_distance_profile[idx] + ) + core._shift_at_index_and_insert(I[thread_idx, i, :], 0, idx) if P_squared[thread_idx, i, 0] == np.inf: # pragma: no cover I[thread_idx, i, 0] = -1 @@ -163,15 +165,19 @@ def _compute_PI( pos = np.searchsorted( P_squared[thread_idx, i + g], D_squared, side="right" ) - core._insert(P_squared[thread_idx, i + g, :], pos, D_squared) - core._insert(I[thread_idx, i + g, :], pos, j + g) + core._shift_at_index_and_insert( + P_squared[thread_idx, i + g, :], pos, D_squared + ) + core._shift_at_index_and_insert(I[thread_idx, i + g, :], pos, j + g) if D_squared < P_squared[thread_idx, j + g, -1]: pos = np.searchsorted( P_squared[thread_idx, j + g], D_squared, side="right" ) - core._insert(P_squared[thread_idx, j + g, :], pos, D_squared) - core._insert(I[thread_idx, j + g, :], pos, i + g) + core._shift_at_index_and_insert( + P_squared[thread_idx, j + g, :], pos, D_squared + ) + core._shift_at_index_and_insert(I[thread_idx, j + g, :], pos, i + g) QT_j = QT_j_prime for g in range(1, min(s, i + 1, j + 1)): @@ -188,15 +194,23 @@ def _compute_PI( pos = np.searchsorted( P_squared[thread_idx, i - g], D_squared, side="right" ) - core._insert(P_squared[thread_idx, i - g, :], pos, D_squared) - core._insert(I[thread_idx, i - g, :], pos, j - g) + core._shift_at_index_and_shift_at_index_and_insert( + P_squared[thread_idx, i - g, :], pos, D_squared + ) + core._shift_at_index_and_shift_at_index_and_insert( + I[thread_idx, i - g, :], pos, j - g + ) if D_squared < P_squared[thread_idx, j - g, -1]: pos = np.searchsorted( P_squared[thread_idx, j - g], D_squared, side="right" ) - core._insert(P_squared[thread_idx, j - g, :], pos, D_squared) - core._insert(I[thread_idx, j - g, :], pos, i - g) + core._shift_at_index_and_shift_at_index_and_insert( + P_squared[thread_idx, j - g, :], pos, D_squared + ) + core._shift_at_index_and_shift_at_index_and_insert( + I[thread_idx, j - g, :], pos, i - g + ) @njit( From 4db2de1a32979226f79855d0da39e5dcf6df2246 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 13 Jun 2022 18:33:36 -0600 Subject: [PATCH 171/416] Added if to check 
input parameter --- stumpy/core.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index c4bc4cd15..f7b8d4ee4 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2629,5 +2629,6 @@ def _shift_at_index_and_insert(a, idx, v): ------- None """ - a[idx + 1 :] = a[idx:-1] - a[idx] = v + if idx < len(a): + a[idx + 1 :] = a[idx:-1] + a[idx] = v From 16d02f2364661e3fa4b7ab1bd6c094b1cb60e61a Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 13 Jun 2022 18:49:08 -0600 Subject: [PATCH 172/416] Revise test function - rename functions - consider edge case in testing --- tests/test_core.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index d9eb3c33e..25416dbcd 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1089,17 +1089,17 @@ def test_merge_topk_PI(): npt.assert_array_equal(ref_I, comp_I) -def test_insert(): +def test_shift_at_index_and_insert(): for k in range(1, 6): a = np.random.rand(k) - indices = np.arange(k) - values = np.random.rand(k) + indices = np.arange(k + 1) + values = np.random.rand(k + 1) for (idx, v) in zip(indices, values): ref = a.copy() comp = a.copy() ref = np.insert(ref, idx, v)[:-1] - core._insert(comp, idx, v) # updating comp_A + core._shift_at_index_and_insert(comp, idx, v) # update comp in place npt.assert_array_equal(ref, comp) From 29894f2c8186a3bc9978b002564e008ead1e6ffc Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 13 Jun 2022 19:36:20 -0600 Subject: [PATCH 173/416] Removed unnecessary trailing colon --- stumpy/scrump.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 908607ee4..908aed5bd 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -128,15 +128,15 @@ def _compute_PI( side="right", ) core._shift_at_index_and_insert( - P_squared[thread_idx, idx, :], pos, squared_distance_profile[idx] + P_squared[thread_idx, idx], pos, squared_distance_profile[idx] ) - core._shift_at_index_and_insert(I[thread_idx, idx, :], pos, i) + core._shift_at_index_and_insert(I[thread_idx, idx], pos, i) idx = np.argmin(squared_distance_profile) core._shift_at_index_and_insert( - P_squared[thread_idx, i, :], 0, squared_distance_profile[idx] + P_squared[thread_idx, i], 0, squared_distance_profile[idx] ) - core._shift_at_index_and_insert(I[thread_idx, i, :], 0, idx) + core._shift_at_index_and_insert(I[thread_idx, i], 0, idx) if P_squared[thread_idx, i, 0] == np.inf: # pragma: no cover I[thread_idx, i, 0] = -1 @@ -166,18 +166,18 @@ def _compute_PI( P_squared[thread_idx, i + g], D_squared, side="right" ) core._shift_at_index_and_insert( - P_squared[thread_idx, i + g, :], pos, D_squared + P_squared[thread_idx, i + g], pos, D_squared ) - core._shift_at_index_and_insert(I[thread_idx, i + g, :], pos, j + g) + core._shift_at_index_and_insert(I[thread_idx, i + g], pos, j + g) if D_squared < P_squared[thread_idx, j + g, -1]: pos = np.searchsorted( P_squared[thread_idx, j + g], D_squared, side="right" ) core._shift_at_index_and_insert( - P_squared[thread_idx, j + g, :], pos, D_squared + P_squared[thread_idx, j + g], pos, D_squared ) - core._shift_at_index_and_insert(I[thread_idx, j + g, :], pos, i + g) + core._shift_at_index_and_insert(I[thread_idx, j + g], pos, i + g) QT_j = QT_j_prime for g in range(1, min(s, i + 1, j + 1)): @@ -195,10 +195,10 @@ def _compute_PI( P_squared[thread_idx, i - g], D_squared, side="right" ) core._shift_at_index_and_shift_at_index_and_insert( - 
P_squared[thread_idx, i - g, :], pos, D_squared + P_squared[thread_idx, i - g], pos, D_squared ) core._shift_at_index_and_shift_at_index_and_insert( - I[thread_idx, i - g, :], pos, j - g + I[thread_idx, i - g], pos, j - g ) if D_squared < P_squared[thread_idx, j - g, -1]: @@ -206,10 +206,10 @@ def _compute_PI( P_squared[thread_idx, j - g], D_squared, side="right" ) core._shift_at_index_and_shift_at_index_and_insert( - P_squared[thread_idx, j - g, :], pos, D_squared + P_squared[thread_idx, j - g], pos, D_squared ) core._shift_at_index_and_shift_at_index_and_insert( - I[thread_idx, j - g, :], pos, i - g + I[thread_idx, j - g], pos, i - g ) From 12d02aa9b58e9f4a1d1fe1c3f0fe113c2cea47c4 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 14 Jun 2022 00:42:56 -0600 Subject: [PATCH 174/416] rename variable to improve readability --- stumpy/scrump.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 908aed5bd..263879057 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -153,7 +153,7 @@ def _compute_PI( - T_B[i + g - 1] * T_A[j + g - 1] + T_B[i + g + m - 1] * T_A[j + g + m - 1] ) - D_squared = core._calculate_squared_distance( + d_squared = core._calculate_squared_distance( m, QT_j, M_T[i + g], @@ -161,28 +161,28 @@ def _compute_PI( μ_Q[j + g], σ_Q[j + g], ) - if D_squared < P_squared[thread_idx, i + g, -1]: + if d_squared < P_squared[thread_idx, i + g, -1]: pos = np.searchsorted( - P_squared[thread_idx, i + g], D_squared, side="right" + P_squared[thread_idx, i + g], d_squared, side="right" ) core._shift_at_index_and_insert( - P_squared[thread_idx, i + g], pos, D_squared + P_squared[thread_idx, i + g], pos, d_squared ) core._shift_at_index_and_insert(I[thread_idx, i + g], pos, j + g) - if D_squared < P_squared[thread_idx, j + g, -1]: + if d_squared < P_squared[thread_idx, j + g, -1]: pos = np.searchsorted( - P_squared[thread_idx, j + g], D_squared, side="right" + P_squared[thread_idx, j + g], d_squared, side="right" ) core._shift_at_index_and_insert( - P_squared[thread_idx, j + g], pos, D_squared + P_squared[thread_idx, j + g], pos, d_squared ) core._shift_at_index_and_insert(I[thread_idx, j + g], pos, i + g) QT_j = QT_j_prime for g in range(1, min(s, i + 1, j + 1)): QT_j = QT_j - T_B[i - g + m] * T_A[j - g + m] + T_B[i - g] * T_A[j - g] - D_squared = core._calculate_squared_distance( + d_squared = core._calculate_squared_distance( m, QT_j, M_T[i - g], @@ -190,23 +190,23 @@ def _compute_PI( μ_Q[j - g], σ_Q[j - g], ) - if D_squared < P_squared[thread_idx, i - g, -1]: + if d_squared < P_squared[thread_idx, i - g, -1]: pos = np.searchsorted( - P_squared[thread_idx, i - g], D_squared, side="right" + P_squared[thread_idx, i - g], d_squared, side="right" ) core._shift_at_index_and_shift_at_index_and_insert( - P_squared[thread_idx, i - g], pos, D_squared + P_squared[thread_idx, i - g], pos, d_squared ) core._shift_at_index_and_shift_at_index_and_insert( I[thread_idx, i - g], pos, j - g ) - if D_squared < P_squared[thread_idx, j - g, -1]: + if d_squared < P_squared[thread_idx, j - g, -1]: pos = np.searchsorted( - P_squared[thread_idx, j - g], D_squared, side="right" + P_squared[thread_idx, j - g], d_squared, side="right" ) core._shift_at_index_and_shift_at_index_and_insert( - P_squared[thread_idx, j - g], pos, D_squared + P_squared[thread_idx, j - g], pos, d_squared ) core._shift_at_index_and_shift_at_index_and_insert( I[thread_idx, j - g], pos, i - g From 3818cf622576b635ee8f6bf4ec623bd13271ec7d Mon Sep 17 
00:00:00 2001 From: ninimama Date: Tue, 14 Jun 2022 00:49:07 -0600 Subject: [PATCH 175/416] Revised performant and naive version of prescrump - allowed prescrump to allow 2d array when k is 1 - revised/improve docstrings --- stumpy/scrump.py | 24 ++++++++++-------------- tests/naive.py | 4 ---- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 263879057..77f06021f 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -289,13 +289,13 @@ def _prescrump( ------- out1 : numpy.ndarray The (top-k) Matrix profile. When k = 1 (default), the first and only column - consists of the matrix profile. However, when k > 1, the output has exacly - k columns consist of the top-k matrix profile. + consists of the matrix profile. When k > 1, the output has exacly k columns + consist of the top-k matrix profile. out2 : numpy.ndarray The (top-k) Matrix profile. When k = 1 (default), the first and only column - consists of the matrix profile indices. However, when k > 1, the output has - exacly k columns consist of the top-k matrix profile indices. + consists of the matrix profile indices. When k > 1, the output has exacly + k columns consist of the top-k matrix profile indices. Notes ----- @@ -376,14 +376,14 @@ def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0, k=1): Returns ------- P : numpy.ndarray - The (top-k) Matrix profile. When k = 1 (default), it is a 1d array. However, - when k > 1, it is a 2d array with exacly `k` columns consist of the top-k - matrix profile. + The (top-k) Matrix profile. When k = 1 (default), the first and only column + consists of the matrix profile. When k > 1, the output has exacly k columns + consist of the top-k matrix profile. I : numpy.ndarray - The (top-k) Matrix profile indices. When k = 1 (default), it is a 1d array. - However, when k > 1, it is a 2d array with exacly `k` columns consist of - the top-k matrix profile indices. + The (top-k) Matrix profile. When k = 1 (default), the first and only column + consists of the matrix profile indices. When k > 1, the output has exacly + k columns consist of the top-k matrix profile indices. 
Notes ----- @@ -422,10 +422,6 @@ def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0, k=1): k, ) - if k == 1: - P = P.ravel() - I = I.ravel() - return P, I diff --git a/tests/naive.py b/tests/naive.py index b742c86dd..40b9d71e2 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1453,10 +1453,6 @@ def prescrump(T_A, m, T_B, s, exclusion_zone=None, k=1): P[j - g] = np.insert(P[j - g], pos, d)[:-1] I[j - g] = np.insert(I[j - g], pos, i - g)[:-1] - if k == 1: - P = P.ravel() - I = I.ravel() - return P, I From 241dee9281e9d1fcd72d104f0089d0ff3fcb52fc Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 14 Jun 2022 00:54:55 -0600 Subject: [PATCH 176/416] Add comments and reminders to improve readability --- stumpy/scrump.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 77f06021f..165692317 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -113,11 +113,31 @@ def _compute_PI( squared_distance_profile[:] = core._mass(Q, T_B, QT, μ_Q[i], σ_Q[i], M_T, Σ_T) squared_distance_profile[:] = np.square(squared_distance_profile) if excl_zone is not None: + # self-join zone_start = max(0, i - excl_zone) zone_stop = min(l, i + excl_zone) squared_distance_profile[zone_start : zone_stop + 1] = np.inf - # only for self-join + # Reminder(1): this `squared_distance_profile` is the (square of) distance profile + # that corresponds to `S_i`, the subsequence with start index `i`. + + # Reminder(2): `P_squared[thread_idx, index, :]` should contain the (approx.) + # TopK distance between `S_idx` to its neighbors (in thread_idx). And, + # these distances are sorted ascendingly. so, `P_squared[thread_idx, index, 0]` + # is smallest and `P_squared[thread_idx, index, -1]` is the largest in the array + # `P_squared[thread_idx, index, :]` + + # The value `d_squared = squared_distance_profile[idx]` is the squared-distance + # between `S_i` and the `S_idx`. Therefore, `d_squared` is the squared_distance + # from `S_idx` to one of its neighbors, `S_i`. If `d_squared` is less than + # `P_squared[thread_idx, idx, -1]`, then that means the so-far-discovered TopK + # for `S_idx` (i.e. `P_squared[thread_idx, idx, :]`) MUST be updated! + + # note: further explanation + # `squared_distance_profile` (of `S_i`) is actually the `i`-th row of + # Squared-Distance-Matrix. Its idx-th element (which is in idx-th column), + # is `d_squared = squared_distance_profile[idx]`. If `d_squared < P_squared[thread_idx, idx, -1]`, + # then `P_squared[thread_idx, idx, :]` MUST be updated. IDX = np.flatnonzero( squared_distance_profile < P_squared[thread_idx, :, -1] ) @@ -132,11 +152,16 @@ def _compute_PI( ) core._shift_at_index_and_insert(I[thread_idx, idx], pos, i) + # find EXACT (not approx.) value of `P_squared[thread_idx, i, 0]` idx = np.argmin(squared_distance_profile) core._shift_at_index_and_insert( P_squared[thread_idx, i], 0, squared_distance_profile[idx] ) core._shift_at_index_and_insert(I[thread_idx, i], 0, idx) + # [note] EXACT (not approx.) values of `P_squared[thread_idx, i, :]`` + # (not just its 0-th element) can be found by doing something like + # `np.sort(squared_distance_profile)[:k]`. However, it can increase the + # computing time, and thus this is avoided here. 
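# A value of np.inf left in P_squared[thread_idx, i, 0] means no finite
# neighbor was found for subsequence `i` (e.g. every candidate fell inside
# the exclusion zone), so its index stays at the -1 "no match" sentinel below.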
if P_squared[thread_idx, i, 0] == np.inf: # pragma: no cover I[thread_idx, i, 0] = -1 From 99095b8b4de8559a2977521e35989f83411dc46f Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 14 Jun 2022 01:15:17 -0600 Subject: [PATCH 177/416] minor changes to improve readability --- stumpy/scrump.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 165692317..db5683c29 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -133,7 +133,7 @@ def _compute_PI( # `P_squared[thread_idx, idx, -1]`, then that means the so-far-discovered TopK # for `S_idx` (i.e. `P_squared[thread_idx, idx, :]`) MUST be updated! - # note: further explanation + # note: further explanation! # `squared_distance_profile` (of `S_i`) is actually the `i`-th row of # Squared-Distance-Matrix. Its idx-th element (which is in idx-th column), # is `d_squared = squared_distance_profile[idx]`. If `d_squared < P_squared[thread_idx, idx, -1]`, @@ -142,23 +142,18 @@ def _compute_PI( squared_distance_profile < P_squared[thread_idx, :, -1] ) for idx in IDX: - pos = np.searchsorted( - P_squared[thread_idx, idx], - squared_distance_profile[idx], - side="right", - ) - core._shift_at_index_and_insert( - P_squared[thread_idx, idx], pos, squared_distance_profile[idx] - ) + d_squared = squared_distance_profile[idx] + pos = np.searchsorted(P_squared[thread_idx, idx], d_squared, side="right") + core._shift_at_index_and_insert(P_squared[thread_idx, idx], pos, d_squared) core._shift_at_index_and_insert(I[thread_idx, idx], pos, i) # find EXACT (not approx.) value of `P_squared[thread_idx, i, 0]` - idx = np.argmin(squared_distance_profile) + nn_of_i = np.argmin(squared_distance_profile) core._shift_at_index_and_insert( - P_squared[thread_idx, i], 0, squared_distance_profile[idx] + P_squared[thread_idx, i], 0, squared_distance_profile[nn_of_i] ) - core._shift_at_index_and_insert(I[thread_idx, i], 0, idx) - # [note] EXACT (not approx.) values of `P_squared[thread_idx, i, :]`` + core._shift_at_index_and_insert(I[thread_idx, i], 0, nn_of_i) + # [note] EXACT (not approx.) values of `P_squared[thread_idx, i, :]` # (not just its 0-th element) can be found by doing something like # `np.sort(squared_distance_profile)[:k]`. However, it can increase the # computing time, and thus this is avoided here. From 7a593b1f5a9ac3265d70b87aafd6741a579db189 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 14 Jun 2022 09:09:33 -0600 Subject: [PATCH 178/416] Revised comments --- stumpy/scrump.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index db5683c29..b06fe5ebe 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -109,7 +109,6 @@ def _compute_PI( for i in indices[start:stop]: Q = T_A[i : i + m] QT[:] = core._sliding_dot_product(Q, T_B) - # Update P[i] relative to all T[j : j + m] squared_distance_profile[:] = core._mass(Q, T_B, QT, μ_Q[i], σ_Q[i], M_T, Σ_T) squared_distance_profile[:] = np.square(squared_distance_profile) if excl_zone is not None: @@ -118,11 +117,13 @@ def _compute_PI( zone_stop = min(l, i + excl_zone) squared_distance_profile[zone_start : zone_stop + 1] = np.inf + # Update `P_squared[thread_idx, index, :]` with `squared_distance_profile[index]` + # Reminder(1): this `squared_distance_profile` is the (square of) distance profile # that corresponds to `S_i`, the subsequence with start index `i`. # Reminder(2): `P_squared[thread_idx, index, :]` should contain the (approx.) 
- # TopK distance between `S_idx` to its neighbors (in thread_idx). And, + # TopK distance between `S_index` to its neighbors (in thread_idx). And, # these distances are sorted ascendingly. so, `P_squared[thread_idx, index, 0]` # is smallest and `P_squared[thread_idx, index, -1]` is the largest in the array # `P_squared[thread_idx, index, :]` @@ -132,12 +133,18 @@ def _compute_PI( # from `S_idx` to one of its neighbors, `S_i`. If `d_squared` is less than # `P_squared[thread_idx, idx, -1]`, then that means the so-far-discovered TopK # for `S_idx` (i.e. `P_squared[thread_idx, idx, :]`) MUST be updated! + # Note that the matrix profile of indices in the trivial zone of `i` cannot + # be updated here since `squared_distance_profile` in those indices are + # set to inf. # note: further explanation! # `squared_distance_profile` (of `S_i`) is actually the `i`-th row of # Squared-Distance-Matrix. Its idx-th element (which is in idx-th column), # is `d_squared = squared_distance_profile[idx]`. If `d_squared < P_squared[thread_idx, idx, -1]`, - # then `P_squared[thread_idx, idx, :]` MUST be updated. + # it means this value (`d_squared`) can be in the TopK neighbors of `S_idx`. + # In other words, `d_squared` can be in TopK smallest values of `idx`-th COLUMN. (Recall + # that in SELF-JOIN we can use EITHER row OR column to find NearestNeighbors) + # Therefore, `P_squared[thread_idx, idx, :]` MUST be updated. IDX = np.flatnonzero( squared_distance_profile < P_squared[thread_idx, :, -1] ) @@ -147,16 +154,17 @@ def _compute_PI( core._shift_at_index_and_insert(P_squared[thread_idx, idx], pos, d_squared) core._shift_at_index_and_insert(I[thread_idx, idx], pos, i) - # find EXACT (not approx.) value of `P_squared[thread_idx, i, 0]` + # find EXACT (not approx.) value of `P_squared[thread_idx, i, 0]` to update + # matrix profile at index `i`. nn_of_i = np.argmin(squared_distance_profile) core._shift_at_index_and_insert( P_squared[thread_idx, i], 0, squared_distance_profile[nn_of_i] ) core._shift_at_index_and_insert(I[thread_idx, i], 0, nn_of_i) # [note] EXACT (not approx.) values of `P_squared[thread_idx, i, :]` - # (not just its 0-th element) can be found by doing something like - # `np.sort(squared_distance_profile)[:k]`. However, it can increase the - # computing time, and thus this is avoided here. + # (not just its 0-th element but ALL TopK) can be found by doing something like + # `np.sort(squared_distance_profile)[:k]`. However, this can increase the + # computing time, and thus this was avoided here. if P_squared[thread_idx, i, 0] == np.inf: # pragma: no cover I[thread_idx, i, 0] = -1 From 3efa744d3d706e80804917fda127d083d7b1a9fd Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 14 Jun 2022 09:16:02 -0600 Subject: [PATCH 179/416] Added comment to clarify the insertion indx idx-1 --- stumpy/stump.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/stumpy/stump.py b/stumpy/stump.py index f5a5fe811..e6e25c834 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -212,6 +212,12 @@ def _compute_diagonal( if T_B_subseq_isconstant[i + g] and T_A_subseq_isconstant[i]: pearson = 1.0 + # ρ[thread_idx, i, :] is sorted ascendingly. To update + # it, Its first element (i.e. the smallest value + # of array ρ[thread_idx, i]) MUST be discarded. Therefore, + # if the insertion index of new value in `ρ[thread_idx, i]` is idx, + # then, it should be substracted by 1 since the left of idx is shifted + # to the left. 
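# Illustration (hypothetical values): with k = 3 and ρ[thread_idx, i] equal to
# [0.1, 0.4, 0.7] (ascending), a new pearson of 0.5 gives
# np.searchsorted(ρ[thread_idx, i], 0.5) == 2; the smallest value 0.1 is
# dropped, 0.4 shifts left, and 0.5 is stored at idx - 1 == 1,
# yielding [0.4, 0.5, 0.7].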
if pearson > ρ[thread_idx, i, 0]: idx = np.searchsorted(ρ[thread_idx, i], pearson) ρ[thread_idx, i, : idx - 1] = ρ[thread_idx, i, 1:idx] From 7d680b5731007080c99b399dfb51321fad7798d1 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 14 Jun 2022 09:31:24 -0600 Subject: [PATCH 180/416] Choosed shorter name for function --- stumpy/core.py | 2 +- stumpy/scrump.py | 38 ++++++++++++-------------------------- tests/test_core.py | 4 ++-- 3 files changed, 15 insertions(+), 29 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index f7b8d4ee4..3a22ac92c 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2608,7 +2608,7 @@ def _merge_topk_PI(PA, PB, IA, IB): @njit -def _shift_at_index_and_insert(a, idx, v): +def _shift_insert_at_index(a, idx, v): """ Insert value `v` into array `a` at index `idx` (in place) and discard the last element (i.e. without changing the length of `a`) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index b06fe5ebe..571d83c6d 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -151,16 +151,14 @@ def _compute_PI( for idx in IDX: d_squared = squared_distance_profile[idx] pos = np.searchsorted(P_squared[thread_idx, idx], d_squared, side="right") - core._shift_at_index_and_insert(P_squared[thread_idx, idx], pos, d_squared) - core._shift_at_index_and_insert(I[thread_idx, idx], pos, i) + core._shift_insert_at_index(P_squared[thread_idx, idx], pos, d_squared) + core._shift_insert_at_index(I[thread_idx, idx], pos, i) # find EXACT (not approx.) value of `P_squared[thread_idx, i, 0]` to update # matrix profile at index `i`. nn_of_i = np.argmin(squared_distance_profile) - core._shift_at_index_and_insert( - P_squared[thread_idx, i], 0, squared_distance_profile[nn_of_i] - ) - core._shift_at_index_and_insert(I[thread_idx, i], 0, nn_of_i) + core._shift_insert_at_index(P_squared[thread_idx, i], 0, squared_distance_profile[nn_of_i]) + core._shift_insert_at_index(I[thread_idx, i], 0, nn_of_i) # [note] EXACT (not approx.) values of `P_squared[thread_idx, i, :]` # (not just its 0-th element but ALL TopK) can be found by doing something like # `np.sort(squared_distance_profile)[:k]`. 
However, this can increase the @@ -193,19 +191,15 @@ def _compute_PI( pos = np.searchsorted( P_squared[thread_idx, i + g], d_squared, side="right" ) - core._shift_at_index_and_insert( - P_squared[thread_idx, i + g], pos, d_squared - ) - core._shift_at_index_and_insert(I[thread_idx, i + g], pos, j + g) + core._shift_insert_at_index(P_squared[thread_idx, i + g], pos, d_squared) + core._shift_insert_at_index(I[thread_idx, i + g], pos, j + g) if d_squared < P_squared[thread_idx, j + g, -1]: pos = np.searchsorted( P_squared[thread_idx, j + g], d_squared, side="right" ) - core._shift_at_index_and_insert( - P_squared[thread_idx, j + g], pos, d_squared - ) - core._shift_at_index_and_insert(I[thread_idx, j + g], pos, i + g) + core._shift_insert_at_index(P_squared[thread_idx, j + g], pos, d_squared) + core._shift_insert_at_index(I[thread_idx, j + g], pos, i + g) QT_j = QT_j_prime for g in range(1, min(s, i + 1, j + 1)): @@ -222,23 +216,15 @@ def _compute_PI( pos = np.searchsorted( P_squared[thread_idx, i - g], d_squared, side="right" ) - core._shift_at_index_and_shift_at_index_and_insert( - P_squared[thread_idx, i - g], pos, d_squared - ) - core._shift_at_index_and_shift_at_index_and_insert( - I[thread_idx, i - g], pos, j - g - ) + core._shift_insert_at_index(P_squared[thread_idx, i - g], pos, d_squared) + core._shift_insert_at_index(I[thread_idx, i - g], pos, j - g) if d_squared < P_squared[thread_idx, j - g, -1]: pos = np.searchsorted( P_squared[thread_idx, j - g], d_squared, side="right" ) - core._shift_at_index_and_shift_at_index_and_insert( - P_squared[thread_idx, j - g], pos, d_squared - ) - core._shift_at_index_and_shift_at_index_and_insert( - I[thread_idx, j - g], pos, i - g - ) + core._shift_insert_at_index(P_squared[thread_idx, j - g], pos, d_squared) + core._shift_insert_at_index(I[thread_idx, j - g], pos, i - g) @njit( diff --git a/tests/test_core.py b/tests/test_core.py index 25416dbcd..b7ea76b8c 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1089,7 +1089,7 @@ def test_merge_topk_PI(): npt.assert_array_equal(ref_I, comp_I) -def test_shift_at_index_and_insert(): +def test_shift_insert_at_index(): for k in range(1, 6): a = np.random.rand(k) @@ -1100,6 +1100,6 @@ def test_shift_at_index_and_insert(): comp = a.copy() ref = np.insert(ref, idx, v)[:-1] - core._shift_at_index_and_insert(comp, idx, v) # update comp in place + core._shift_insert_at_index(comp, idx, v) # update comp in place npt.assert_array_equal(ref, comp) From 6c8ab788b8f567323c07f50fca128b6b69c6e2f7 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 14 Jun 2022 11:34:38 -0600 Subject: [PATCH 181/416] Fixed typos --- stumpy/scrump.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 571d83c6d..4911bb9af 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -303,12 +303,12 @@ def _prescrump( ------- out1 : numpy.ndarray The (top-k) Matrix profile. When k = 1 (default), the first and only column - consists of the matrix profile. When k > 1, the output has exacly k columns + consists of the matrix profile. When k > 1, the output has exactly k columns consist of the top-k matrix profile. out2 : numpy.ndarray The (top-k) Matrix profile. When k = 1 (default), the first and only column - consists of the matrix profile indices. When k > 1, the output has exacly + consists of the matrix profile indices. When k > 1, the output has exactly k columns consist of the top-k matrix profile indices. 
Notes @@ -391,12 +391,12 @@ def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0, k=1): ------- P : numpy.ndarray The (top-k) Matrix profile. When k = 1 (default), the first and only column - consists of the matrix profile. When k > 1, the output has exacly k columns + consists of the matrix profile. When k > 1, the output has exactly k columns consist of the top-k matrix profile. I : numpy.ndarray The (top-k) Matrix profile. When k = 1 (default), the first and only column - consists of the matrix profile indices. When k > 1, the output has exacly + consists of the matrix profile indices. When k > 1, the output has exactly k columns consist of the top-k matrix profile indices. Notes From 0d6011dd361224df56fbf19a0931c812d8b51f07 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 14 Jun 2022 11:35:01 -0600 Subject: [PATCH 182/416] Renamed variable to improve readability --- tests/naive.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 40b9d71e2..ef5a42d78 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -237,28 +237,28 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, row_wise=False, k=1): iter_range = range(-g, min(n_A - m + 1, n_B - m + 1 - g)) for i in iter_range: - D = distance_matrix[i, i + g] # D: a single element - if D < P[i, k - 1]: - idx = searchsorted_right(P[i], D) + d = distance_matrix[i, i + g] + if d < P[i, k - 1]: + idx = searchsorted_right(P[i], d) # to keep the top-k, we must get rid of the last element. - P[i, :k] = np.insert(P[i, :k], idx, D)[:-1] + P[i, :k] = np.insert(P[i, :k], idx, d)[:-1] I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] if ignore_trivial: # Self-joins only - if D < P[i + g, k - 1]: - idx = searchsorted_right(P[i + g], D) - P[i + g, :k] = np.insert(P[i + g, :k], idx, D)[:-1] + if d < P[i + g, k - 1]: + idx = searchsorted_right(P[i + g], d) + P[i + g, :k] = np.insert(P[i + g, :k], idx, d)[:-1] I[i + g, :k] = np.insert(I[i + g, :k], idx, i)[:-1] if i < i + g: # Left matrix profile and left matrix profile index - if D < P[i + g, k]: - P[i + g, k] = D + if d < P[i + g, k]: + P[i + g, k] = d I[i + g, k] = i - if D < P[i, k + 1]: + if d < P[i, k + 1]: # right matrix profile and right matrix profile index - P[i, k + 1] = D + P[i, k + 1] = d I[i, k + 1] = i + g result = np.empty((l, 2 * k + 2), dtype=object) From 5bf6fc9b1af4699c9d7ef6006ad82f0c69aeca0b Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 14 Jun 2022 12:07:20 -0600 Subject: [PATCH 183/416] Revised and Improved comments --- stumpy/scrump.py | 48 +++++++++++++----------------------------------- 1 file changed, 13 insertions(+), 35 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 4911bb9af..bc80e6585 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -112,39 +112,20 @@ def _compute_PI( squared_distance_profile[:] = core._mass(Q, T_B, QT, μ_Q[i], σ_Q[i], M_T, Σ_T) squared_distance_profile[:] = np.square(squared_distance_profile) if excl_zone is not None: - # self-join zone_start = max(0, i - excl_zone) zone_stop = min(l, i + excl_zone) squared_distance_profile[zone_start : zone_stop + 1] = np.inf - # Update `P_squared[thread_idx, index, :]` with `squared_distance_profile[index]` - - # Reminder(1): this `squared_distance_profile` is the (square of) distance profile - # that corresponds to `S_i`, the subsequence with start index `i`. - - # Reminder(2): `P_squared[thread_idx, index, :]` should contain the (approx.) - # TopK distance between `S_index` to its neighbors (in thread_idx). 
And, - # these distances are sorted ascendingly. so, `P_squared[thread_idx, index, 0]` - # is smallest and `P_squared[thread_idx, index, -1]` is the largest in the array - # `P_squared[thread_idx, index, :]` - - # The value `d_squared = squared_distance_profile[idx]` is the squared-distance - # between `S_i` and the `S_idx`. Therefore, `d_squared` is the squared_distance - # from `S_idx` to one of its neighbors, `S_i`. If `d_squared` is less than - # `P_squared[thread_idx, idx, -1]`, then that means the so-far-discovered TopK - # for `S_idx` (i.e. `P_squared[thread_idx, idx, :]`) MUST be updated! - # Note that the matrix profile of indices in the trivial zone of `i` cannot - # be updated here since `squared_distance_profile` in those indices are - # set to inf. - - # note: further explanation! - # `squared_distance_profile` (of `S_i`) is actually the `i`-th row of - # Squared-Distance-Matrix. Its idx-th element (which is in idx-th column), - # is `d_squared = squared_distance_profile[idx]`. If `d_squared < P_squared[thread_idx, idx, -1]`, - # it means this value (`d_squared`) can be in the TopK neighbors of `S_idx`. - # In other words, `d_squared` can be in TopK smallest values of `idx`-th COLUMN. (Recall - # that in SELF-JOIN we can use EITHER row OR column to find NearestNeighbors) - # Therefore, `P_squared[thread_idx, idx, :]` MUST be updated. + if excl_zone is not None: # self-join + # note: S_index = T[index: index + m] + # `v = squared_distance_profile[idx]` is (the square of) + # `dist(S_i, S_idx)`, which is the same as `dist(S_idx, S_i)`. So, + # `squared_distance_profile[idx]` is (the square of) distane from `S_idx` + # to one of its neighbors, `S_i`. Therefore, the value `v` can be used to + # update the TopK of `S_idx`, stored "ascendingly" in `P_squared[thread_idx, idx, :]`. + + # `P_squared[thread_idx, idx, :]` in inf for those `idx` that are in the trivial zone, + # including the `i` itself. So, those will not be updated here. IDX = np.flatnonzero( squared_distance_profile < P_squared[thread_idx, :, -1] ) @@ -154,19 +135,16 @@ def _compute_PI( core._shift_insert_at_index(P_squared[thread_idx, idx], pos, d_squared) core._shift_insert_at_index(I[thread_idx, idx], pos, i) - # find EXACT (not approx.) value of `P_squared[thread_idx, i, 0]` to update - # matrix profile at index `i`. + # find EXACT (not approx.) value of `P_squared[thread_idx, i, 0]` nn_of_i = np.argmin(squared_distance_profile) core._shift_insert_at_index(P_squared[thread_idx, i], 0, squared_distance_profile[nn_of_i]) core._shift_insert_at_index(I[thread_idx, i], 0, nn_of_i) - # [note] EXACT (not approx.) values of `P_squared[thread_idx, i, :]` - # (not just its 0-th element but ALL TopK) can be found by doing something like - # `np.sort(squared_distance_profile)[:k]`. However, this can increase the - # computing time, and thus this was avoided here. if P_squared[thread_idx, i, 0] == np.inf: # pragma: no cover I[thread_idx, i, 0] = -1 else: + # update P_squared[thread_idx, index, :] for those `index` that are + # in the vicinity of `i` or its 1NN, `j`. 
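# `j` below is the (current) nearest neighbor of `i`. Because the pairs
# (i + g, j + g) and (i - g, j - g) lie on the same diagonal of the distance
# matrix, the sliding dot product QT can be updated in O(1) per offset `g`
# instead of being recomputed, and the top-k arrays of those neighboring
# subsequences are refined with the resulting distances.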
j = I[thread_idx, i, 0] # Given the squared distance, work backwards and compute QT QT_j = (m - P_squared[thread_idx, i, 0] / 2.0) * (Σ_T[j] * σ_Q[i]) + ( From b978c70880281bfba9535ba0ff5d536c806b546e Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 14 Jun 2022 12:08:11 -0600 Subject: [PATCH 184/416] Corrected format --- stumpy/scrump.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index bc80e6585..db5417010 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -116,7 +116,7 @@ def _compute_PI( zone_stop = min(l, i + excl_zone) squared_distance_profile[zone_start : zone_stop + 1] = np.inf - if excl_zone is not None: # self-join + if excl_zone is not None: # self-join # note: S_index = T[index: index + m] # `v = squared_distance_profile[idx]` is (the square of) # `dist(S_i, S_idx)`, which is the same as `dist(S_idx, S_i)`. So, @@ -131,13 +131,17 @@ def _compute_PI( ) for idx in IDX: d_squared = squared_distance_profile[idx] - pos = np.searchsorted(P_squared[thread_idx, idx], d_squared, side="right") + pos = np.searchsorted( + P_squared[thread_idx, idx], d_squared, side="right" + ) core._shift_insert_at_index(P_squared[thread_idx, idx], pos, d_squared) core._shift_insert_at_index(I[thread_idx, idx], pos, i) # find EXACT (not approx.) value of `P_squared[thread_idx, i, 0]` nn_of_i = np.argmin(squared_distance_profile) - core._shift_insert_at_index(P_squared[thread_idx, i], 0, squared_distance_profile[nn_of_i]) + core._shift_insert_at_index( + P_squared[thread_idx, i], 0, squared_distance_profile[nn_of_i] + ) core._shift_insert_at_index(I[thread_idx, i], 0, nn_of_i) if P_squared[thread_idx, i, 0] == np.inf: # pragma: no cover @@ -169,14 +173,18 @@ def _compute_PI( pos = np.searchsorted( P_squared[thread_idx, i + g], d_squared, side="right" ) - core._shift_insert_at_index(P_squared[thread_idx, i + g], pos, d_squared) + core._shift_insert_at_index( + P_squared[thread_idx, i + g], pos, d_squared + ) core._shift_insert_at_index(I[thread_idx, i + g], pos, j + g) if d_squared < P_squared[thread_idx, j + g, -1]: pos = np.searchsorted( P_squared[thread_idx, j + g], d_squared, side="right" ) - core._shift_insert_at_index(P_squared[thread_idx, j + g], pos, d_squared) + core._shift_insert_at_index( + P_squared[thread_idx, j + g], pos, d_squared + ) core._shift_insert_at_index(I[thread_idx, j + g], pos, i + g) QT_j = QT_j_prime @@ -194,14 +202,18 @@ def _compute_PI( pos = np.searchsorted( P_squared[thread_idx, i - g], d_squared, side="right" ) - core._shift_insert_at_index(P_squared[thread_idx, i - g], pos, d_squared) + core._shift_insert_at_index( + P_squared[thread_idx, i - g], pos, d_squared + ) core._shift_insert_at_index(I[thread_idx, i - g], pos, j - g) if d_squared < P_squared[thread_idx, j - g, -1]: pos = np.searchsorted( P_squared[thread_idx, j - g], d_squared, side="right" ) - core._shift_insert_at_index(P_squared[thread_idx, j - g], pos, d_squared) + core._shift_insert_at_index( + P_squared[thread_idx, j - g], pos, d_squared + ) core._shift_insert_at_index(I[thread_idx, j - g], pos, i - g) From fa33084098032052fff762196d2658b49f1e44a7 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 14 Jun 2022 12:10:23 -0600 Subject: [PATCH 185/416] Corrected style --- stumpy/scrump.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index db5417010..0c9da42ea 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -121,11 +121,12 @@ def _compute_PI( # `v = 
squared_distance_profile[idx]` is (the square of) # `dist(S_i, S_idx)`, which is the same as `dist(S_idx, S_i)`. So, # `squared_distance_profile[idx]` is (the square of) distane from `S_idx` - # to one of its neighbors, `S_i`. Therefore, the value `v` can be used to - # update the TopK of `S_idx`, stored "ascendingly" in `P_squared[thread_idx, idx, :]`. + # to one of its neighbors, `S_i`. Therefore, the value `v` can be + # used to update the TopK of `S_idx`, stored "ascendingly" in + # `P_squared[thread_idx, idx, :]`. - # `P_squared[thread_idx, idx, :]` in inf for those `idx` that are in the trivial zone, - # including the `i` itself. So, those will not be updated here. + # `P_squared[thread_idx, idx, :]` in inf for those `idx` that are in + # the trivial zone, including the `i` itself. Those are not updated here. IDX = np.flatnonzero( squared_distance_profile < P_squared[thread_idx, :, -1] ) From dab7c47c1823e7e3dfa128c79213d8c225a011f0 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 14 Jun 2022 12:54:36 -0600 Subject: [PATCH 186/416] Enhanced naive scrump to return TopK matrix profile --- tests/naive.py | 54 ++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index ef5a42d78..cf5c2fa31 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1456,7 +1456,7 @@ def prescrump(T_A, m, T_B, s, exclusion_zone=None, k=1): return P, I -def scrump(T_A, m, T_B, percentage, exclusion_zone, pre_scrump, s): +def scrump(T_A, m, T_B, percentage, exclusion_zone, pre_scrump, s, k=1): dist_matrix = distance_matrix(T_A, T_B, m) n_A = T_A.shape[0] @@ -1478,42 +1478,40 @@ def scrump(T_A, m, T_B, percentage, exclusion_zone, pre_scrump, s): diags_ranges_start = diags_ranges[0, 0] diags_ranges_stop = diags_ranges[0, 1] - out = np.full((l, 4), np.inf, dtype=object) - out[:, 1:] = -1 - left_P = np.full(l, np.inf, dtype=np.float64) - right_P = np.full(l, np.inf, dtype=np.float64) + P = np.full((l, k + 2), np.inf, dtype=np.float64) # Topk + left/ right + I = np.full((l, k + 2), -1, dtype=np.int64) # Topk + left/ right for diag_idx in range(diags_ranges_start, diags_ranges_stop): - k = diags[diag_idx] + g = diags[diag_idx] for i in range(n_A - m + 1): for j in range(n_B - m + 1): - if j - i == k: - if dist_matrix[i, j] < out[i, 0]: - out[i, 0] = dist_matrix[i, j] - out[i, 1] = i + k - - if exclusion_zone is not None and dist_matrix[i, j] < out[i + k, 0]: - out[i + k, 0] = dist_matrix[i, j] - out[i + k, 1] = i + if j - i == g: + d = dist_matrix[i, j] + if d < P[i, k - 1]: + # update TopK of P[i] + idx = searchsorted_right(P[i], d) + P[i, :k] = np.insert(P[i, :k], idx, d)[:-1] + I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] + + if exclusion_zone is not None and d < P[i + g, k - 1]: + idx = searchsorted_right(P[i + g], d) + P[i + g, :k] = np.insert(P[i + g, :k], idx, d)[:-1] + I[i + g, :k] = np.insert(I[i + g, :k], idx, i)[:-1] # left matrix profile and left matrix profile indices - if ( - exclusion_zone is not None - and i < i + k - and dist_matrix[i, j] < left_P[i + k] - ): - left_P[i + k] = dist_matrix[i, j] - out[i + k, 2] = i + if exclusion_zone is not None and i < i + g and d < P[i + g, k]: + P[i + g, k] = d + I[i + g, k] = i # right matrix profile and right matrix profile indices - if ( - exclusion_zone is not None - and i + k > i - and dist_matrix[i, j] < right_P[i] - ): - right_P[i] = dist_matrix[i, j] - out[i, 3] = i + k + if exclusion_zone is not None and i + g > i and d < P[i, k + 1]: + P[i, k + 1] = d + I[i, k + 
1] = i + g + + out = np.empty((l, 2 * k + 2), dtype=object) + out[:, :k] = P[:, :k] + out[:, k:] = I return out From ed30ea0d35a7f8b45ab0d284128fce0ed62dcdec Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 14 Jun 2022 12:55:45 -0600 Subject: [PATCH 187/416] Added new test function --- tests/test_scrump.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tests/test_scrump.py b/tests/test_scrump.py index 0989010ce..1d581e2c5 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -726,3 +726,43 @@ def test_prescrump_A_B_join_KNN(T_A, T_B): npt.assert_almost_equal(ref_P, comp_P) npt.assert_almost_equal(ref_I, comp_I) + + +@pytest.mark.parametrize("T_A, T_B", test_data) +@pytest.mark.parametrize("percentages", percentages) +def test_scrump_self_join_KNN(T_A, T_B, percentages): + m = 3 + zone = int(np.ceil(m / 4)) + + for k in range(2, 4): + for percentage in percentages: + seed = np.random.randint(100000) + + np.random.seed(seed) + ref_mp = naive.scrump(T_B, m, T_B, percentage, zone, False, None, k=k) + ref_P = ref_mp[:, 0] + ref_I = ref_mp[:, 1] + ref_left_I = ref_mp[:, 2] + ref_right_I = ref_mp[:, 3] + + np.random.seed(seed) + approx = scrump( + T_B, + m, + ignore_trivial=True, + percentage=percentage, + pre_scrump=False, + k=k, + ) + approx.update() + comp_P = approx.P_ + comp_I = approx.I_ + comp_left_I = approx.left_I_ + comp_right_I = approx.right_I_ + + naive.replace_inf(ref_P) + naive.replace_inf(comp_P) + npt.assert_almost_equal(ref_P, comp_P) + npt.assert_almost_equal(ref_I, comp_I) + npt.assert_almost_equal(ref_left_I, comp_left_I) + npt.assert_almost_equal(ref_right_I, comp_right_I) From 3fa9f54272ca6c272688b1bf40437892cdf212ed Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 14 Jun 2022 13:15:06 -0600 Subject: [PATCH 188/416] Enhanced scrump to return TopK matrix profile --- stumpy/scrump.py | 89 ++++++++++++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 36 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 0c9da42ea..2388676bd 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -481,6 +481,11 @@ class scrump: The p-norm to apply for computing the Minkowski distance. This parameter is ignored when `normalize == True`. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. + Attributes ---------- P_ : numpy.ndarray @@ -544,6 +549,7 @@ def __init__( s=None, normalize=True, p=2.0, + k=1, ): """ Initialize the `scrump` object @@ -586,6 +592,11 @@ def __init__( p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. This parameter is ignored when `normalize == True`. + + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix + profile. Note that this will increase the total computational time and + memory usage when k > 1. 
""" self._ignore_trivial = ignore_trivial @@ -642,11 +653,15 @@ def __init__( self._n_A = self._T_A.shape[0] self._n_B = self._T_B.shape[0] self._l = self._n_A - self._m + 1 + self._k = k - self._P = np.empty((self._l, 3), dtype=np.float64) - self._I = np.empty((self._l, 3), dtype=np.int64) - self._P[:, :] = np.inf - self._I[:, :] = -1 + self._P = np.full((self._l, self._k), np.inf, dtype=np.float64) + self._PL = np.full(self._l, np.inf, dtype=np.float64) + self._PR = np.full(self._l, np.inf, dtype=np.float64) + + self._I = np.full((self._l, self._k), -1, dtype=np.int64) + self._IL = np.full(self._l, -1, dtype=np.int64) + self._IR = np.full(self._l, -1, dtype=np.int64) self._excl_zone = int(np.ceil(self._m / config.STUMPY_EXCL_ZONE_DENOM)) @@ -655,13 +670,11 @@ def __init__( if pre_scrump: if self._ignore_trivial: - P, I = prescrump(T_A, m, s=s) + P, I = prescrump(T_A, m, s=s, k=k) else: - P, I = prescrump(T_A, m, T_B=T_B, s=s) - for i in range(P.shape[0]): - if self._P[i, 0] > P[i]: - self._P[i, 0] = P[i] - self._I[i, 0] = I[i] + P, I = prescrump(T_A, m, T_B=T_B, s=s, k=k) + + core._merge_topk_PI(self._P, P, self._I, I) if self._ignore_trivial: self._diags = np.random.permutation( @@ -692,9 +705,9 @@ def __init__( def update(self): """ - Update the matrix profile and the matrix profile indices by computing - additional new distances (limited by `percentage`) that make up the full - distance matrix. + Update the (top-k) matrix profile and the (top-k) matrix profile indices by + computing additional new distances (limited by `percentage`) that make up + the full distance matrix. """ if self._chunk_idx < self._n_chunks: start_idx, stop_idx = self._chunk_diags_ranges[self._chunk_idx] @@ -715,52 +728,56 @@ def update(self): self._T_B_subseq_isconstant, self._diags[start_idx:stop_idx], self._ignore_trivial, - 1, # revise module to accept parameter k for top-k matrix profile + self._k, ) - P = np.column_stack((P, PL, PR)) - I = np.column_stack((I, IL, IR)) - - # Update matrix profile and indices - for i in range(self._P.shape[0]): - if self._P[i, 0] > P[i, 0]: - self._P[i, 0] = P[i, 0] - self._I[i, 0] = I[i, 0] - # left matrix profile and left matrix profile indices - if self._P[i, 1] > P[i, 1]: - self._P[i, 1] = P[i, 1] - self._I[i, 1] = I[i, 1] - # right matrix profile and right matrix profile indices - if self._P[i, 2] > P[i, 2]: - self._P[i, 2] = P[i, 2] - self._I[i, 2] = I[i, 2] + # Update (top-k) matrix profile and indices + core._merge_topk_PI(self._P, P, self._I, I) + + # update left matrix profile and indices + cond = PL < self._PL + self._PL = np.where(cond, PL, self._PL) + self._IL = np.where(cond, IL, self._IL) + + # update right matrix profile and indices + cond = PR < self._PR + self._PR = np.where(cond, PR, self._PR) + self._IR = np.where(cond, IR, self._IR) self._chunk_idx += 1 @property def P_(self): """ - Get the updated matrix profile + Get the updated (top-k) matrix profile. When `k=1`, it is a 1d array. + When `k>1`, it is a 2d array with exactly k columns consist of (top-k) matrix + profile. """ - return self._P[:, 0].astype(np.float64) + if self._k == 1: + return self._P.reshape((self._P.shape[0],)).astype(np.float64) + return self._P.astype(np.float64) @property def I_(self): """ - Get the updated matrix profile indices + Get the updated (top-k) matrix profile indices. When `k=1`, it is a 1d array. + When `k>1`, it is a 2d array with exactly k columns consist of (top-k) matrix + profile indices. 
""" - return self._I[:, 0].astype(np.int64) + if self._k == 1: + return self._I.reshape((self._I.shape[0],)).astype(np.int64) + return self._I.astype(np.int64) @property def left_I_(self): """ Get the updated left matrix profile indices """ - return self._I[:, 1].astype(np.int64) + return self._IL.astype(np.int64) @property def right_I_(self): """ Get the updated right matrix profile indices """ - return self._I[:, 2].astype(np.int64) + return self._IR.astype(np.int64) From c282f2c0df809e312c04ea6ef0d5f76d61294765 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 14 Jun 2022 13:19:21 -0600 Subject: [PATCH 189/416] Fixed test function --- tests/test_scrump.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_scrump.py b/tests/test_scrump.py index 1d581e2c5..206f432e3 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -740,10 +740,10 @@ def test_scrump_self_join_KNN(T_A, T_B, percentages): np.random.seed(seed) ref_mp = naive.scrump(T_B, m, T_B, percentage, zone, False, None, k=k) - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] - ref_left_I = ref_mp[:, 2] - ref_right_I = ref_mp[:, 3] + ref_P = ref_mp[:, :k] + ref_I = ref_mp[:, k : 2 * k] + ref_left_I = ref_mp[:, 2 * k] + ref_right_I = ref_mp[:, 2 * k + 1] np.random.seed(seed) approx = scrump( From 380cf1d5c29ea36a42742aafb883fb1cfff37ec8 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 14 Jun 2022 13:55:02 -0600 Subject: [PATCH 190/416] Temporarily added parameter k to scraamp to pass non_normalized tests --- stumpy/scraamp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/stumpy/scraamp.py b/stumpy/scraamp.py index 44a8b7c83..a6f26453c 100644 --- a/stumpy/scraamp.py +++ b/stumpy/scraamp.py @@ -423,6 +423,7 @@ def __init__( pre_scraamp=False, s=None, p=2.0, + k=1, # this function needs to be modified for top-k ): """ Initialize the `scraamp` object From 4ec3c5a6630ffcb8af9a226ebfde248dd5b8c6b8 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 14 Jun 2022 14:00:40 -0600 Subject: [PATCH 191/416] Added test function to test TopK scrump in AB_join --- tests/test_scrump.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tests/test_scrump.py b/tests/test_scrump.py index 206f432e3..84d38d50c 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -766,3 +766,43 @@ def test_scrump_self_join_KNN(T_A, T_B, percentages): npt.assert_almost_equal(ref_I, comp_I) npt.assert_almost_equal(ref_left_I, comp_left_I) npt.assert_almost_equal(ref_right_I, comp_right_I) + + +@pytest.mark.parametrize("T_A, T_B", test_data) +@pytest.mark.parametrize("percentages", percentages) +def test_scrump_A_B_join_KNN(T_A, T_B, percentages): + m = 3 + for k in range(2, 4): + for percentage in percentages: + seed = np.random.randint(100000) + + np.random.seed(seed) + ref_mp = naive.scrump(T_A, m, T_B, percentage, None, False, None, k=k) + ref_P = ref_mp[:, :k] + ref_I = ref_mp[:, k : 2 * k] + ref_left_I = ref_mp[:, 2 * k] + ref_right_I = ref_mp[:, 2 * k + 1] + + np.random.seed(seed) + approx = scrump( + T_A, + m, + T_B, + ignore_trivial=False, + percentage=percentage, + pre_scrump=False, + k=k, + ) + approx.update() + comp_P = approx.P_ + comp_I = approx.I_ + comp_left_I = approx.left_I_ + comp_right_I = approx.right_I_ + + naive.replace_inf(ref_P) + naive.replace_inf(comp_P) + + npt.assert_almost_equal(ref_P, comp_P) + npt.assert_almost_equal(ref_I, comp_I) + npt.assert_almost_equal(ref_left_I, comp_left_I) + npt.assert_almost_equal(ref_right_I, comp_right_I) From 
40132d4c9a3bd70988b7b08b46b2467bff81d9b4 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 16 Jun 2022 15:36:05 -0600 Subject: [PATCH 192/416] Refactored --- stumpy/core.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 3a22ac92c..9bba9d2a5 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2596,12 +2596,8 @@ def _merge_topk_PI(PA, PB, IA, IB): if PB[i, j] < PA[i, -1]: idx = np.searchsorted(PA[i, start:stop], PB[i, j], side="right") + start - for g in range(PB.shape[1] - 1, idx, -1): - PA[i, g] = PA[i, g - 1] - IA[i, g] = IA[i, g - 1] - - PA[i, idx] = PB[i, j] - IA[i, idx] = IB[i, j] + _shift_insert_at_index(PA[i], idx, PB[i, j]) + _shift_insert_at_index(IA[i], idx, IB[i, j]) start = idx stop += 1 # because of shifting elements to the right by one From b0132ca5919e4180fd7388fd80037b8faec77eb8 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 16 Jun 2022 15:45:13 -0600 Subject: [PATCH 193/416] Added definition of parameter k to docstring --- stumpy/scraamp.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/stumpy/scraamp.py b/stumpy/scraamp.py index a6f26453c..caeca4ad3 100644 --- a/stumpy/scraamp.py +++ b/stumpy/scraamp.py @@ -388,6 +388,11 @@ class scraamp: p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. + Attributes ---------- P_ : numpy.ndarray @@ -460,6 +465,11 @@ def __init__( p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. + + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. """ self._ignore_trivial = ignore_trivial self._p = p From fdfdf07d74f9d9145d1dfcc1242203f3d437a2f2 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 16 Jun 2022 15:48:23 -0600 Subject: [PATCH 194/416] Improved docstring --- stumpy/scrump.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 2388676bd..aec047012 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -293,14 +293,14 @@ def _prescrump( Returns ------- out1 : numpy.ndarray - The (top-k) Matrix profile. When k = 1 (default), the first and only column - consists of the matrix profile. When k > 1, the output has exactly k columns - consist of the top-k matrix profile. + The (top-k) Matrix profile. When k=1 (default), the first (and only) column + in this 2D array consists of the matrix profile. When k > 1, the output + has exactly k columns consist of the top-k matrix profile. out2 : numpy.ndarray - The (top-k) Matrix profile. When k = 1 (default), the first and only column - consists of the matrix profile indices. When k > 1, the output has exactly - k columns consist of the top-k matrix profile indices. + The (top-k) Matrix profile indices. When k=1 (default), the first (and only) + column in this 2D array consists of the matrix profile indices. When k > 1, + the output has exactly k columns consist of the top-k matrix profile. 
Notes ----- From 7d9c76a8293b83ad4ebacbdc2f2c3da192625200 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 16 Jun 2022 15:57:57 -0600 Subject: [PATCH 195/416] Removed trailing colon --- stumpy/stump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index e6e25c834..901c1afe8 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -515,7 +515,7 @@ def _stump( PL = np.sqrt(p_norm_L) PR = np.sqrt(p_norm_R) - return P, PL, PR, I, IL[0, :], IR[0, :] + return P, PL, PR, I, IL[0], IR[0] @core.non_normalized(aamp) From 26749889dcdb44e01ebbebdc8ce0bdda83cf9f04 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 16 Jun 2022 15:59:45 -0600 Subject: [PATCH 196/416] Cleaned code --- stumpy/stumped.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index fba8947ae..dc2978318 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -264,9 +264,6 @@ def stumped( ) ) - profile = np.empty((l, 2 * k)) - indices = np.empty((l, 2 * k)) - results = dask_client.gather(futures) profile, profile_L, profile_R, indices, indices_L, indices_R = results[0] From a1855a05fe14f6a6d1838340bf9eb7a9b37c51b6 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 16 Jun 2022 16:29:05 -0600 Subject: [PATCH 197/416] Avoided allocating new memory in inner for-loop --- tests/test_core.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index b7ea76b8c..993f11afe 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1092,12 +1092,14 @@ def test_merge_topk_PI(): def test_shift_insert_at_index(): for k in range(1, 6): a = np.random.rand(k) + ref = np.empty(k, dtype=np.float64) + comp = np.empty(k, dtype=np.float64) indices = np.arange(k + 1) values = np.random.rand(k + 1) for (idx, v) in zip(indices, values): - ref = a.copy() - comp = a.copy() + ref[:] = a + comp[:] = a ref = np.insert(ref, idx, v)[:-1] core._shift_insert_at_index(comp, idx, v) # update comp in place From 5b561ffb2fd1d0b4de81b05ec60fdf8d251e283e Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 16 Jun 2022 16:30:38 -0600 Subject: [PATCH 198/416] Fixed typos --- stumpy/scrump.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index aec047012..441268fa4 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -120,12 +120,12 @@ def _compute_PI( # note: S_index = T[index: index + m] # `v = squared_distance_profile[idx]` is (the square of) # `dist(S_i, S_idx)`, which is the same as `dist(S_idx, S_i)`. So, - # `squared_distance_profile[idx]` is (the square of) distane from `S_idx` + # `squared_distance_profile[idx]` is (the square of) distance from `S_idx` # to one of its neighbors, `S_i`. Therefore, the value `v` can be # used to update the TopK of `S_idx`, stored "ascendingly" in # `P_squared[thread_idx, idx, :]`. - # `P_squared[thread_idx, idx, :]` in inf for those `idx` that are in + # `P_squared[thread_idx, idx, :]` is inf for those `idx` that are in # the trivial zone, including the `i` itself. Those are not updated here. 
IDX = np.flatnonzero( squared_distance_profile < P_squared[thread_idx, :, -1] From 3d02bf447eb86cb80fe5970099fa3f4316a54a24 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 16 Jun 2022 18:41:30 -0600 Subject: [PATCH 199/416] Improved comments --- stumpy/scrump.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 441268fa4..050e610ce 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -116,17 +116,12 @@ def _compute_PI( zone_stop = min(l, i + excl_zone) squared_distance_profile[zone_start : zone_stop + 1] = np.inf - if excl_zone is not None: # self-join - # note: S_index = T[index: index + m] - # `v = squared_distance_profile[idx]` is (the square of) - # `dist(S_i, S_idx)`, which is the same as `dist(S_idx, S_i)`. So, - # `squared_distance_profile[idx]` is (the square of) distance from `S_idx` - # to one of its neighbors, `S_i`. Therefore, the value `v` can be - # used to update the TopK of `S_idx`, stored "ascendingly" in - # `P_squared[thread_idx, idx, :]`. - - # `P_squared[thread_idx, idx, :]` is inf for those `idx` that are in - # the trivial zone, including the `i` itself. Those are not updated here. + if excl_zone is not None: + # Note that the squared distance, `squared_distance_profile[j]`, + # between subsequences `S_i = T[i : i + m]` and `S_j = T[j : j + m]` + # can be used to update the top-k for BOTH subsequence `i` and + # subsequence `j`. We update the latter here. + IDX = np.flatnonzero( squared_distance_profile < P_squared[thread_idx, :, -1] ) @@ -148,14 +143,14 @@ def _compute_PI( if P_squared[thread_idx, i, 0] == np.inf: # pragma: no cover I[thread_idx, i, 0] = -1 else: - # update P_squared[thread_idx, index, :] for those `index` that are - # in the vicinity of `i` or its 1NN, `j`. j = I[thread_idx, i, 0] # Given the squared distance, work backwards and compute QT QT_j = (m - P_squared[thread_idx, i, 0] / 2.0) * (Σ_T[j] * σ_Q[i]) + ( m * M_T[j] * μ_Q[i] ) QT_j_prime = QT_j + # Update Top-k of BOTH subsequences at i+g and j+g (i.e. left neighbor of i, j), + # by using the distance between `S_(i+g)` and `S_(j+g)` for g in range(1, min(s, l - max(i, j))): QT_j = ( QT_j @@ -189,6 +184,8 @@ def _compute_PI( core._shift_insert_at_index(I[thread_idx, j + g], pos, i + g) QT_j = QT_j_prime + # Update Top-k of BOTH subsequences at i-g and j-g (i.e. 
left neighbor of i, j), + # by using the distance between `S_(i-g)` and `S_(j-g)` for g in range(1, min(s, i + 1, j + 1)): QT_j = QT_j - T_B[i - g + m] * T_A[j - g + m] + T_B[i - g] * T_A[j - g] d_squared = core._calculate_squared_distance( From 551d2233554ef660e17e01e25bf7a5e04469c64e Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 16 Jun 2022 20:17:39 -0600 Subject: [PATCH 200/416] Avoided allocating new memory in each iteration --- tests/test_gpu_stump.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index ef8c03c1d..4d3093c99 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -55,6 +55,7 @@ def _gpu_searchsorted_kernel(A, V, bfs, nlevel, is_left, IDX): @pytest.mark.filterwarnings("ignore", category=NumbaPerformanceWarning) def test_gpu_searchsorted(): n = 3 * config.STUMPY_THREADS_PER_BLOCK + 1 + V = np.empty(n, dtype=np.float64) threads_per_block = config.STUMPY_THREADS_PER_BLOCK blocks_per_grid = math.ceil(n / threads_per_block) @@ -66,7 +67,7 @@ def test_gpu_searchsorted(): A = np.sort(np.random.rand(n, k), axis=1) device_A = cuda.to_device(A) - V = np.random.rand(n) + V[:] = np.random.rand(n) for i, idx in enumerate(np.random.choice(np.arange(n), size=k, replace=False)): V[idx] = A[idx, i] # create ties device_V = cuda.to_device(V) From 0de3a2812ec146776f8f7b2f43f1de67b2031758 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 16 Jun 2022 20:27:10 -0600 Subject: [PATCH 201/416] Same ndim in output regardless of value of k --- stumpy/scrump.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 050e610ce..89aecba9e 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -746,23 +746,20 @@ def update(self): @property def P_(self): """ - Get the updated (top-k) matrix profile. When `k=1`, it is a 1d array. - When `k>1`, it is a 2d array with exactly k columns consist of (top-k) matrix - profile. + Get the updated (top-k) matrix profile. When k=1 (default), the first (and only) + column in this 2D array consists of the matrix profile. When k > 1, the output + has exactly k columns consist of the top-k matrix profile. """ - if self._k == 1: - return self._P.reshape((self._P.shape[0],)).astype(np.float64) return self._P.astype(np.float64) @property def I_(self): """ - Get the updated (top-k) matrix profile indices. When `k=1`, it is a 1d array. - When `k>1`, it is a 2d array with exactly k columns consist of (top-k) matrix - profile indices. + Get the updated (top-k) matrix profile indices. When k=1 (default), the + first (and only) column in this 2D array consists of the matrix profile + indices. When k > 1, the output has exactly k columns consist of the top-k + matrix profile indices. 
""" - if self._k == 1: - return self._I.reshape((self._I.shape[0],)).astype(np.int64) return self._I.astype(np.int64) @property From d1e95f6465ff943cad939f55aee70fe7a35a4e2c Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 16 Jun 2022 20:35:34 -0600 Subject: [PATCH 202/416] Revised docstrings --- stumpy/scrump.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 89aecba9e..95b19e18f 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -486,19 +486,26 @@ class scrump: Attributes ---------- P_ : numpy.ndarray - The updated matrix profile + The updated (top-k) matrix profile I_ : numpy.ndarray - The updated matrix profile indices + The updated (top-k) matrix profile indices + + left_I_ : numpy.ndarray + The updated left (top-1) matrix profile indices + + right_I_ : numpy.ndarray + The updated right (top-1) matrix profile indices + Methods ------- update() Update the matrix profile and the matrix profile indices by computing additional new distances (limited by `percentage`) that make up the full - distance matrix. Each output contains three columns that correspond to - the matrix profile, the left matrix profile, and the right matrix profile, - respectively. + distance matrix. The outputs are (top-k) matrix profile, (top-1) left + matrix profile, (top-1) right matrix profile, (top-k) matrix profile indices, + (top-1) left matrix profile indices, (top-1) right matrix profile indices. See Also -------- @@ -765,13 +772,13 @@ def I_(self): @property def left_I_(self): """ - Get the updated left matrix profile indices + Get the updated left (top-1) matrix profile indices """ return self._IL.astype(np.int64) @property def right_I_(self): """ - Get the updated right matrix profile indices + Get the updated right (top-1) matrix profile indices """ return self._IR.astype(np.int64) From bfc4c8eae348136c0f21b81ce5496ef77dab8abb Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 16 Jun 2022 21:16:29 -0600 Subject: [PATCH 203/416] Enhanced function to perform shift left as well --- stumpy/core.py | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 9bba9d2a5..051155d51 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2604,27 +2604,48 @@ def _merge_topk_PI(PA, PB, IA, IB): @njit -def _shift_insert_at_index(a, idx, v): +def _shift_insert_at_index(a, idx, v, shift=1): """ Insert value `v` into array `a` at index `idx` (in place) and discard - the last element (i.e. without changing the length of `a`) + the last element (i.e. without changing the length of `a`) when `shift=1` (default). + When `shift=-1`, the first element will be discarded instead. + + Note + ---- + No check is performed to ensure the value of parameter `shift` is 1 or -1. + It is user's responsibility to provide a valid value for this parameter. Parameters ---------- a: numpy.ndarray - a 1d array + A 1d array idx: int - the index at which the value `v` should be inserted. This can be any - integer number from `0` to `len(a) - 1` + The index at which the value `v` should be inserted. This can be any + integer number from `0` to `len(a) - 1`. v: float - the value that should be inserted into array `a` at index `idx` + The value that should be inserted into array `a` at index `idx` + + shift: int, default 1 + The value 1 (default) indicates discarding the last element after inserting + value `v` at index `idx`. 
The other value, -1, indicates discarding the first + element after inserting value `v` at index `idx` Returns ------- None """ - if idx < len(a): - a[idx + 1 :] = a[idx:-1] - a[idx] = v + if shift == 1: + if 0 <= idx < len(a): + a[idx + 1 :] = a[idx:-1] + a[idx] = v + + elif shift == -1: + if 0 < idx <= len(a): + a[: idx - 1] = a[1 : idx] + # elements were shifted to left, and thus the insertion becomes `idx-1` + a[idx - 1] = v + + else: + pass From bbcb71f31ded82981171226229f90eab01df6ae9 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 16 Jun 2022 21:20:24 -0600 Subject: [PATCH 204/416] Enhanced test function to test newly added functionality --- tests/test_core.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/test_core.py b/tests/test_core.py index 993f11afe..65b25cdfc 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1097,11 +1097,24 @@ def test_shift_insert_at_index(): indices = np.arange(k + 1) values = np.random.rand(k + 1) + + # test shift = 1 for (idx, v) in zip(indices, values): ref[:] = a comp[:] = a ref = np.insert(ref, idx, v)[:-1] - core._shift_insert_at_index(comp, idx, v) # update comp in place + core._shift_insert_at_index(comp, idx, v, shift=1) # update comp in place + + npt.assert_array_equal(ref, comp) + + + # test shift = -1 + for (idx, v) in zip(indices, values): + ref[:] = a + comp[:] = a + + ref = np.insert(ref, idx, v)[1:] + core._shift_insert_at_index(comp, idx, v, shift=-1) # update comp in place npt.assert_array_equal(ref, comp) From ec889b476196a5b9ad23848978bd55334fc6a8ab Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 16 Jun 2022 21:21:44 -0600 Subject: [PATCH 205/416] Fixed format --- stumpy/core.py | 2 +- tests/test_core.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 051155d51..b4b102e83 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2643,7 +2643,7 @@ def _shift_insert_at_index(a, idx, v, shift=1): elif shift == -1: if 0 < idx <= len(a): - a[: idx - 1] = a[1 : idx] + a[: idx - 1] = a[1:idx] # elements were shifted to left, and thus the insertion becomes `idx-1` a[idx - 1] = v diff --git a/tests/test_core.py b/tests/test_core.py index 65b25cdfc..e3854b889 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1108,7 +1108,6 @@ def test_shift_insert_at_index(): npt.assert_array_equal(ref, comp) - # test shift = -1 for (idx, v) in zip(indices, values): ref[:] = a From f7ef962a66ee794cb502a582eba7f55a3d6a0ca4 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 16 Jun 2022 21:25:33 -0600 Subject: [PATCH 206/416] Fixed format --- stumpy/scraamp.py | 6 +++--- stumpy/scrump.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/stumpy/scraamp.py b/stumpy/scraamp.py index caeca4ad3..a0891bf22 100644 --- a/stumpy/scraamp.py +++ b/stumpy/scraamp.py @@ -467,9 +467,9 @@ def __init__( The p-norm to apply for computing the Minkowski distance. k : int, default 1 - The number of top `k` smallest distances used to construct the matrix profile. - Note that this will increase the total computational time and memory usage - when k > 1. + The number of top `k` smallest distances used to construct the matrix + profile. Note that this will increase the total computational time and + memory usage when k > 1. 
""" self._ignore_trivial = ignore_trivial self._p = p diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 95b19e18f..bb42b33fa 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -149,8 +149,8 @@ def _compute_PI( m * M_T[j] * μ_Q[i] ) QT_j_prime = QT_j - # Update Top-k of BOTH subsequences at i+g and j+g (i.e. left neighbor of i, j), - # by using the distance between `S_(i+g)` and `S_(j+g)` + # Update Top-k of BOTH subsequences at i+g and j+g (i.e. left neighbor + # of i, j), by using the distance between `S_(i+g)` and `S_(j+g)` for g in range(1, min(s, l - max(i, j))): QT_j = ( QT_j @@ -184,8 +184,8 @@ def _compute_PI( core._shift_insert_at_index(I[thread_idx, j + g], pos, i + g) QT_j = QT_j_prime - # Update Top-k of BOTH subsequences at i-g and j-g (i.e. left neighbor of i, j), - # by using the distance between `S_(i-g)` and `S_(j-g)` + # Update Top-k of BOTH subsequences at i-g and j-g (i.e. left neighbor + # of i, j), by using the distance between `S_(i-g)` and `S_(j-g)` for g in range(1, min(s, i + 1, j + 1)): QT_j = QT_j - T_B[i - g + m] * T_A[j - g + m] + T_B[i - g] * T_A[j - g] d_squared = core._calculate_squared_distance( From 92889162aa2673fcb3019637010643c4d8de9007 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 16 Jun 2022 21:32:34 -0600 Subject: [PATCH 207/416] Removed/Renamed intermediate variables --- stumpy/scrump.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index bb42b33fa..a00e19f4f 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -122,16 +122,17 @@ def _compute_PI( # can be used to update the top-k for BOTH subsequence `i` and # subsequence `j`. We update the latter here. - IDX = np.flatnonzero( + idx = np.flatnonzero( squared_distance_profile < P_squared[thread_idx, :, -1] ) - for idx in IDX: - d_squared = squared_distance_profile[idx] + for j in idx: pos = np.searchsorted( - P_squared[thread_idx, idx], d_squared, side="right" + P_squared[thread_idx, j], squared_distance_profile[j], side="right" ) - core._shift_insert_at_index(P_squared[thread_idx, idx], pos, d_squared) - core._shift_insert_at_index(I[thread_idx, idx], pos, i) + core._shift_insert_at_index( + P_squared[thread_idx, j], pos, squared_distance_profile[j] + ) + core._shift_insert_at_index(I[thread_idx, j], pos, i) # find EXACT (not approx.) 
value of `P_squared[thread_idx, i, 0]` nn_of_i = np.argmin(squared_distance_profile) From 163a775e36811cd9eecb07ea5f4729c05941b7ba Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 16 Jun 2022 21:40:44 -0600 Subject: [PATCH 208/416] Renamed variable for the sake of consistency --- stumpy/scrump.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index a00e19f4f..05aa6b63b 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -158,7 +158,7 @@ def _compute_PI( - T_B[i + g - 1] * T_A[j + g - 1] + T_B[i + g + m - 1] * T_A[j + g + m - 1] ) - d_squared = core._calculate_squared_distance( + D_squared = core._calculate_squared_distance( m, QT_j, M_T[i + g], @@ -166,21 +166,21 @@ def _compute_PI( μ_Q[j + g], σ_Q[j + g], ) - if d_squared < P_squared[thread_idx, i + g, -1]: + if D_squared < P_squared[thread_idx, i + g, -1]: pos = np.searchsorted( - P_squared[thread_idx, i + g], d_squared, side="right" + P_squared[thread_idx, i + g], D_squared, side="right" ) core._shift_insert_at_index( - P_squared[thread_idx, i + g], pos, d_squared + P_squared[thread_idx, i + g], pos, D_squared ) core._shift_insert_at_index(I[thread_idx, i + g], pos, j + g) - if d_squared < P_squared[thread_idx, j + g, -1]: + if D_squared < P_squared[thread_idx, j + g, -1]: pos = np.searchsorted( - P_squared[thread_idx, j + g], d_squared, side="right" + P_squared[thread_idx, j + g], D_squared, side="right" ) core._shift_insert_at_index( - P_squared[thread_idx, j + g], pos, d_squared + P_squared[thread_idx, j + g], pos, D_squared ) core._shift_insert_at_index(I[thread_idx, j + g], pos, i + g) @@ -189,7 +189,7 @@ def _compute_PI( # of i, j), by using the distance between `S_(i-g)` and `S_(j-g)` for g in range(1, min(s, i + 1, j + 1)): QT_j = QT_j - T_B[i - g + m] * T_A[j - g + m] + T_B[i - g] * T_A[j - g] - d_squared = core._calculate_squared_distance( + D_squared = core._calculate_squared_distance( m, QT_j, M_T[i - g], @@ -197,21 +197,21 @@ def _compute_PI( μ_Q[j - g], σ_Q[j - g], ) - if d_squared < P_squared[thread_idx, i - g, -1]: + if D_squared < P_squared[thread_idx, i - g, -1]: pos = np.searchsorted( - P_squared[thread_idx, i - g], d_squared, side="right" + P_squared[thread_idx, i - g], D_squared, side="right" ) core._shift_insert_at_index( - P_squared[thread_idx, i - g], pos, d_squared + P_squared[thread_idx, i - g], pos, D_squared ) core._shift_insert_at_index(I[thread_idx, i - g], pos, j - g) - if d_squared < P_squared[thread_idx, j - g, -1]: + if D_squared < P_squared[thread_idx, j - g, -1]: pos = np.searchsorted( - P_squared[thread_idx, j - g], d_squared, side="right" + P_squared[thread_idx, j - g], D_squared, side="right" ) core._shift_insert_at_index( - P_squared[thread_idx, j - g], pos, d_squared + P_squared[thread_idx, j - g], pos, D_squared ) core._shift_insert_at_index(I[thread_idx, j - g], pos, i - g) From cf3748da4967d2801da34885ba54cd6b1af8b06d Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 16 Jun 2022 22:30:31 -0600 Subject: [PATCH 209/416] Avoided shape mismatch by reshaping ndarray --- tests/test_scrump.py | 76 ++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/tests/test_scrump.py b/tests/test_scrump.py index 84d38d50c..0c9d54672 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -108,8 +108,8 @@ def test_scrump_self_join(T_A, T_B, percentages): np.random.seed(seed) ref_mp = naive.scrump(T_B, m, T_B, percentage, zone, False, None) - ref_P = ref_mp[:, 0] 
- ref_I = ref_mp[:, 1] + ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 + ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] @@ -141,8 +141,8 @@ def test_scrump_A_B_join(T_A, T_B, percentages): np.random.seed(seed) ref_mp = naive.scrump(T_A, m, T_B, percentage, None, False, None) - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] + ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 + ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] @@ -175,8 +175,8 @@ def test_scrump_A_B_join_swap(T_A, T_B, percentages): np.random.seed(seed) ref_mp = naive.scrump(T_B, m, T_A, percentage, None, False, None) - ref_P = ref_mp[:, 0] - # ref_I = ref_mp[:, 1] + ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 + # ref_I = ref_mp[:, 1].reshape(-1, 1) ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] @@ -211,8 +211,8 @@ def test_scrump_self_join_larger_window(T_A, T_B, m, percentages): np.random.seed(seed) ref_mp = naive.scrump(T_B, m, T_B, percentage, zone, False, None) - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] + ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 + ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] @@ -241,8 +241,8 @@ def test_scrump_self_join_full(T_A, T_B): zone = int(np.ceil(m / 4)) ref_mp = naive.stump(T_B, m, exclusion_zone=zone, row_wise=True) - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] + ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 + ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] @@ -262,8 +262,8 @@ def test_scrump_self_join_full(T_A, T_B): npt.assert_almost_equal(ref_right_I, comp_right_I) ref_mp = stump(T_B, m, ignore_trivial=True) - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] + ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 + ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] @@ -279,8 +279,8 @@ def test_scrump_A_B_join_full(T_A, T_B): m = 3 ref_mp = naive.stump(T_A, m, T_B=T_B, row_wise=True) - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] + ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 + ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] @@ -300,8 +300,8 @@ def test_scrump_A_B_join_full(T_A, T_B): npt.assert_almost_equal(ref_right_I, comp_right_I) ref_mp = stump(T_A, m, T_B=T_B, ignore_trivial=False) - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] + ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 + ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] @@ -317,8 +317,8 @@ def test_scrump_A_B_join_full_swap(T_A, T_B): m = 3 ref_mp = naive.stump(T_B, m, T_B=T_A, row_wise=True) - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] + ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 + ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] @@ -345,8 +345,8 @@ def test_scrump_self_join_full_larger_window(T_A, T_B, m): zone = int(np.ceil(m / 4)) ref_mp = naive.stump(T_B, m, exclusion_zone=zone, row_wise=True) - 
ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] + ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 + ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] @@ -383,8 +383,8 @@ def test_scrump_plus_plus_self_join(T_A, T_B, percentages): if ref_P[i] < ref_mp[i, 0]: ref_mp[i, 0] = ref_P[i] ref_mp[i, 1] = ref_I[i] - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] + ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 + ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 # ref_left_I = ref_mp[:, 2] # ref_right_I = ref_mp[:, 3] @@ -424,8 +424,8 @@ def test_scrump_plus_plus_A_B_join(T_A, T_B, percentages): if ref_P[i] < ref_mp[i, 0]: ref_mp[i, 0] = ref_P[i] ref_mp[i, 1] = ref_I[i] - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] + ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 + ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] @@ -459,8 +459,8 @@ def test_scrump_plus_plus_self_join_full(T_A, T_B): zone = int(np.ceil(m / 4)) ref_mp = naive.stump(T_B, m, exclusion_zone=zone, row_wise=True) - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] + ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 + ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] @@ -488,8 +488,8 @@ def test_scrump_plus_plus_A_B_join_full(T_A, T_B): zone = int(np.ceil(m / 4)) ref_mp = naive.stump(T_A, m, T_B=T_B, row_wise=True) - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] + ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 + ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] @@ -517,8 +517,8 @@ def test_scrump_plus_plus_A_B_join_full_swap(T_A, T_B): zone = int(np.ceil(m / 4)) ref_mp = naive.stump(T_B, m, T_B=T_A, row_wise=True) - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] + ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 + ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] @@ -552,8 +552,8 @@ def test_scrump_constant_subsequence_self_join(percentages): np.random.seed(seed) ref_mp = naive.scrump(T, m, T, percentage, zone, False, None) - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] + ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 + ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] @@ -590,8 +590,8 @@ def test_scrump_identical_subsequence_self_join(percentages): np.random.seed(seed) ref_mp = naive.scrump(T, m, T, percentage, zone, False, None) - ref_P = ref_mp[:, 0] - # ref_I = ref_mp[:, 1] + ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 + # ref_I = ref_mp[:, 1].reshape(-1, 1) # ref_left_I = ref_mp[:, 2] # ref_right_I = ref_mp[:, 3] @@ -636,8 +636,8 @@ def test_scrump_nan_inf_self_join( np.random.seed(seed) ref_mp = naive.scrump(T_B_sub, m, T_B_sub, percentage, zone, False, None) - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] + ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 + ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] @@ -670,8 +670,8 @@ def test_scrump_nan_zero_mean_self_join(percentages): np.random.seed(seed) 
ref_mp = naive.scrump(T, m, T, percentage, zone, False, None) - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] + ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 + ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] From 467f4a3171a11a0321fec64419af4b357658790e Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 16 Jun 2022 23:49:25 -0600 Subject: [PATCH 210/416] Refactored --- stumpy/stump.py | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 901c1afe8..a1570f8df 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -212,28 +212,26 @@ def _compute_diagonal( if T_B_subseq_isconstant[i + g] and T_A_subseq_isconstant[i]: pearson = 1.0 - # ρ[thread_idx, i, :] is sorted ascendingly. To update - # it, Its first element (i.e. the smallest value - # of array ρ[thread_idx, i]) MUST be discarded. Therefore, - # if the insertion index of new value in `ρ[thread_idx, i]` is idx, - # then, it should be substracted by 1 since the left of idx is shifted - # to the left. + # ρ[thread_idx, i, :] is sorted ascendingly. It MUST be updated + # when the newly-calculated pearson value becomes greater than the + # first (i.e. smallest) element of this array. (Reminder: higher + # pearson value means lower distance, which is of our interest) if pearson > ρ[thread_idx, i, 0]: - idx = np.searchsorted(ρ[thread_idx, i], pearson) - ρ[thread_idx, i, : idx - 1] = ρ[thread_idx, i, 1:idx] - ρ[thread_idx, i, idx - 1] = pearson - - I[thread_idx, i, : idx - 1] = I[thread_idx, i, 1:idx] - I[thread_idx, i, idx - 1] = i + g + pos = np.searchsorted(ρ[thread_idx, i], pearson) + core._shift_insert_at_index( + ρ[thread_idx, i], pos, pearson, shift=-1 + ) + core._shift_insert_at_index(I[thread_idx, i], pos, i + g, shift=-1) if ignore_trivial: # self-joins only if pearson > ρ[thread_idx, i + g, 0]: - idx = np.searchsorted(ρ[thread_idx, i + g], pearson) - ρ[thread_idx, i + g, : idx - 1] = ρ[thread_idx, i + g, 1:idx] - ρ[thread_idx, i + g, idx - 1] = pearson - - I[thread_idx, i + g, : idx - 1] = I[thread_idx, i + g, 1:idx] - I[thread_idx, i + g, idx - 1] = i + pos = np.searchsorted(ρ[thread_idx, i + g], pearson) + core._shift_insert_at_index( + ρ[thread_idx, i + g], pos, pearson, shift=-1 + ) + core._shift_insert_at_index( + I[thread_idx, i + g], pos, i, shift=-1 + ) if i < i + g: # left pearson correlation and left matrix profile index @@ -477,12 +475,13 @@ def _stump( k - 1, -1, -1 ): # reverse iteration to preserve order in ties if ρ[0, i, 0] < ρ[thread_idx, i, j]: - idx = np.searchsorted(ρ[0, i], ρ[thread_idx, i, j]) - ρ[0, i, : idx - 1] = ρ[0, i, 1:idx] - ρ[0, i, idx - 1] = ρ[thread_idx, i, j] - - I[0, i, : idx - 1] = I[0, i, 1:idx] - I[0, i, idx - 1] = I[thread_idx, i, j] + pos = np.searchsorted(ρ[0, i], ρ[thread_idx, i, j]) + core._shift_insert_at_index( + ρ[0, i], pos, ρ[thread_idx, i, j], shift=-1 + ) + core._shift_insert_at_index( + I[0, i], pos, I[thread_idx, i, j], shift=-1 + ) if ρL[0, i] < ρL[thread_idx, i]: ρL[0, i] = ρL[thread_idx, i] From d0f59562c485e18210dfd92f5bc3fcccd8b196f8 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 16 Jun 2022 23:54:12 -0600 Subject: [PATCH 211/416] Fixed comment --- stumpy/scrump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 05aa6b63b..6a979d2cd 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -150,7 +150,7 @@ def 
_compute_PI( m * M_T[j] * μ_Q[i] ) QT_j_prime = QT_j - # Update Top-k of BOTH subsequences at i+g and j+g (i.e. left neighbor + # Update Top-k of BOTH subsequences at i+g and j+g (i.e. right neighbor # of i, j), by using the distance between `S_(i+g)` and `S_(j+g)` for g in range(1, min(s, l - max(i, j))): QT_j = ( From 80b8594543f335ed4070e3548915bac158103591 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 17 Jun 2022 00:44:18 -0600 Subject: [PATCH 212/416] Refacored and Minor restructuring of lines --- stumpy/scrump.py | 168 +++++++++++++++++++++++------------------------ 1 file changed, 84 insertions(+), 84 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 6a979d2cd..d3c938e61 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -112,10 +112,91 @@ def _compute_PI( squared_distance_profile[:] = core._mass(Q, T_B, QT, μ_Q[i], σ_Q[i], M_T, Σ_T) squared_distance_profile[:] = np.square(squared_distance_profile) if excl_zone is not None: - zone_start = max(0, i - excl_zone) - zone_stop = min(l, i + excl_zone) - squared_distance_profile[zone_start : zone_stop + 1] = np.inf + core._apply_exclusion_zone(squared_distance_profile, i, excl_zone, np.inf) + # find EXACT (not approx.) value of `P_squared[thread_idx, i, 0]` + nn = np.argmin(squared_distance_profile) + core._shift_insert_at_index( + P_squared[thread_idx, i], 0, squared_distance_profile[nn] + ) + core._shift_insert_at_index(I[thread_idx, i], 0, nn) + + if P_squared[thread_idx, i, 0] == np.inf: # pragma: no cover + I[thread_idx, i, 0] = -1 + continue + + j = I[thread_idx, i, 0] + # Given the squared distance, work backwards and compute QT + QT_j = (m - P_squared[thread_idx, i, 0] / 2.0) * (Σ_T[j] * σ_Q[i]) + ( + m * M_T[j] * μ_Q[i] + ) + QT_j_prime = QT_j + # Update Top-k of BOTH subsequences at i+g and j+g (i.e. right neighbor + # of i, j), by using the distance between `S_(i+g)` and `S_(j+g)` + for g in range(1, min(s, l - max(i, j))): + QT_j = ( + QT_j + - T_B[i + g - 1] * T_A[j + g - 1] + + T_B[i + g + m - 1] * T_A[j + g + m - 1] + ) + D_squared = core._calculate_squared_distance( + m, + QT_j, + M_T[i + g], + Σ_T[i + g], + μ_Q[j + g], + σ_Q[j + g], + ) + if D_squared < P_squared[thread_idx, i + g, -1]: + pos = np.searchsorted( + P_squared[thread_idx, i + g], D_squared, side="right" + ) + core._shift_insert_at_index( + P_squared[thread_idx, i + g], pos, D_squared + ) + core._shift_insert_at_index(I[thread_idx, i + g], pos, j + g) + + if D_squared < P_squared[thread_idx, j + g, -1]: + pos = np.searchsorted( + P_squared[thread_idx, j + g], D_squared, side="right" + ) + core._shift_insert_at_index( + P_squared[thread_idx, j + g], pos, D_squared + ) + core._shift_insert_at_index(I[thread_idx, j + g], pos, i + g) + + QT_j = QT_j_prime + # Update Top-k of BOTH subsequences at i-g and j-g (i.e. 
left neighbor + # of i, j), by using the distance between `S_(i-g)` and `S_(j-g)` + for g in range(1, min(s, i + 1, j + 1)): + QT_j = QT_j - T_B[i - g + m] * T_A[j - g + m] + T_B[i - g] * T_A[j - g] + D_squared = core._calculate_squared_distance( + m, + QT_j, + M_T[i - g], + Σ_T[i - g], + μ_Q[j - g], + σ_Q[j - g], + ) + if D_squared < P_squared[thread_idx, i - g, -1]: + pos = np.searchsorted( + P_squared[thread_idx, i - g], D_squared, side="right" + ) + core._shift_insert_at_index( + P_squared[thread_idx, i - g], pos, D_squared + ) + core._shift_insert_at_index(I[thread_idx, i - g], pos, j - g) + + if D_squared < P_squared[thread_idx, j - g, -1]: + pos = np.searchsorted( + P_squared[thread_idx, j - g], D_squared, side="right" + ) + core._shift_insert_at_index( + P_squared[thread_idx, j - g], pos, D_squared + ) + core._shift_insert_at_index(I[thread_idx, j - g], pos, i - g) + + # self-join only if excl_zone is not None: # Note that the squared distance, `squared_distance_profile[j]`, # between subsequences `S_i = T[i : i + m]` and `S_j = T[j : j + m]` @@ -134,87 +215,6 @@ def _compute_PI( ) core._shift_insert_at_index(I[thread_idx, j], pos, i) - # find EXACT (not approx.) value of `P_squared[thread_idx, i, 0]` - nn_of_i = np.argmin(squared_distance_profile) - core._shift_insert_at_index( - P_squared[thread_idx, i], 0, squared_distance_profile[nn_of_i] - ) - core._shift_insert_at_index(I[thread_idx, i], 0, nn_of_i) - - if P_squared[thread_idx, i, 0] == np.inf: # pragma: no cover - I[thread_idx, i, 0] = -1 - else: - j = I[thread_idx, i, 0] - # Given the squared distance, work backwards and compute QT - QT_j = (m - P_squared[thread_idx, i, 0] / 2.0) * (Σ_T[j] * σ_Q[i]) + ( - m * M_T[j] * μ_Q[i] - ) - QT_j_prime = QT_j - # Update Top-k of BOTH subsequences at i+g and j+g (i.e. right neighbor - # of i, j), by using the distance between `S_(i+g)` and `S_(j+g)` - for g in range(1, min(s, l - max(i, j))): - QT_j = ( - QT_j - - T_B[i + g - 1] * T_A[j + g - 1] - + T_B[i + g + m - 1] * T_A[j + g + m - 1] - ) - D_squared = core._calculate_squared_distance( - m, - QT_j, - M_T[i + g], - Σ_T[i + g], - μ_Q[j + g], - σ_Q[j + g], - ) - if D_squared < P_squared[thread_idx, i + g, -1]: - pos = np.searchsorted( - P_squared[thread_idx, i + g], D_squared, side="right" - ) - core._shift_insert_at_index( - P_squared[thread_idx, i + g], pos, D_squared - ) - core._shift_insert_at_index(I[thread_idx, i + g], pos, j + g) - - if D_squared < P_squared[thread_idx, j + g, -1]: - pos = np.searchsorted( - P_squared[thread_idx, j + g], D_squared, side="right" - ) - core._shift_insert_at_index( - P_squared[thread_idx, j + g], pos, D_squared - ) - core._shift_insert_at_index(I[thread_idx, j + g], pos, i + g) - - QT_j = QT_j_prime - # Update Top-k of BOTH subsequences at i-g and j-g (i.e. 
left neighbor - # of i, j), by using the distance between `S_(i-g)` and `S_(j-g)` - for g in range(1, min(s, i + 1, j + 1)): - QT_j = QT_j - T_B[i - g + m] * T_A[j - g + m] + T_B[i - g] * T_A[j - g] - D_squared = core._calculate_squared_distance( - m, - QT_j, - M_T[i - g], - Σ_T[i - g], - μ_Q[j - g], - σ_Q[j - g], - ) - if D_squared < P_squared[thread_idx, i - g, -1]: - pos = np.searchsorted( - P_squared[thread_idx, i - g], D_squared, side="right" - ) - core._shift_insert_at_index( - P_squared[thread_idx, i - g], pos, D_squared - ) - core._shift_insert_at_index(I[thread_idx, i - g], pos, j - g) - - if D_squared < P_squared[thread_idx, j - g, -1]: - pos = np.searchsorted( - P_squared[thread_idx, j - g], D_squared, side="right" - ) - core._shift_insert_at_index( - P_squared[thread_idx, j - g], pos, D_squared - ) - core._shift_insert_at_index(I[thread_idx, j - g], pos, i - g) - @njit( # "(f8[:], f8[:], i8, f8[:], f8[:], f8[:], f8[:], f8[:], i8, i8, f8[:], f8[:]," From 33a96c6c8392b867afb11fefa88f13154b9ddc01 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 17 Jun 2022 03:06:28 -0600 Subject: [PATCH 213/416] Modified stimp after changing output shape in scrump --- stumpy/stimp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/stimp.py b/stumpy/stimp.py index 1c285f116..19b955dbe 100644 --- a/stumpy/stimp.py +++ b/stumpy/stimp.py @@ -218,7 +218,7 @@ def update(self): approx.update() self._PAN[ self._bfs_indices[self._n_processed], : approx.P_.shape[0] - ] = approx.P_ + ] = approx.P_.ravel() else: out = self._mp_func( self._T, From 41007f65581ca0321ab924dd9b5b8b6c6f1e1d3c Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 17 Jun 2022 03:07:21 -0600 Subject: [PATCH 214/416] Add pragma no cover --- stumpy/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/core.py b/stumpy/core.py index b4b102e83..fc504c329 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2647,5 +2647,5 @@ def _shift_insert_at_index(a, idx, v, shift=1): # elements were shifted to left, and thus the insertion becomes `idx-1` a[idx - 1] = v - else: + else: # pragma: no cover pass From 68efe209c3041e596c3775a8b6f20fb05ced850b Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 17 Jun 2022 10:01:38 -0600 Subject: [PATCH 215/416] Revised Docstrings --- stumpy/core.py | 16 +++++++++------- stumpy/gpu_stump.py | 16 ++++++++-------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index fc504c329..a2faf2cde 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2571,18 +2571,18 @@ def _merge_topk_PI(PA, PB, IA, IB): Parameters ---------- PA : numpy.ndarray - a (top-k) matrix profile, with ndim of 2, where values in each row are + A (top-k) matrix profile, with ndim of 2, where values in each row are sorted in ascending order. Also, it needs to be the same shape as PB. PB : numpy.ndarray - a (top-k) matrix profile, with ndim of 2, where values in each row are + A (top-k) matrix profile, with ndim of 2, where values in each row are sorted in ascending order. Also, it needs to be the same shape as PA. 
IA : numpy.ndarray - a (top-k) matrix profile indices, corresponding to PA + A (top-k) matrix profile indices, corresponding to PA IB : numpy.ndarray - a (top-k) matrix profile indices, corresponding to PB + A (top-k) matrix profile indices, corresponding to PB Returns ------- @@ -2607,8 +2607,9 @@ def _merge_topk_PI(PA, PB, IA, IB): def _shift_insert_at_index(a, idx, v, shift=1): """ Insert value `v` into array `a` at index `idx` (in place) and discard - the last element (i.e. without changing the length of `a`) when `shift=1` (default). - When `shift=-1`, the first element will be discarded instead. + the last element when `shift=1` (default). When `shift=-1`, the first element + will be discarded instead. In both cases, the length of `a` remain unchanged + at the end of function. Note ---- @@ -2630,7 +2631,8 @@ def _shift_insert_at_index(a, idx, v, shift=1): shift: int, default 1 The value 1 (default) indicates discarding the last element after inserting value `v` at index `idx`. The other value, -1, indicates discarding the first - element after inserting value `v` at index `idx` + element after inserting value `v` at index `idx`. Any value other than 1 + or -1 results in no change in the input array `a`. Returns ------- diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 63366a183..ecd8434b9 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -26,20 +26,20 @@ def _gpu_searchsorted_left(a, v, bfs, nlevel): 1-dim array sorted in ascending order. v : float - value to insert into array `a` + Value to insert into array `a` bfs : numpy.ndarray The breadth-first-search indices where the missing leaves of its corresponding binary search tree are filled with -1. nlevel : int - the number of levels in the binary search tree from which the array + The number of levels in the binary search tree from which the array `bfs` is obtained. Returns ------- idx : int - the index of the insertion point + The index of the insertion point """ n = a.shape[0] idx = 0 @@ -71,20 +71,20 @@ def _gpu_searchsorted_right(a, v, bfs, nlevel): 1-dim array sorted in ascending order. v : float - value to insert into array `a` + Value to insert into array `a` bfs : numpy.ndarray The breadth-first-search indices where the missing leaves of its corresponding binary search tree are filled with -1. nlevel : int - the number of levels in the binary search tree from which the array + The number of levels in the binary search tree from which the array `bfs` is obtained. Returns ------- idx : int - the index of the insertion point + The index of the insertion point """ n = a.shape[0] idx = 0 @@ -142,7 +142,7 @@ def _compute_and_update_PI_kernel( Parameters ---------- i : int - sliding window `i` + Sliding window `i` T_A : numpy.ndarray The time series or sequence for which to compute the dot product @@ -214,7 +214,7 @@ def _compute_and_update_PI_kernel( binary search tree are filled with -1. nlevel : int - the number of levels in the binary search tree from which the array + The number of levels in the binary search tree from which the array `bfs` is obtained. 
k : int From 7cbeae945cbe4ca811c10ee802aaa2e4b786fc23 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 17 Jun 2022 10:04:12 -0600 Subject: [PATCH 216/416] Fixed docstring --- stumpy/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index a2faf2cde..e4a689807 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2623,7 +2623,7 @@ def _shift_insert_at_index(a, idx, v, shift=1): idx: int The index at which the value `v` should be inserted. This can be any - integer number from `0` to `len(a) - 1`. + integer number from `0` to `len(a)`. v: float The value that should be inserted into array `a` at index `idx` @@ -2631,7 +2631,7 @@ def _shift_insert_at_index(a, idx, v, shift=1): shift: int, default 1 The value 1 (default) indicates discarding the last element after inserting value `v` at index `idx`. The other value, -1, indicates discarding the first - element after inserting value `v` at index `idx`. Any value other than 1 + element after inserting value `v` at index `idx`. Any value other than 1 or -1 results in no change in the input array `a`. Returns From 2a38dbbe24f86056f03cb7b83d398063307d95e6 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 23 Jun 2022 10:29:30 -0600 Subject: [PATCH 217/416] Revised docstring --- stumpy/core.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index e4a689807..89ef5edbd 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2606,10 +2606,10 @@ def _merge_topk_PI(PA, PB, IA, IB): @njit def _shift_insert_at_index(a, idx, v, shift=1): """ - Insert value `v` into array `a` at index `idx` (in place) and discard - the last element when `shift=1` (default). When `shift=-1`, the first element - will be discarded instead. In both cases, the length of `a` remain unchanged - at the end of function. + If `shift=1`, all elements in `a[idx:]` are shifted to the right by one element + and the last element is discarded. If `shift=-1`, all elements in `a[:idx]` + are shifted to the left by one element and the first element is discarded. In + both cases, the length of `a` remains unchanged. Note ---- From 616332efd7bd9a4491a179a7553cdf174b883727 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 23 Jun 2022 10:31:27 -0600 Subject: [PATCH 218/416] Removed unnecessary dangling else --- stumpy/core.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 89ef5edbd..4321bce4a 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2638,16 +2638,13 @@ def _shift_insert_at_index(a, idx, v, shift=1): ------- None """ - if shift == 1: + if shift >= 0: if 0 <= idx < len(a): a[idx + 1 :] = a[idx:-1] a[idx] = v - elif shift == -1: + else: if 0 < idx <= len(a): a[: idx - 1] = a[1:idx] # elements were shifted to left, and thus the insertion becomes `idx-1` a[idx - 1] = v - - else: # pragma: no cover - pass From 97a17cef740f4d9c7efc7bfcebde178b7a3217c3 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 23 Jun 2022 10:32:27 -0600 Subject: [PATCH 219/416] Removed unnecessary comment --- stumpy/scrump.py | 1 - 1 file changed, 1 deletion(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index d3c938e61..3c3c2916e 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -114,7 +114,6 @@ def _compute_PI( if excl_zone is not None: core._apply_exclusion_zone(squared_distance_profile, i, excl_zone, np.inf) - # find EXACT (not approx.) 
value of `P_squared[thread_idx, i, 0]` nn = np.argmin(squared_distance_profile) core._shift_insert_at_index( P_squared[thread_idx, i], 0, squared_distance_profile[nn] From e55ee07596df203de0bf85871082d12db090594b Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 23 Jun 2022 10:38:47 -0600 Subject: [PATCH 220/416] Revised structure of test function so, it follows the structure of the performant version --- tests/naive.py | 61 ++++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index cf5c2fa31..5f9dcd57c 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1417,41 +1417,44 @@ def prescrump(T_A, m, T_B, s, exclusion_zone=None, k=1): if exclusion_zone is not None: apply_exclusion_zone(distance_profile, i, exclusion_zone, np.inf) - # only for self-join - for idx in np.flatnonzero(distance_profile < P[:, -1]): - pos = np.searchsorted(P[idx], distance_profile[idx], side="right") - P[idx] = np.insert(P[idx], pos, distance_profile[idx])[:-1] - I[idx] = np.insert(I[idx], pos, i)[:-1] - I[i, 1:] = I[i, :-1] I[i, 0] = np.argmin(distance_profile) P[i, 1:] = P[i, :-1] P[i, 0] = distance_profile[I[i, 0]] + if P[i, 0] == np.inf: I[i, 0] = -1 - else: - j = I[i, 0] # index of 1st NN - for g in range(1, min(s, l - max(i, j))): - d = dist_matrix[i + g, j + g] - if d < P[i + g, -1]: - pos = np.searchsorted(P[i + g], d, side="right") - P[i + g] = np.insert(P[i + g], pos, d)[:-1] - I[i + g] = np.insert(I[i + g], pos, j + g)[:-1] - if d < P[j + g]: - pos = np.searchsorted(P[j + g], d, side="right") - P[j + g] = np.insert(P[j + g], pos, d)[:-1] - I[j + g] = np.insert(I[j + g], pos, i + g)[:-1] - - for g in range(1, min(s, i + 1, j + 1)): - d = dist_matrix[i - g, j - g] - if d < P[i - g, -1]: - pos = np.searchsorted(P[i - g], d, side="right") - P[i - g] = np.insert(P[i - g], pos, d)[:-1] - I[i - g] = np.insert(I[i - g], pos, j - g)[:-1] - if d < P[j - g]: - pos = np.searchsorted(P[j - g], d, side="right") - P[j - g] = np.insert(P[j - g], pos, d)[:-1] - I[j - g] = np.insert(I[j - g], pos, i - g)[:-1] + continue + + j = I[i, 0] # index of 1st NN + for g in range(1, min(s, l - max(i, j))): + d = dist_matrix[i + g, j + g] + if d < P[i + g, -1]: + pos = np.searchsorted(P[i + g], d, side="right") + P[i + g] = np.insert(P[i + g], pos, d)[:-1] + I[i + g] = np.insert(I[i + g], pos, j + g)[:-1] + if d < P[j + g]: + pos = np.searchsorted(P[j + g], d, side="right") + P[j + g] = np.insert(P[j + g], pos, d)[:-1] + I[j + g] = np.insert(I[j + g], pos, i + g)[:-1] + + for g in range(1, min(s, i + 1, j + 1)): + d = dist_matrix[i - g, j - g] + if d < P[i - g, -1]: + pos = np.searchsorted(P[i - g], d, side="right") + P[i - g] = np.insert(P[i - g], pos, d)[:-1] + I[i - g] = np.insert(I[i - g], pos, j - g)[:-1] + if d < P[j - g]: + pos = np.searchsorted(P[j - g], d, side="right") + P[j - g] = np.insert(P[j - g], pos, d)[:-1] + I[j - g] = np.insert(I[j - g], pos, i - g)[:-1] + + # self-join only + if exclusion_zone is not None: + for idx in np.flatnonzero(distance_profile < P[:, -1]): + pos = np.searchsorted(P[idx], distance_profile[idx], side="right") + P[idx] = np.insert(P[idx], pos, distance_profile[idx])[:-1] + I[idx] = np.insert(I[idx], pos, i)[:-1] return P, I From b17713669ed1b335ff50a8f2b692ba65d5e3f7ae Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 23 Jun 2022 10:43:42 -0600 Subject: [PATCH 221/416] Replaced ravel with flatten to get copy of array --- stumpy/stimp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/stumpy/stimp.py b/stumpy/stimp.py index 19b955dbe..c67c005a6 100644 --- a/stumpy/stimp.py +++ b/stumpy/stimp.py @@ -218,7 +218,7 @@ def update(self): approx.update() self._PAN[ self._bfs_indices[self._n_processed], : approx.P_.shape[0] - ] = approx.P_.ravel() + ] = approx.P_.flatten() else: out = self._mp_func( self._T, From fc7c2106d22fc952ff18e47634d3d52fe6024bef Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 23 Jun 2022 11:02:33 -0600 Subject: [PATCH 222/416] Changed the type of input parameter and revised docstring --- stumpy/core.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 4321bce4a..c96038522 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2604,12 +2604,12 @@ def _merge_topk_PI(PA, PB, IA, IB): @njit -def _shift_insert_at_index(a, idx, v, shift=1): +def _shift_insert_at_index(a, idx, v, shift='right'): """ - If `shift=1`, all elements in `a[idx:]` are shifted to the right by one element - and the last element is discarded. If `shift=-1`, all elements in `a[:idx]` - are shifted to the left by one element and the first element is discarded. In - both cases, the length of `a` remains unchanged. + If `shift=right`, all elements in `a[idx:]` are shifted to the right by one element + and the last element is discarded. If `shift=left` or any other string value, + all elements in `a[:idx]` are shifted to the left by one element and the first + element is discarded. In both cases, the length of `a` remains unchanged. Note ---- @@ -2623,22 +2623,24 @@ def _shift_insert_at_index(a, idx, v, shift=1): idx: int The index at which the value `v` should be inserted. This can be any - integer number from `0` to `len(a)`. + integer number from `0` to `len(a)`. When `idx=0` and `shift` is set to + "right", or when `idx=len(a)` and `shift` is set to any other string value, + then no change will occur on the input array `a`. v: float The value that should be inserted into array `a` at index `idx` - shift: int, default 1 - The value 1 (default) indicates discarding the last element after inserting - value `v` at index `idx`. The other value, -1, indicates discarding the first - element after inserting value `v` at index `idx`. Any value other than 1 - or -1 results in no change in the input array `a`. + shift: str, default "right" + The value that indicates whether the shifting of elements should be to the + right or to the left. If "right" (default), all elements in `a[idx:]` are + shifted to right by one element. For any other string value, all elements + in `a[:idx]` are shifted to the left by one element. 
Returns ------- None """ - if shift >= 0: + if shift == 'right': if 0 <= idx < len(a): a[idx + 1 :] = a[idx:-1] a[idx] = v From 7a4b46e3286efb9b88014a842c56b7fe19bf84fe Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 23 Jun 2022 11:24:12 -0600 Subject: [PATCH 223/416] Update the value of parameter to match its type --- stumpy/stump.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index a1570f8df..52bf70e08 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -219,18 +219,18 @@ def _compute_diagonal( if pearson > ρ[thread_idx, i, 0]: pos = np.searchsorted(ρ[thread_idx, i], pearson) core._shift_insert_at_index( - ρ[thread_idx, i], pos, pearson, shift=-1 + ρ[thread_idx, i], pos, pearson, shift="left" ) - core._shift_insert_at_index(I[thread_idx, i], pos, i + g, shift=-1) + core._shift_insert_at_index(I[thread_idx, i], pos, i + g, shift="left") if ignore_trivial: # self-joins only if pearson > ρ[thread_idx, i + g, 0]: pos = np.searchsorted(ρ[thread_idx, i + g], pearson) core._shift_insert_at_index( - ρ[thread_idx, i + g], pos, pearson, shift=-1 + ρ[thread_idx, i + g], pos, pearson, shift="left" ) core._shift_insert_at_index( - I[thread_idx, i + g], pos, i, shift=-1 + I[thread_idx, i + g], pos, i, shift="left" ) if i < i + g: @@ -477,10 +477,10 @@ def _stump( if ρ[0, i, 0] < ρ[thread_idx, i, j]: pos = np.searchsorted(ρ[0, i], ρ[thread_idx, i, j]) core._shift_insert_at_index( - ρ[0, i], pos, ρ[thread_idx, i, j], shift=-1 + ρ[0, i], pos, ρ[thread_idx, i, j], shift="left" ) core._shift_insert_at_index( - I[0, i], pos, I[thread_idx, i, j], shift=-1 + I[0, i], pos, I[thread_idx, i, j], shift="left" ) if ρL[0, i] < ρL[thread_idx, i]: From 2622d13c16e691ffa13c82ac7ce25cb66c21da5e Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 23 Jun 2022 11:26:35 -0600 Subject: [PATCH 224/416] Update the value of parameter to match its type --- tests/test_core.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index e3854b889..5136175b1 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1098,22 +1098,22 @@ def test_shift_insert_at_index(): indices = np.arange(k + 1) values = np.random.rand(k + 1) - # test shift = 1 + # test shift = "right" for (idx, v) in zip(indices, values): ref[:] = a comp[:] = a ref = np.insert(ref, idx, v)[:-1] - core._shift_insert_at_index(comp, idx, v, shift=1) # update comp in place + core._shift_insert_at_index(comp, idx, v, shift="right") # update comp in place npt.assert_array_equal(ref, comp) - # test shift = -1 + # test shift = "left" for (idx, v) in zip(indices, values): ref[:] = a comp[:] = a ref = np.insert(ref, idx, v)[1:] - core._shift_insert_at_index(comp, idx, v, shift=-1) # update comp in place + core._shift_insert_at_index(comp, idx, v, shift="left") # update comp in place npt.assert_array_equal(ref, comp) From fc3be799b744caee61b8b50242ee80195456174f Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 23 Jun 2022 11:27:55 -0600 Subject: [PATCH 225/416] Correct format --- stumpy/core.py | 4 ++-- stumpy/stump.py | 4 +++- tests/test_core.py | 8 ++++++-- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index c96038522..0d3e2ffdd 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2604,7 +2604,7 @@ def _merge_topk_PI(PA, PB, IA, IB): @njit -def _shift_insert_at_index(a, idx, v, shift='right'): +def _shift_insert_at_index(a, idx, v, shift="right"): """ If `shift=right`, all elements in 
`a[idx:]` are shifted to the right by one element and the last element is discarded. If `shift=left` or any other string value, @@ -2640,7 +2640,7 @@ def _shift_insert_at_index(a, idx, v, shift='right'): ------- None """ - if shift == 'right': + if shift == "right": if 0 <= idx < len(a): a[idx + 1 :] = a[idx:-1] a[idx] = v diff --git a/stumpy/stump.py b/stumpy/stump.py index 52bf70e08..b00a4b22d 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -221,7 +221,9 @@ def _compute_diagonal( core._shift_insert_at_index( ρ[thread_idx, i], pos, pearson, shift="left" ) - core._shift_insert_at_index(I[thread_idx, i], pos, i + g, shift="left") + core._shift_insert_at_index( + I[thread_idx, i], pos, i + g, shift="left" + ) if ignore_trivial: # self-joins only if pearson > ρ[thread_idx, i + g, 0]: diff --git a/tests/test_core.py b/tests/test_core.py index 5136175b1..96854093a 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1104,7 +1104,9 @@ def test_shift_insert_at_index(): comp[:] = a ref = np.insert(ref, idx, v)[:-1] - core._shift_insert_at_index(comp, idx, v, shift="right") # update comp in place + core._shift_insert_at_index( + comp, idx, v, shift="right" + ) # update comp in place npt.assert_array_equal(ref, comp) @@ -1114,6 +1116,8 @@ def test_shift_insert_at_index(): comp[:] = a ref = np.insert(ref, idx, v)[1:] - core._shift_insert_at_index(comp, idx, v, shift="left") # update comp in place + core._shift_insert_at_index( + comp, idx, v, shift="left" + ) # update comp in place npt.assert_array_equal(ref, comp) From 6411b7a05dedb4aff8a6ffff4a24dd0b65ec12d4 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 23 Jun 2022 12:12:01 -0600 Subject: [PATCH 226/416] Changed output structure of naive.scrump --- tests/naive.py | 40 +++++++++---------- tests/test_scrump.py | 91 +++++++++----------------------------------- 2 files changed, 38 insertions(+), 93 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 5f9dcd57c..67d5d95a9 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1481,8 +1481,13 @@ def scrump(T_A, m, T_B, percentage, exclusion_zone, pre_scrump, s, k=1): diags_ranges_start = diags_ranges[0, 0] diags_ranges_stop = diags_ranges[0, 1] - P = np.full((l, k + 2), np.inf, dtype=np.float64) # Topk + left/ right - I = np.full((l, k + 2), -1, dtype=np.int64) # Topk + left/ right + P = np.full((l, k), np.inf, dtype=np.float64) # Topk + PL = np.full(l, np.inf, dtype=np.float64) + PR = np.full(l, np.inf, dtype=np.float64) + + I = np.full((l, k), -1, dtype=np.int64) + IL = np.full(l, -1, dtype=np.int64) + IR = np.full(l, -1, dtype=np.int64) for diag_idx in range(diags_ranges_start, diags_ranges_stop): g = diags[diag_idx] @@ -1491,32 +1496,27 @@ def scrump(T_A, m, T_B, percentage, exclusion_zone, pre_scrump, s, k=1): for j in range(n_B - m + 1): if j - i == g: d = dist_matrix[i, j] - if d < P[i, k - 1]: - # update TopK of P[i] + if d < P[i, - 1]: # update TopK of P[i] idx = searchsorted_right(P[i], d) - P[i, :k] = np.insert(P[i, :k], idx, d)[:-1] - I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] + P[i] = np.insert(P[i], idx, d)[:-1] + I[i] = np.insert(I[i], idx, i + g)[:-1] - if exclusion_zone is not None and d < P[i + g, k - 1]: + if exclusion_zone is not None and d < P[i + g, -1]: idx = searchsorted_right(P[i + g], d) - P[i + g, :k] = np.insert(P[i + g, :k], idx, d)[:-1] - I[i + g, :k] = np.insert(I[i + g, :k], idx, i)[:-1] + P[i + g] = np.insert(P[i + g], idx, d)[:-1] + I[i + g] = np.insert(I[i + g], idx, i)[:-1] # left matrix profile and left matrix profile indices - if 
exclusion_zone is not None and i < i + g and d < P[i + g, k]: - P[i + g, k] = d - I[i + g, k] = i + if exclusion_zone is not None and i < i + g and d < PL[i + g]: + PL[i + g] = d + IL[i + g] = i # right matrix profile and right matrix profile indices - if exclusion_zone is not None and i + g > i and d < P[i, k + 1]: - P[i, k + 1] = d - I[i, k + 1] = i + g + if exclusion_zone is not None and i + g > i and d < PR[i]: + PR[i] = d + IR[i] = i + g - out = np.empty((l, 2 * k + 2), dtype=object) - out[:, :k] = P[:, :k] - out[:, k:] = I - - return out + return P, I, IL, IR def prescraamp(T_A, m, T_B, s, exclusion_zone=None, p=2.0): diff --git a/tests/test_scrump.py b/tests/test_scrump.py index 0c9d54672..3f0e96c61 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -107,11 +107,7 @@ def test_scrump_self_join(T_A, T_B, percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_mp = naive.scrump(T_B, m, T_B, percentage, zone, False, None) - ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 - ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 - ref_left_I = ref_mp[:, 2] - ref_right_I = ref_mp[:, 3] + ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump(T_B, m, T_B, percentage, zone, False, None) np.random.seed(seed) approx = scrump( @@ -140,11 +136,7 @@ def test_scrump_A_B_join(T_A, T_B, percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_mp = naive.scrump(T_A, m, T_B, percentage, None, False, None) - ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 - ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 - ref_left_I = ref_mp[:, 2] - ref_right_I = ref_mp[:, 3] + ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump(T_A, m, T_B, percentage, None, False, None) np.random.seed(seed) approx = scrump( @@ -174,11 +166,7 @@ def test_scrump_A_B_join_swap(T_A, T_B, percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_mp = naive.scrump(T_B, m, T_A, percentage, None, False, None) - ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 - # ref_I = ref_mp[:, 1].reshape(-1, 1) - ref_left_I = ref_mp[:, 2] - ref_right_I = ref_mp[:, 3] + ref_P, _, ref_left_I, ref_right_I = naive.scrump(T_B, m, T_A, percentage, None, False, None) np.random.seed(seed) approx = scrump( @@ -210,11 +198,7 @@ def test_scrump_self_join_larger_window(T_A, T_B, m, percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_mp = naive.scrump(T_B, m, T_B, percentage, zone, False, None) - ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 - ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 - ref_left_I = ref_mp[:, 2] - ref_right_I = ref_mp[:, 3] + ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump(T_B, m, T_B, percentage, zone, False, None) np.random.seed(seed) approx = scrump( @@ -378,15 +362,8 @@ def test_scrump_plus_plus_self_join(T_A, T_B, percentages): np.random.seed(seed) ref_P, ref_I = naive.prescrump(T_B, m, T_B, s=s, exclusion_zone=zone) - ref_mp = naive.scrump(T_B, m, T_B, percentage, zone, True, s) - for i in range(ref_mp.shape[0]): - if ref_P[i] < ref_mp[i, 0]: - ref_mp[i, 0] = ref_P[i] - ref_mp[i, 1] = ref_I[i] - ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 - ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 - # ref_left_I = ref_mp[:, 2] - # ref_right_I = ref_mp[:, 3] + ref_P_aux, ref_I_aux, _, _ = naive.scrump(T_B, m, T_B, percentage, zone, True, s) + 
naive.merge_topk_PI(ref_P, ref_P_aux, ref_I, ref_I_aux) np.random.seed(seed) approx = scrump( @@ -395,16 +372,12 @@ def test_scrump_plus_plus_self_join(T_A, T_B, percentages): approx.update() comp_P = approx.P_ comp_I = approx.I_ - # comp_left_I = approx.left_I_ - # comp_right_I = approx.right_I_ naive.replace_inf(ref_P) naive.replace_inf(comp_I) npt.assert_almost_equal(ref_P, comp_P) npt.assert_almost_equal(ref_I, comp_I) - # npt.assert_almost_equal(ref_left_I, comp_left_I) - # npt.assert_almost_equal(ref_right_I, comp_right_I) @pytest.mark.parametrize("T_A, T_B", test_data) @@ -419,15 +392,11 @@ def test_scrump_plus_plus_A_B_join(T_A, T_B, percentages): np.random.seed(seed) ref_P, ref_I = naive.prescrump(T_A, m, T_B, s=s) - ref_mp = naive.scrump(T_A, m, T_B, percentage, None, False, None) - for i in range(ref_mp.shape[0]): - if ref_P[i] < ref_mp[i, 0]: - ref_mp[i, 0] = ref_P[i] - ref_mp[i, 1] = ref_I[i] - ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 - ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 - ref_left_I = ref_mp[:, 2] - ref_right_I = ref_mp[:, 3] + + ref_P_aux, ref_I_aux, ref_left_I_aux, ref_right_I_aux = naive.scrump(T_A, m, T_B, percentage, None, False, None) + naive.merge_topk_PI(ref_P, ref_P_aux, ref_I, ref_I_aux) + ref_left_I = ref_left_I_aux + ref_right_I = ref_right_I_aux approx = scrump( T_A, @@ -551,11 +520,7 @@ def test_scrump_constant_subsequence_self_join(percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_mp = naive.scrump(T, m, T, percentage, zone, False, None) - ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 - ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 - ref_left_I = ref_mp[:, 2] - ref_right_I = ref_mp[:, 3] + ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump(T, m, T, percentage, zone, False, None) np.random.seed(seed) approx = scrump( @@ -589,11 +554,7 @@ def test_scrump_identical_subsequence_self_join(percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_mp = naive.scrump(T, m, T, percentage, zone, False, None) - ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 - # ref_I = ref_mp[:, 1].reshape(-1, 1) - # ref_left_I = ref_mp[:, 2] - # ref_right_I = ref_mp[:, 3] + ref_P, _, _, _ = naive.scrump(T, m, T, percentage, zone, False, None) np.random.seed(seed) approx = scrump( @@ -635,11 +596,7 @@ def test_scrump_nan_inf_self_join( seed = np.random.randint(100000) np.random.seed(seed) - ref_mp = naive.scrump(T_B_sub, m, T_B_sub, percentage, zone, False, None) - ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 - ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 - ref_left_I = ref_mp[:, 2] - ref_right_I = ref_mp[:, 3] + ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump(T_B_sub, m, T_B_sub, percentage, zone, False, None) np.random.seed(seed) approx = scrump(T_B_sub, m, percentage=percentage, pre_scrump=False) @@ -669,11 +626,7 @@ def test_scrump_nan_zero_mean_self_join(percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_mp = naive.scrump(T, m, T, percentage, zone, False, None) - ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 - ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 - ref_left_I = ref_mp[:, 2] - ref_right_I = ref_mp[:, 3] + ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump(T, m, T, percentage, zone, False, None) np.random.seed(seed) approx = scrump(T, m, percentage=percentage, 
pre_scrump=False) @@ -739,11 +692,7 @@ def test_scrump_self_join_KNN(T_A, T_B, percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_mp = naive.scrump(T_B, m, T_B, percentage, zone, False, None, k=k) - ref_P = ref_mp[:, :k] - ref_I = ref_mp[:, k : 2 * k] - ref_left_I = ref_mp[:, 2 * k] - ref_right_I = ref_mp[:, 2 * k + 1] + ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump(T_B, m, T_B, percentage, zone, False, None, k=k) np.random.seed(seed) approx = scrump( @@ -777,12 +726,8 @@ def test_scrump_A_B_join_KNN(T_A, T_B, percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_mp = naive.scrump(T_A, m, T_B, percentage, None, False, None, k=k) - ref_P = ref_mp[:, :k] - ref_I = ref_mp[:, k : 2 * k] - ref_left_I = ref_mp[:, 2 * k] - ref_right_I = ref_mp[:, 2 * k + 1] - + ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump(T_A, m, T_B, percentage, None, False, None, k=k) + np.random.seed(seed) approx = scrump( T_A, From 71d68c8747a8abbb7a6d748455f69768054649f4 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 23 Jun 2022 12:12:48 -0600 Subject: [PATCH 227/416] Correct format --- tests/naive.py | 2 +- tests/test_scrump.py | 46 ++++++++++++++++++++++++++++++++------------ 2 files changed, 35 insertions(+), 13 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 67d5d95a9..e3df5c236 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1496,7 +1496,7 @@ def scrump(T_A, m, T_B, percentage, exclusion_zone, pre_scrump, s, k=1): for j in range(n_B - m + 1): if j - i == g: d = dist_matrix[i, j] - if d < P[i, - 1]: # update TopK of P[i] + if d < P[i, -1]: # update TopK of P[i] idx = searchsorted_right(P[i], d) P[i] = np.insert(P[i], idx, d)[:-1] I[i] = np.insert(I[i], idx, i + g)[:-1] diff --git a/tests/test_scrump.py b/tests/test_scrump.py index 3f0e96c61..97d5164f4 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -107,7 +107,9 @@ def test_scrump_self_join(T_A, T_B, percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump(T_B, m, T_B, percentage, zone, False, None) + ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump( + T_B, m, T_B, percentage, zone, False, None + ) np.random.seed(seed) approx = scrump( @@ -136,7 +138,9 @@ def test_scrump_A_B_join(T_A, T_B, percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump(T_A, m, T_B, percentage, None, False, None) + ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump( + T_A, m, T_B, percentage, None, False, None + ) np.random.seed(seed) approx = scrump( @@ -166,7 +170,9 @@ def test_scrump_A_B_join_swap(T_A, T_B, percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_P, _, ref_left_I, ref_right_I = naive.scrump(T_B, m, T_A, percentage, None, False, None) + ref_P, _, ref_left_I, ref_right_I = naive.scrump( + T_B, m, T_A, percentage, None, False, None + ) np.random.seed(seed) approx = scrump( @@ -198,7 +204,9 @@ def test_scrump_self_join_larger_window(T_A, T_B, m, percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump(T_B, m, T_B, percentage, zone, False, None) + ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump( + T_B, m, T_B, percentage, zone, False, None + ) np.random.seed(seed) approx = scrump( @@ -362,7 +370,9 @@ def test_scrump_plus_plus_self_join(T_A, T_B, percentages): np.random.seed(seed) ref_P, ref_I = naive.prescrump(T_B, m, T_B, s=s, exclusion_zone=zone) - ref_P_aux, 
ref_I_aux, _, _ = naive.scrump(T_B, m, T_B, percentage, zone, True, s) + ref_P_aux, ref_I_aux, _, _ = naive.scrump( + T_B, m, T_B, percentage, zone, True, s + ) naive.merge_topk_PI(ref_P, ref_P_aux, ref_I, ref_I_aux) np.random.seed(seed) @@ -393,7 +403,9 @@ def test_scrump_plus_plus_A_B_join(T_A, T_B, percentages): np.random.seed(seed) ref_P, ref_I = naive.prescrump(T_A, m, T_B, s=s) - ref_P_aux, ref_I_aux, ref_left_I_aux, ref_right_I_aux = naive.scrump(T_A, m, T_B, percentage, None, False, None) + ref_P_aux, ref_I_aux, ref_left_I_aux, ref_right_I_aux = naive.scrump( + T_A, m, T_B, percentage, None, False, None + ) naive.merge_topk_PI(ref_P, ref_P_aux, ref_I, ref_I_aux) ref_left_I = ref_left_I_aux ref_right_I = ref_right_I_aux @@ -520,7 +532,9 @@ def test_scrump_constant_subsequence_self_join(percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump(T, m, T, percentage, zone, False, None) + ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump( + T, m, T, percentage, zone, False, None + ) np.random.seed(seed) approx = scrump( @@ -596,7 +610,9 @@ def test_scrump_nan_inf_self_join( seed = np.random.randint(100000) np.random.seed(seed) - ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump(T_B_sub, m, T_B_sub, percentage, zone, False, None) + ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump( + T_B_sub, m, T_B_sub, percentage, zone, False, None + ) np.random.seed(seed) approx = scrump(T_B_sub, m, percentage=percentage, pre_scrump=False) @@ -626,7 +642,9 @@ def test_scrump_nan_zero_mean_self_join(percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump(T, m, T, percentage, zone, False, None) + ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump( + T, m, T, percentage, zone, False, None + ) np.random.seed(seed) approx = scrump(T, m, percentage=percentage, pre_scrump=False) @@ -692,7 +710,9 @@ def test_scrump_self_join_KNN(T_A, T_B, percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump(T_B, m, T_B, percentage, zone, False, None, k=k) + ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump( + T_B, m, T_B, percentage, zone, False, None, k=k + ) np.random.seed(seed) approx = scrump( @@ -726,8 +746,10 @@ def test_scrump_A_B_join_KNN(T_A, T_B, percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump(T_A, m, T_B, percentage, None, False, None, k=k) - + ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump( + T_A, m, T_B, percentage, None, False, None, k=k + ) + np.random.seed(seed) approx = scrump( T_A, From 3e4234329400624a2543f5d921fd89448e74b1a2 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 23 Jun 2022 12:28:51 -0600 Subject: [PATCH 228/416] Add test function for scrump_plus_plus for TopK --- tests/test_scrump.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/test_scrump.py b/tests/test_scrump.py index 97d5164f4..3bd43b423 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -773,3 +773,44 @@ def test_scrump_A_B_join_KNN(T_A, T_B, percentages): npt.assert_almost_equal(ref_I, comp_I) npt.assert_almost_equal(ref_left_I, comp_left_I) npt.assert_almost_equal(ref_right_I, comp_right_I) + + +@pytest.mark.parametrize("T_A, T_B", test_data) +@pytest.mark.parametrize("percentages", percentages) +def test_scrump_plus_plus_self_join_KNN(T_A, T_B, percentages): + m = 3 + zone = 
int(np.ceil(m / 4)) + + for k in range(2, 4): + for s in range(1, zone + 1): + for percentage in percentages: + seed = np.random.randint(100000) + + np.random.seed(seed) + ref_P, ref_I = naive.prescrump( + T_B, m, T_B, s=s, exclusion_zone=zone, k=k + ) + ref_P_aux, ref_I_aux, _, _ = naive.scrump( + T_B, m, T_B, percentage, zone, True, s, k=k + ) + naive.merge_topk_PI(ref_P, ref_P_aux, ref_I, ref_I_aux) + + np.random.seed(seed) + approx = scrump( + T_B, + m, + ignore_trivial=True, + percentage=percentage, + pre_scrump=True, + s=s, + k=k, + ) + approx.update() + comp_P = approx.P_ + comp_I = approx.I_ + + naive.replace_inf(ref_P) + naive.replace_inf(comp_I) + + npt.assert_almost_equal(ref_P, comp_P) + npt.assert_almost_equal(ref_I, comp_I) From e512a63bc941387bae8daf69c5a245a015f5aba8 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 24 Jun 2022 12:43:48 -0600 Subject: [PATCH 229/416] Add naive version to merge peason profiles --- tests/naive.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/naive.py b/tests/naive.py index e3df5c236..cd8077ffb 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1790,3 +1790,29 @@ def merge_topk_PI(PA, PB, IA, IB): PA[:, :] = profile[:, : PA.shape[1]] IA[:, :] = indices[:, : PA.shape[1]] + + +def merge_topk_ρI(ρA, ρB, IA, IB): + # this is to merge two pearson profiles, each is a 2D array where each row + # contains an ascendingly-sorted values. + # Note that we are interested in keeping the top-k largest values. + # In the merged array (from right to left): the priority is with ρA (from right + # to left), and then with ρB(from right to left) + + # Example: + # ρA = [0(I), 0(II), 1], and ρB = [0', 1'(I), 1'(II)]. + # the prime symbol is to indicate that the values are from ρB + # and the greek numbers are to differntiate two same values in one array + + # so, the outcome of merging process should be: + # [0', 0(I), 0(II), 1'(I), 1'(II), 1] + + profile = np.column_stack((ρB, ρA)) + indices = np.column_stack((IB, IA)) + + idx = np.argsort(profile, axis=1) + profile = np.take_along_axis(profile, idx, axis=1) + indices = np.take_along_axis(indices, idx, axis=1) + + ρA[:, :] = profile[:, ρA.shape[1]: ] + IA[:, :] = indices[:, ρA.shape[1]: ] From 382bda27da190c0da9129de854ea72f3f924aabc Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 24 Jun 2022 12:44:21 -0600 Subject: [PATCH 230/416] Add test function for merging pearson profiles --- tests/test_core.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/test_core.py b/tests/test_core.py index 96854093a..69f4a886f 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1089,6 +1089,34 @@ def test_merge_topk_PI(): npt.assert_array_equal(ref_I, comp_I) +def test_merge_topk_ρI(): + n = 50 + for k in range(1, 6): + ρA = np.random.rand(n * k).reshape(n, k) + ρA[:, :] = np.sort(ρA, axis=1) # sorting each row separately + + ρB = np.random.rand(n * k).reshape(n, k) + col_idx = np.random.randint(0, k, size=n) + for i in range(n): # creating ties between values of PA and PB + ρB[i, col_idx[i]] = np.random.choice(ρA[i], size=1, replace=False) + ρB[:, :] = np.sort(ρB, axis=1) # sorting each row separately + + IA = np.arange(n * k).reshape(n, k) + IB = IA + n * k + + ref_ρ = ρA.copy() + ref_I = IA.copy() + + comp_ρ = ρA.copy() + comp_I = IA.copy() + + naive.merge_topk_ρI(ref_ρ, ρB, ref_I, IB) + core._merge_topk_ρI(comp_ρ, ρB, comp_I, IB) + + npt.assert_array_equal(ref_ρ, comp_ρ) + npt.assert_array_equal(ref_I, comp_I) + + def 
test_shift_insert_at_index(): for k in range(1, 6): a = np.random.rand(k) From b0a56f768031f77d7eb3332acf2d87a16b4e0bbe Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 24 Jun 2022 12:45:37 -0600 Subject: [PATCH 231/416] Corret format --- tests/naive.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index cd8077ffb..9858005bc 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1814,5 +1814,5 @@ def merge_topk_ρI(ρA, ρB, IA, IB): profile = np.take_along_axis(profile, idx, axis=1) indices = np.take_along_axis(indices, idx, axis=1) - ρA[:, :] = profile[:, ρA.shape[1]: ] - IA[:, :] = indices[:, ρA.shape[1]: ] + ρA[:, :] = profile[:, ρA.shape[1] :] + IA[:, :] = indices[:, ρA.shape[1] :] From b5a4e1553847cbe16e94d9661a061b3b2419c343 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 24 Jun 2022 13:00:15 -0600 Subject: [PATCH 232/416] Add performant function to merge pearson profiles --- stumpy/core.py | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/stumpy/core.py b/stumpy/core.py index 0d3e2ffdd..b93e074d7 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2603,6 +2603,51 @@ def _merge_topk_PI(PA, PB, IA, IB): stop += 1 # because of shifting elements to the right by one +@njit(parallel=True) +def _merge_topk_ρI(ρA, ρB, IA, IB): + """ + Merge two top-k pearson profiles ρA and ρB, and update ρA (in place). In the + merged array (from right to left): the priority is with ρA (from right to left), + and then with ρB(from right to left) Also, update IA accordingly. + + Unlike `_merge_topk_PI`, where `top-k` smallest values are kept, this function + keeps `top-k` largest values. + + Parameters + ---------- + ρA : numpy.ndarray + A (top-k) pearson profile, with ndim of 2, where values in each row are + sorted in ascending order. Also, it needs to be the same shape as ρB. + + ρB : numpy.ndarray + A (top-k) pearson profile, with ndim of 2, where values in each row are + sorted in ascending order. Also, it needs to be the same shape as ρA. 
+ + IA : numpy.ndarray + A (top-k) matrix profile indices, corresponding to ρA + + IB : numpy.ndarray + A (top-k) matrix profile indices, corresponding to ρB + + Returns + ------- + None + """ + for i in range(ρB.shape[0]): + # start = 0 + # stop = np.searchsorted(PA[i], PB[i, -1], side="right") + + for j in range(ρB.shape[1] - 1, -1, -1): + if ρB[i, j] > ρA[i, 0]: + idx = np.searchsorted(ρA[i], ρB[i, j], side="left") # + start + + _shift_insert_at_index(ρA[i], idx, ρB[i, j], shift="left") + _shift_insert_at_index(IA[i], idx, IB[i, j], shift="left") + + # start = idx + # stop += 1 # because of shifting elements to the right by one + + @njit def _shift_insert_at_index(a, idx, v, shift="right"): """ From d4d28feafe5ed2dad2f92e764846093324301f05 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 24 Jun 2022 13:04:26 -0600 Subject: [PATCH 233/416] Optimize function --- stumpy/core.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index b93e074d7..a25432b54 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2634,18 +2634,19 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): None """ for i in range(ρB.shape[0]): - # start = 0 - # stop = np.searchsorted(PA[i], PB[i, -1], side="right") + start = np.searchsorted(ρA[i], ρB[i, 0], side="left") + stop = ρB.shape[1] for j in range(ρB.shape[1] - 1, -1, -1): if ρB[i, j] > ρA[i, 0]: - idx = np.searchsorted(ρA[i], ρB[i, j], side="left") # + start + idx = np.searchsorted(ρA[i, start:stop], ρB[i, j], side="left") + start _shift_insert_at_index(ρA[i], idx, ρB[i, j], shift="left") _shift_insert_at_index(IA[i], idx, IB[i, j], shift="left") - # start = idx - # stop += 1 # because of shifting elements to the right by one + stop = idx # because of shifting elements to the left by one + if start > 0: + start -= 1 @njit From 99f2a570623b714be46b288a322c4b660b763194 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 24 Jun 2022 13:05:31 -0600 Subject: [PATCH 234/416] Avoid creating new memory --- tests/naive.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 9858005bc..03de99bf5 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1785,8 +1785,8 @@ def merge_topk_PI(PA, PB, IA, IB): indices = np.column_stack((IA, IB)) idx = np.argsort(profile, axis=1) - profile = np.take_along_axis(profile, idx, axis=1) - indices = np.take_along_axis(indices, idx, axis=1) + profile[:, :] = np.take_along_axis(profile, idx, axis=1) + indices[:, :] = np.take_along_axis(indices, idx, axis=1) PA[:, :] = profile[:, : PA.shape[1]] IA[:, :] = indices[:, : PA.shape[1]] @@ -1811,8 +1811,8 @@ def merge_topk_ρI(ρA, ρB, IA, IB): indices = np.column_stack((IB, IA)) idx = np.argsort(profile, axis=1) - profile = np.take_along_axis(profile, idx, axis=1) - indices = np.take_along_axis(indices, idx, axis=1) + profile[:, :] = np.take_along_axis(profile, idx, axis=1) + indices[:, :] = np.take_along_axis(indices, idx, axis=1) ρA[:, :] = profile[:, ρA.shape[1] :] IA[:, :] = indices[:, ρA.shape[1] :] From 855c429daf1ea3547a4c57fc737f6286517a086f Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 24 Jun 2022 13:07:22 -0600 Subject: [PATCH 235/416] Improve docstring --- stumpy/core.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/stumpy/core.py b/stumpy/core.py index a25432b54..379fe8c60 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2568,6 +2568,9 @@ def _merge_topk_PI(PA, PB, IA, IB): always choosing values of PA over values of PB in case of ties. Also, update IA accordingly. 
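A plain-NumPy way to picture this row-wise merge (keep the k smallest values per row, preferring entries from `PA` on ties) is to stack, stably sort, and truncate, in the spirit of the naive reference in `tests/naive.py`; the values below are illustrative:

    import numpy as np

    k = 3
    PA = np.array([[0.2, 0.5, 0.9]])   # each row sorted ascending
    PB = np.array([[0.1, 0.5, 0.6]])
    IA = np.array([[10, 11, 12]])
    IB = np.array([[20, 21, 22]])

    # Stacking PA before PB lets a stable sort keep PA's entry first on ties
    profile = np.column_stack((PA, PB))
    indices = np.column_stack((IA, IB))
    order = np.argsort(profile, axis=1, kind="stable")

    merged_P = np.take_along_axis(profile, order, axis=1)[:, :k]
    merged_I = np.take_along_axis(indices, order, axis=1)[:, :k]
    print(merged_P)   # [[0.1 0.2 0.5]]
    print(merged_I)   # [[20 10 11]]  -> the tied 0.5 is taken from PA (index 11)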
+ Unlike `_merge_topk_ρI`, where `top-k` largest values are kept, this function + keeps `top-k` smallest values. + Parameters ---------- PA : numpy.ndarray From 9e02bac4d5bd84e1131b80a869182de87541d004 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 24 Jun 2022 13:15:36 -0600 Subject: [PATCH 236/416] Refactored --- stumpy/stump.py | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index b00a4b22d..80923f33c 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -471,27 +471,18 @@ def _stump( # Reduction of results from all threads for thread_idx in range(1, n_threads): - for i in prange(l): - # top-k - for j in range( - k - 1, -1, -1 - ): # reverse iteration to preserve order in ties - if ρ[0, i, 0] < ρ[thread_idx, i, j]: - pos = np.searchsorted(ρ[0, i], ρ[thread_idx, i, j]) - core._shift_insert_at_index( - ρ[0, i], pos, ρ[thread_idx, i, j], shift="left" - ) - core._shift_insert_at_index( - I[0, i], pos, I[thread_idx, i, j], shift="left" - ) - - if ρL[0, i] < ρL[thread_idx, i]: - ρL[0, i] = ρL[thread_idx, i] - IL[0, i] = IL[thread_idx, i] - - if ρR[0, i] < ρR[thread_idx, i]: - ρR[0, i] = ρR[thread_idx, i] - IR[0, i] = IR[thread_idx, i] + # update top-k arrays + core._merge_topk_ρI(ρ[0], ρ[thread_idx], I[0], I[thread_idx]) + + # update left matrix profile and matrix profile indices + cond = ρL[0] < ρL[thread_idx] + ρL[0] = np.where(cond, ρL[thread_idx], ρL[0]) + IL[0] = np.where(cond, IL[thread_idx], IL[0]) + + # update right matrix profile and matrix profile indices + cond = ρR[0] < ρR[thread_idx] + ρR[0] = np.where(cond, ρR[thread_idx], ρR[0]) + IR[0] = np.where(cond, IR[thread_idx], IR[0]) # Reverse top-k rho (and its associated I) to be in descending order and # then convert from Pearson correlations to Euclidean distances (ascending order) From be44ab06c5b10c2607c95e7de0277c18da762c39 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 24 Jun 2022 13:21:56 -0600 Subject: [PATCH 237/416] Avoid creating new memory in for-loop --- stumpy/stumped.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index dc2978318..1246bbb2c 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -274,13 +274,13 @@ def stumped( # Update top-1 left matrix profile and matrix profile index cond = PL < profile_L - profile_L = np.where(cond, PL, profile_L) - indices_L = np.where(cond, IL, indices_L) + profile_L[:] = np.where(cond, PL, profile_L) + indices_L[:] = np.where(cond, IL, indices_L) # Update top-1 right matrix profile and matrix profile index cond = PR < profile_R - profile_R = np.where(cond, PR, profile_R) - indices_R = np.where(cond, IR, indices_R) + profile_R[:] = np.where(cond, PR, profile_R) + indices_R[:] = np.where(cond, IR, indices_R) out = np.empty((l, 2 * k + 2), dtype=object) out[:, :k] = profile From e0ad42af65f80df0565c56055861ee8d031b3ed0 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 24 Jun 2022 16:11:43 -0600 Subject: [PATCH 238/416] Update test function --- tests/test_stimp.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/tests/test_stimp.py b/tests/test_stimp.py index 089c1f1f9..f30514193 100644 --- a/tests/test_stimp.py +++ b/tests/test_stimp.py @@ -50,12 +50,9 @@ def test_stimp_1_percent(T): zone = int(np.ceil(m / 4)) s = zone tmp_P, tmp_I = naive.prescrump(T, m, T, s=s, exclusion_zone=zone) - ref_mp = naive.scrump(T, m, T, percentage, zone, True, s) - for i in range(ref_mp.shape[0]): - if 
tmp_P[i] < ref_mp[i, 0]: - ref_mp[i, 0] = tmp_P[i] - ref_mp[i, 1] = tmp_I[i] - ref_PAN[pan._bfs_indices[idx], : ref_mp.shape[0]] = ref_mp[:, 0] + ref_P, ref_I, _, _ = naive.scrump(T, m, T, percentage, zone, True, s) + naive.merge_topk_PI(ref_P, tmp_P, ref_I, tmp_I) + ref_PAN[pan._bfs_indices[idx], : ref_P.shape[0]] = ref_P.flatten() # Compare raw pan cmp_PAN = pan._PAN @@ -108,12 +105,9 @@ def test_stimp_max_m(T): zone = int(np.ceil(m / 4)) s = zone tmp_P, tmp_I = naive.prescrump(T, m, T, s=s, exclusion_zone=zone) - ref_mp = naive.scrump(T, m, T, percentage, zone, True, s) - for i in range(ref_mp.shape[0]): - if tmp_P[i] < ref_mp[i, 0]: - ref_mp[i, 0] = tmp_P[i] - ref_mp[i, 1] = tmp_I[i] - ref_PAN[pan._bfs_indices[idx], : ref_mp.shape[0]] = ref_mp[:, 0] + ref_P, ref_I, _, _ = naive.scrump(T, m, T, percentage, zone, True, s) + naive.merge_topk_PI(ref_P, tmp_P, ref_I, tmp_I) + ref_PAN[pan._bfs_indices[idx], : ref_P.shape[0]] = ref_P.flatten() # Compare raw pan cmp_PAN = pan._PAN From 33c215147831daac885868129a676af6587ff67c Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 24 Jun 2022 17:09:00 -0600 Subject: [PATCH 239/416] Revise function to make it parallelizable --- stumpy/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/core.py b/stumpy/core.py index 379fe8c60..90abbcb51 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2636,7 +2636,7 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): ------- None """ - for i in range(ρB.shape[0]): + for i in prange(ρB.shape[0]): start = np.searchsorted(ρA[i], ρB[i, 0], side="left") stop = ρB.shape[1] From 4a995d14a3ff56ebf1964f6c9a56cf5609fb9914 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 24 Jun 2022 18:30:04 -0600 Subject: [PATCH 240/416] Full test and coverage in 1hr From 2c55c885e5fd7212d34b90afcf7ebb5ad9094caa Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 24 Jun 2022 18:56:03 -0600 Subject: [PATCH 241/416] Revise docstrings --- stumpy/core.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 90abbcb51..86517ecac 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2575,17 +2575,17 @@ def _merge_topk_PI(PA, PB, IA, IB): ---------- PA : numpy.ndarray A (top-k) matrix profile, with ndim of 2, where values in each row are - sorted in ascending order. Also, it needs to be the same shape as PB. + sorted in ascending order. PB : numpy.ndarray A (top-k) matrix profile, with ndim of 2, where values in each row are - sorted in ascending order. Also, it needs to be the same shape as PA. + sorted in ascending order. `PB` must have the same shape as `PA`. IA : numpy.ndarray - A (top-k) matrix profile indices, corresponding to PA + A (top-k) matrix profile indices corresponding to PA IB : numpy.ndarray - A (top-k) matrix profile indices, corresponding to PB + A (top-k) matrix profile indices corresponding to PB Returns ------- @@ -2620,17 +2620,18 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): ---------- ρA : numpy.ndarray A (top-k) pearson profile, with ndim of 2, where values in each row are - sorted in ascending order. Also, it needs to be the same shape as ρB. + sorted in ascending order. ρB : numpy.ndarray A (top-k) pearson profile, with ndim of 2, where values in each row are - sorted in ascending order. Also, it needs to be the same shape as ρA. + sorted in ascending order. Also, it needs to be the same shape as ρA. `ρB` + must have the same shape as `ρA`. 
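For intuition, the row-wise merge described here (keep the k largest values per row, priority to ρA and then ρB, each from right to left) can be reproduced with the stack-sort-truncate approach of the naive reference; the values below mirror the worked example added later in this series (ρA = [0, 0', 1], ρB = [0, 1, 1'] → [1_B, 1'_B, 1_A]), with ASCII names standing in for ρA/ρB:

    import numpy as np

    k = 3
    rhoA = np.array([[0.0, 0.0, 1.0]])   # 0, 0', 1
    rhoB = np.array([[0.0, 1.0, 1.0]])   # 0, 1, 1'
    IA = np.array([[10, 11, 12]])        # tags for rhoA's entries
    IB = np.array([[20, 21, 22]])        # tags for rhoB's entries

    # Stack rhoB before rhoA, stable-sort ascending, keep the last (largest) k columns
    stacked_rho = np.column_stack((rhoB, rhoA))
    stacked_I = np.column_stack((IB, IA))
    order = np.argsort(stacked_rho, axis=1, kind="stable")

    merged_rho = np.take_along_axis(stacked_rho, order, axis=1)[:, -k:]
    merged_I = np.take_along_axis(stacked_I, order, axis=1)[:, -k:]
    print(merged_rho)   # [[1. 1. 1.]]
    print(merged_I)     # [[21 22 12]]  i.e. 1_B, 1'_B, 1_A, as in the example above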
IA : numpy.ndarray - A (top-k) matrix profile indices, corresponding to ρA + A (top-k) matrix profile indices corresponding to ρA IB : numpy.ndarray - A (top-k) matrix profile indices, corresponding to ρB + A (top-k) matrix profile indices corresponding to ρB Returns ------- From 07b83ab1c8df656d70d3a6ceb83cbcf7964f618b Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 24 Jun 2022 18:58:09 -0600 Subject: [PATCH 242/416] Revise docstrings --- stumpy/core.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 86517ecac..951b13721 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2624,8 +2624,7 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): ρB : numpy.ndarray A (top-k) pearson profile, with ndim of 2, where values in each row are - sorted in ascending order. Also, it needs to be the same shape as ρA. `ρB` - must have the same shape as `ρA`. + sorted in ascending order. `ρB` must have the same shape as `ρA`. IA : numpy.ndarray A (top-k) matrix profile indices corresponding to ρA From 5be2cf662406901569357b038215581cf19bcbd8 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 24 Jun 2022 19:15:29 -0600 Subject: [PATCH 243/416] Optimize function --- stumpy/core.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 951b13721..c95e869f7 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2595,15 +2595,28 @@ def _merge_topk_PI(PA, PB, IA, IB): start = 0 stop = np.searchsorted(PA[i], PB[i, -1], side="right") + if stop == 0: + # means PB[i, -1] < PA[i, 0], i.e. the maximum value in PB[i] is less + # than smallest value in PA[i]. So, we should replace PA[i] with PB[i]. + PA[i] = PB[i] + IA[i] = IB[i] + continue + for j in range(PB.shape[1]): - if PB[i, j] < PA[i, -1]: - idx = np.searchsorted(PA[i, start:stop], PB[i, j], side="right") + start + if PB[i, j] >= PA[i, -1]: + # PB[i] is sorted ascaendingly. + # Hence: PB[i, j+1] >= PB[i, j] >= PA[i, -1] + break + + # PB[i, j] is less than PA[i, -1], the maximum value in PA[i]. so, + # we MUST update PA[i]. + idx = np.searchsorted(PA[i, start:stop], PB[i, j], side="right") + start - _shift_insert_at_index(PA[i], idx, PB[i, j]) - _shift_insert_at_index(IA[i], idx, IB[i, j]) + _shift_insert_at_index(PA[i], idx, PB[i, j]) + _shift_insert_at_index(IA[i], idx, IB[i, j]) - start = idx - stop += 1 # because of shifting elements to the right by one + start = idx + stop += 1 # because of shifting elements to the right by one @njit(parallel=True) From 4896fe8dcdc4a6cb77226d000b7c87b0ce96e1c7 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 24 Jun 2022 19:32:18 -0600 Subject: [PATCH 244/416] Optimize function --- stumpy/core.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index c95e869f7..c643f668c 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2653,16 +2653,29 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): start = np.searchsorted(ρA[i], ρB[i, 0], side="left") stop = ρB.shape[1] + if start == ρB.shape[1]: + # means ρB[i, 0] > ρA[i, -1], i.e. the minimum value in ρB[i] is greater + # than greatest value in ρA[i]. So, we should replace ρA[i] with ρB[i]. + ρA[i] = ρB[i] + IA[i] = IB[i] + continue + for j in range(ρB.shape[1] - 1, -1, -1): - if ρB[i, j] > ρA[i, 0]: - idx = np.searchsorted(ρA[i, start:stop], ρB[i, j], side="left") + start + if ρB[i, j] <= ρA[i, 0]: + # ρB[i] is sorted ascaendingly. 
+ # Hence, next iteration: ρB[i, j-1] <= ρB[i, j] <= ρA[i, 0] + break + + # ρB[i, j] is greater than ρA[i, 0], the minimum value in ρA[i]. so, + # we MUST update ρA[i] to make sure we are keeping top-k largest values. + idx = np.searchsorted(ρA[i, start:stop], ρB[i, j], side="left") + start - _shift_insert_at_index(ρA[i], idx, ρB[i, j], shift="left") - _shift_insert_at_index(IA[i], idx, IB[i, j], shift="left") + _shift_insert_at_index(ρA[i], idx, ρB[i, j], shift="left") + _shift_insert_at_index(IA[i], idx, IB[i, j], shift="left") - stop = idx # because of shifting elements to the left by one - if start > 0: - start -= 1 + stop = idx # because of shifting elements to the left by one + if start > 0: + start -= 1 @njit From d9a0a20c9d65d223f3b95de15c54703133146f8b Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 24 Jun 2022 19:34:53 -0600 Subject: [PATCH 245/416] Rename variable to improve readability --- stumpy/scrump.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 3c3c2916e..1532e8056 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -114,11 +114,11 @@ def _compute_PI( if excl_zone is not None: core._apply_exclusion_zone(squared_distance_profile, i, excl_zone, np.inf) - nn = np.argmin(squared_distance_profile) + nn_idx = np.argmin(squared_distance_profile) core._shift_insert_at_index( - P_squared[thread_idx, i], 0, squared_distance_profile[nn] + P_squared[thread_idx, i], 0, squared_distance_profile[nn_idx] ) - core._shift_insert_at_index(I[thread_idx, i], 0, nn) + core._shift_insert_at_index(I[thread_idx, i], 0, nn_idx) if P_squared[thread_idx, i, 0] == np.inf: # pragma: no cover I[thread_idx, i, 0] = -1 From eabe0fbff4819b324dba25da53268a53f1e1ea71 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 24 Jun 2022 19:41:02 -0600 Subject: [PATCH 246/416] Revise comments --- stumpy/scrump.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 1532e8056..9d1f69d3e 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -130,8 +130,9 @@ def _compute_PI( m * M_T[j] * μ_Q[i] ) QT_j_prime = QT_j - # Update Top-k of BOTH subsequences at i+g and j+g (i.e. right neighbor - # of i, j), by using the distance between `S_(i+g)` and `S_(j+g)` + # Update top-k for both subsequences `S[i+g] = T[i+g:i+g+m]`` and + # `S[j+g] = T[j+g:j+g+m]` (i.e., the right neighbors of `T[i : i+m]` and + # `T[j:j+m]`) by using the distance between `S[i+g]` and `S[j+g]` for g in range(1, min(s, l - max(i, j))): QT_j = ( QT_j @@ -165,8 +166,9 @@ def _compute_PI( core._shift_insert_at_index(I[thread_idx, j + g], pos, i + g) QT_j = QT_j_prime - # Update Top-k of BOTH subsequences at i-g and j-g (i.e. 
left neighbor - # of i, j), by using the distance between `S_(i-g)` and `S_(j-g)` + # Update top-k for both subsequences `S[i-g] = T[i-g:i-g+m]` and + # `S[j-g] = T[j-g:j-g+m]` (i.e., the left neighbors of `T[i : i+m]` and + # `T[j:j+m]`) by using the distance between `S[i-g]` and `S[j-g]` for g in range(1, min(s, i + 1, j + 1)): QT_j = QT_j - T_B[i - g + m] * T_A[j - g + m] + T_B[i - g] * T_A[j - g] D_squared = core._calculate_squared_distance( @@ -195,7 +197,8 @@ def _compute_PI( ) core._shift_insert_at_index(I[thread_idx, j - g], pos, i - g) - # self-join only + # In the case of a self-join, the calculated distances can also be used + # to refine the top-k for all non-trivial subsequences if excl_zone is not None: # Note that the squared distance, `squared_distance_profile[j]`, # between subsequences `S_i = T[i : i + m]` and `S_j = T[j : j + m]` From e298fd36670438726f401c06fbb92f0b89b7ebaa Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 25 Jun 2022 09:39:35 -0600 Subject: [PATCH 247/416] Improve comments and docstrings --- stumpy/core.py | 64 ++++++++++++++++++++++++++------------------------ tests/naive.py | 31 ++++++++++++++---------- 2 files changed, 52 insertions(+), 43 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index c643f668c..92d4cbde3 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2596,24 +2596,25 @@ def _merge_topk_PI(PA, PB, IA, IB): stop = np.searchsorted(PA[i], PB[i, -1], side="right") if stop == 0: - # means PB[i, -1] < PA[i, 0], i.e. the maximum value in PB[i] is less - # than smallest value in PA[i]. So, we should replace PA[i] with PB[i]. + # means `PB[i, -1] < PA[i, 0]`, i.e. the maximum value in `PB[i]` is + # less than smallest value in `PA[i]`. So, we should replace `PA[i]` + # with `PB[i]` so that we have the top-k smallest. PA[i] = PB[i] IA[i] = IB[i] continue for j in range(PB.shape[1]): if PB[i, j] >= PA[i, -1]: - # PB[i] is sorted ascaendingly. - # Hence: PB[i, j+1] >= PB[i, j] >= PA[i, -1] + # `PB[i]` is sorted ascaendingly. + # Hence, in next iteration: `PB[i, j+1] >= PB[i, j] >= PA[i, -1]` break - # PB[i, j] is less than PA[i, -1], the maximum value in PA[i]. so, - # we MUST update PA[i]. + # `PB[i, j]` is less than `PA[i, -1]`, the maximum value in `PA[i]`. + # so, we must update `PA[i]` to have the top-k smallest values. idx = np.searchsorted(PA[i, start:stop], PB[i, j], side="right") + start - _shift_insert_at_index(PA[i], idx, PB[i, j]) - _shift_insert_at_index(IA[i], idx, IB[i, j]) + _shift_insert_at_index(PA[i], idx, PB[i, j], shift="right") + _shift_insert_at_index(IA[i], idx, IB[i, j], shift="right") start = idx stop += 1 # because of shifting elements to the right by one @@ -2622,9 +2623,13 @@ def _merge_topk_PI(PA, PB, IA, IB): @njit(parallel=True) def _merge_topk_ρI(ρA, ρB, IA, IB): """ - Merge two top-k pearson profiles ρA and ρB, and update ρA (in place). In the - merged array (from right to left): the priority is with ρA (from right to left), - and then with ρB(from right to left) Also, update IA accordingly. + Merge two top-k pearson profiles ρA and ρB, and update ρA (in place). The priority + is with ρA (from right to left) and then ρB (from right to left). + + Example: + note: the prime symbol below is to distinguish two elements with same value + ρA = [0, 0', 1], and ρB = [0, 1, 1']. + merging outcome: [1_B, 1'_B, 1_A] Unlike `_merge_topk_PI`, where `top-k` smallest values are kept, this function keeps `top-k` largest values. 
@@ -2654,42 +2659,39 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): stop = ρB.shape[1] if start == ρB.shape[1]: - # means ρB[i, 0] > ρA[i, -1], i.e. the minimum value in ρB[i] is greater - # than greatest value in ρA[i]. So, we should replace ρA[i] with ρB[i]. + # means `ρB[i, 0] > ρA[i, -1]`, i.e. the minimum value in `ρB[i]` is + # greater than greatest value in `ρA[i]`. So, we should replace `ρA[i]` + # with `ρB[i]` so that we have top-k largest values ρA[i] = ρB[i] IA[i] = IB[i] continue for j in range(ρB.shape[1] - 1, -1, -1): if ρB[i, j] <= ρA[i, 0]: - # ρB[i] is sorted ascaendingly. - # Hence, next iteration: ρB[i, j-1] <= ρB[i, j] <= ρA[i, 0] + # `ρB[i]` is sorted ascaendingly. + # Hence, in the next iteration: `ρB[i, j-1] <= ρB[i, j] <= ρA[i, 0]` break - # ρB[i, j] is greater than ρA[i, 0], the minimum value in ρA[i]. so, - # we MUST update ρA[i] to make sure we are keeping top-k largest values. + # `ρB[i, j]` is greater than `ρA[i, 0]`, the minimum value in `ρA[i]`. + # so, we must update `ρA[i]` to make sure we have top-k largest values. idx = np.searchsorted(ρA[i, start:stop], ρB[i, j], side="left") + start _shift_insert_at_index(ρA[i], idx, ρB[i, j], shift="left") _shift_insert_at_index(IA[i], idx, IB[i, j], shift="left") - stop = idx # because of shifting elements to the left by one + stop = idx if start > 0: - start -= 1 + start -= 1 # because of shifting elements to the left by one @njit def _shift_insert_at_index(a, idx, v, shift="right"): """ If `shift=right`, all elements in `a[idx:]` are shifted to the right by one element - and the last element is discarded. If `shift=left` or any other string value, - all elements in `a[:idx]` are shifted to the left by one element and the first - element is discarded. In both cases, the length of `a` remains unchanged. - - Note - ---- - No check is performed to ensure the value of parameter `shift` is 1 or -1. - It is user's responsibility to provide a valid value for this parameter. + and the last element is discarded. If `shift=left` (or any string value other + than "right") all elements in `a[:idx]` are shifted to the left by one element + and the first element is discarded. In both cases, the length of `a` remains + unchanged. Parameters ---------- @@ -2698,9 +2700,9 @@ def _shift_insert_at_index(a, idx, v, shift="right"): idx: int The index at which the value `v` should be inserted. This can be any - integer number from `0` to `len(a)`. When `idx=0` and `shift` is set to - "right", or when `idx=len(a)` and `shift` is set to any other string value, - then no change will occur on the input array `a`. + integer number from `0` to `len(a)`. When `idx=0` and `shift="right"`, + OR when `idx=len(a)` and `shift != "right"`, then no change will occur on + the input array `a`. v: float The value that should be inserted into array `a` at index `idx` @@ -2723,5 +2725,5 @@ def _shift_insert_at_index(a, idx, v, shift="right"): else: if 0 < idx <= len(a): a[: idx - 1] = a[1:idx] - # elements were shifted to left, and thus the insertion becomes `idx-1` + # elements were shifted to left, thus the insertion index becomes `idx-1` a[idx - 1] = v diff --git a/tests/naive.py b/tests/naive.py index 03de99bf5..47fe32c21 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1793,20 +1793,26 @@ def merge_topk_PI(PA, PB, IA, IB): def merge_topk_ρI(ρA, ρB, IA, IB): - # this is to merge two pearson profiles, each is a 2D array where each row - # contains an ascendingly-sorted values. - # Note that we are interested in keeping the top-k largest values. 
- # In the merged array (from right to left): the priority is with ρA (from right - # to left), and then with ρB(from right to left) + # this is to merge two pearson profiles `ρA` and `ρB`, where each is a 2D array + # and each row is sorted ascendingly. Smaller distance corresponds to larger + # pearson values. Therefore, we want to keep top-k largest values in merging + # row `ρA[i]` and `ρB[i]`. The priority is with `ρA (from right to left)` and + # then `ρB (from right to left)`. # Example: - # ρA = [0(I), 0(II), 1], and ρB = [0', 1'(I), 1'(II)]. - # the prime symbol is to indicate that the values are from ρB - # and the greek numbers are to differntiate two same values in one array - - # so, the outcome of merging process should be: - # [0', 0(I), 0(II), 1'(I), 1'(II), 1] - + # note: the prime symbol below is to distinguish two elements with same value + # ρA = [0, 0', 1], and ρB = [0, 1, 1']. + # merging outcome: [1_B, 1'_B, 1_A] + + # Naive Implementation: + # keeping top-k largest with the aforementioned priority rule is the same as + # sorting ascendingly while prioritizing `ρB` (from left to right) over `ρA` + # (from left to right), and then keep the second half of merged array. + + # In our example, it would be like this: + # merging `ρB` and `ρA` (prioritizing smaller values in `ρB`): + # [0_B, 0_A, 0'_A, 1_B, 1'_B, 1_A], and we just need to keep the second half + # of this array (and discard the first half) profile = np.column_stack((ρB, ρA)) indices = np.column_stack((IB, IA)) @@ -1814,5 +1820,6 @@ def merge_topk_ρI(ρA, ρB, IA, IB): profile[:, :] = np.take_along_axis(profile, idx, axis=1) indices[:, :] = np.take_along_axis(indices, idx, axis=1) + # keep the last k elements (top-k largest values) ρA[:, :] = profile[:, ρA.shape[1] :] IA[:, :] = indices[:, ρA.shape[1] :] From 9afba6c3d1460fea3b906623870678ee5364dc70 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 25 Jun 2022 10:15:58 -0600 Subject: [PATCH 248/416] Correct naive implementation --- tests/naive.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 47fe32c21..802d95a73 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -720,7 +720,7 @@ def __init__(self, T, m, excl_zone=None, p=2.0): self._T_isfinite = np.isfinite(self._T) self._m = m self._p = p - if excl_zone is None: + if excl_zone is None: # see stumpi, and make similar changes here self._excl_zone = int(np.ceil(self._m / config.STUMPY_EXCL_ZONE_DENOM)) self._l = self._T.shape[0] - m + 1 @@ -792,11 +792,13 @@ def __init__(self, T, m, excl_zone=None): self._T = self._T.copy() self._T_isfinite = np.isfinite(self._T) self._m = m - if excl_zone is None: + + self._excl_zone = excl_zone + if self._excl_zone is None: self._excl_zone = int(np.ceil(self._m / config.STUMPY_EXCL_ZONE_DENOM)) self._l = self._T.shape[0] - m + 1 - mp = stump(T, m) + mp = stump(T, m, exclusion_zone=self._excl_zone) self.P_ = mp[:, 0] self.I_ = mp[:, 1].astype(np.int64) self.left_P_ = np.full(self.P_.shape, np.inf) From 05945a026de823314840bd86cd1f87da5a65e312 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 25 Jun 2022 11:20:51 -0600 Subject: [PATCH 249/416] Enhance naive function to support top matrix profile --- tests/naive.py | 54 ++++++++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 802d95a73..ab8d251cb 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -787,25 +787,28 @@ def update(self, t): class stumpi_egress(object): - 
def __init__(self, T, m, excl_zone=None): + def __init__(self, T, m, excl_zone=None, k=1): self._T = np.asarray(T) self._T = self._T.copy() self._T_isfinite = np.isfinite(self._T) self._m = m + self._k = k self._excl_zone = excl_zone if self._excl_zone is None: self._excl_zone = int(np.ceil(self._m / config.STUMPY_EXCL_ZONE_DENOM)) self._l = self._T.shape[0] - m + 1 - mp = stump(T, m, exclusion_zone=self._excl_zone) - self.P_ = mp[:, 0] - self.I_ = mp[:, 1].astype(np.int64) - self.left_P_ = np.full(self.P_.shape, np.inf) - self.left_I_ = mp[:, 2].astype(np.int64) - for i, j in enumerate(self.left_I_): - if j >= 0: - D = core.mass(self._T[i : i + self._m], self._T[j : j + self._m]) + mp = stump(T, m, exclusion_zone=self._excl_zone, k=k) + self.P_ = mp[:, :k].astype(np.float64) + self.I_ = mp[:, k : 2 * k].astype(np.int64) + + self.left_I_ = mp[:, 2 * k].astype(np.int64) + self.left_P_ = np.full_like(self.left_I_, np.inf, dtype=np.float64) + + for i, nn_i in enumerate(self.left_I_): + if nn_i >= 0: + D = core.mass(self._T[i : i + self._m], self._T[nn_i : nn_i + self._m]) self.left_P_[i] = D[0] self._n_appended = 0 @@ -821,8 +824,8 @@ def update(self, t): self._T[-1] = 0 self._n_appended += 1 - self.P_[:] = np.roll(self.P_, -1) - self.I_[:] = np.roll(self.I_, -1) + self.P_[:, :] = np.roll(self.P_, -1, axis=0) + self.I_[:, :] = np.roll(self.I_, -1, axis=0) self.left_P_[:] = np.roll(self.left_P_, -1) self.left_I_[:] = np.roll(self.left_I_, -1) @@ -835,22 +838,25 @@ def update(self, t): D[:] = np.inf apply_exclusion_zone(D, D.shape[0] - 1, self._excl_zone, np.inf) + # update top-k matrix profile using newly calculated distance profile `D` for j in range(D.shape[0]): - if D[j] < self.P_[j]: - self.I_[j] = D.shape[0] - 1 + self._n_appended - self.P_[j] = D[j] - - I_last = np.argmin(D) + if D[j] < self.P_[j, -1]: + pos = np.searchsorted(self.P_[j], D[j], side="right") + self.P_[j] = np.insert(self.P_[j], pos, D[j])[:-1] + self.I_[j] = np.insert( + self.I_[j], pos, D.shape[0] - 1 + self._n_appended + ) - if np.isinf(D[I_last]): - self.I_[-1] = -1 - self.P_[-1] = np.inf - else: - self.I_[-1] = I_last + self._n_appended - self.P_[-1] = D[I_last] + # update top-k for the last, newly-updated index + I_last_topk = np.argsort(D)[: self._k] + self.P_[-1] = D[I_last_topk] + self.I_[-1] = I_last_topk + self._n_appended + self.I_[-1][self.P_[-1] == np.inf] = -1 - self.left_I_[-1] = I_last + self._n_appended - self.left_P_[-1] = D[I_last] + # for last indx, the left matrix profile value is self.P_[-1, 0] + # and the same goes for left matrix profile index + self.left_P_[-1] = self.P_[-1, 0] + self.left_I_[-1] = self.I_[-1, 0] def across_series_nearest_neighbors(Ts, Ts_idx, subseq_idx, m): From a72aeb76a5670a218955c472693b0b7373fc6251 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 25 Jun 2022 12:19:51 -0600 Subject: [PATCH 250/416] Enhace performant function to support topk matrix profile --- stumpy/stumpi.py | 144 +++++++++++++++++++++++++++++------------------ 1 file changed, 88 insertions(+), 56 deletions(-) diff --git a/stumpy/stumpi.py b/stumpy/stumpi.py index 1a8bbf72b..63c6ff42e 100644 --- a/stumpy/stumpi.py +++ b/stumpy/stumpi.py @@ -36,19 +36,30 @@ class stumpi: The p-norm to apply for computing the Minkowski distance. This parameter is ignored when `normalize == True`. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. 
+ Attributes ---------- P_ : numpy.ndarray - The updated matrix profile for `T` + The updated (top-k) matrix profile for `T`. When `k=1` (default), the first + (and only) column in this 2D array consists of the matrix profile. When + `k > 1`, the output has exactly `k` columns consist of the top-k matrix + profile. I_ : numpy.ndarray - The updated matrix profile indices for `T` + The updated (top-k) matrix profile indices for `T`. When `k=1` (default), + the first (and only) column in this 2D array consists of the matrix profile + indices. When `k > 1`, the output has exactly `k` columns consist of the + top-k matrix profile indices. left_P_ : numpy.ndarray - The updated left matrix profile for `T` + The updated left (top-1) matrix profile for `T` left_I_ : numpy.ndarray - The updated left matrix profile indices for `T` + The updated left (top-1) matrix profile indices for `T` T_ : numpy.ndarray The updated time series or sequence for which the matrix profile and matrix @@ -81,7 +92,7 @@ class stumpi: array([-1, 0, 1, 2]) """ - def __init__(self, T, m, egress=True, normalize=True, p=2.0): + def __init__(self, T, m, egress=True, normalize=True, p=2.0, k=1): """ Initialize the `stumpi` object @@ -106,27 +117,34 @@ def __init__(self, T, m, egress=True, normalize=True, p=2.0): p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. This parameter is ignored when `normalize == True`. + + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. """ self._T = core._preprocess(T) core.check_window_size(m, max_size=self._T.shape[-1]) self._m = m + self._k = k + self._n = self._T.shape[0] self._excl_zone = int(np.ceil(self._m / config.STUMPY_EXCL_ZONE_DENOM)) self._T_isfinite = np.isfinite(self._T) self._egress = egress - mp = stump(self._T, self._m) - self._P = mp[:, 0].astype(np.float64) - self._I = mp[:, 1].astype(np.int64) - self._left_I = mp[:, 2].astype(np.int64) - self._left_P = np.empty(self._P.shape, dtype=np.float64) - self._left_P[:] = np.inf + mp = stump(self._T, self._m, k=self._k) + self._P = mp[:, :k].astype(np.float64) + self._I = mp[:, k : 2 * k].astype(np.int64) + + self._left_I = mp[:, 2 * k].astype(np.int64) + self._left_P = np.full_like(self._left_I, np.inf, dtype=np.float64) self._T, self._M_T, self._Σ_T = core.preprocess(self._T, self._m) # Retrieve the left matrix profile values - for i, j in enumerate(self._left_I): - if j >= 0: - D = core.mass(self._T[i : i + self._m], self._T[j : j + self._m]) + for i, nn_i in enumerate(self._left_I): + if nn_i >= 0: + D = core.mass(self._T[i : i + self._m], self._T[nn_i : nn_i + self._m]) self._left_P[i] = D[0] Q = self._T[-m:] @@ -138,7 +156,7 @@ def __init__(self, T, m, egress=True, normalize=True, p=2.0): def update(self, t): """ Append a single new data point, `t`, to the existing time series `T` and update - the matrix profile and matrix profile indices. + the (top-k) matrix profile and matrix profile indices. 
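A short usage sketch of the streaming interface with the new `k` parameter (this assumes the `k` keyword and the k-column `P_`/`I_` attributes introduced by this patch; with `k=1` the previous top-1 behavior is preserved):

    import numpy as np
    import stumpy

    np.random.seed(0)
    T = np.random.rand(64)

    stream = stumpy.stumpi(T, m=8, k=3)   # maintain the top-3 matrix profile
    stream.update(np.random.rand())       # ingress a single new data point

    print(stream.P_.shape)       # one row per subsequence, 3 columns (top-3 distances)
    print(stream.I_.shape)       # matching top-3 nearest-neighbor indices
    print(stream.left_P_.shape)  # left matrix profile remains top-1 (1-D)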
Parameters ---------- @@ -161,8 +179,8 @@ def update(self, t): def _update_egress(self, t): """ - Ingress a new data point, egress the oldest data point, and update the matrix - profile and matrix profile indices + Ingress a new data point, egress the oldest data point, and update the (top-k) + matrix profile and matrix profile indices """ self._n = self._T.shape[0] l = self._n - self._m + 1 - 1 # Subtract 1 due to egress @@ -174,8 +192,8 @@ def _update_egress(self, t): t_drop = self._T[l - 1] self._T_isfinite[:-1] = self._T_isfinite[1:] - self._I[:-1] = self._I[1:] - self._P[:-1] = self._P[1:] + self._I[:-1, :] = self._I[1:, :] + self._P[:-1, :] = self._P[1:, :] self._left_I[:-1] = self._left_I[1:] self._left_P[:-1] = self._left_P[1:] @@ -211,28 +229,34 @@ def _update_egress(self, t): core.apply_exclusion_zone(D, D.shape[0] - 1, self._excl_zone, np.inf) - update_idx = np.argwhere(D < self._P).flatten() - self._I[update_idx] = D.shape[0] + self._n_appended - 1 # D.shape[0] is base-1 - self._P[update_idx] = D[update_idx] - - I_last = np.argmin(D) - - if np.isinf(D[I_last]): - self._I[-1] = -1 - self._P[-1] = np.inf - else: - self._I[-1] = I_last + self._n_appended - self._P[-1] = D[I_last] - - self._left_I[-1] = I_last + self._n_appended - self._left_P[-1] = D[I_last] + update_idx = np.argwhere(D < self._P[:, -1]).flatten() + for i in update_idx: + pos = np.searchsorted(self._P[i], D[i], side="right") + core._shift_insert_at_index(self._P[i], pos, D[i]) + core._shift_insert_at_index( + self._I[i], pos, D.shape[0] + self._n_appended - 1 + ) + # D.shape[0] is base-1 + + # O(Nlog(K)) time complexity + self._P[-1] = np.inf + self._I[-1] = -1 + for i, d in enumerate(D): + if d < self._P[-1, -1]: # mean last index, maximum value (k-th value) + pos = np.searchsorted(self._P[-1], d, side="right") + core._shift_insert_at_index(self._P[-1], pos, d) + core._shift_insert_at_index(self._I[-1], pos, i + self._n_appended) + + # for last index, the left matrix profile is basically `self._P[-1, 0]` + self._left_P[-1] = self._P[-1, 0] + self._left_I[-1] = self._I[-1, 0] self._QT[:] = self._QT_new def _update(self, t): """ - Ingress a new data point and update the matrix profile and matrix profile - indices without egressing the oldest data point + Ingress a new data point and update the (top-k) matrix profile and matrix + profile indices without egressing the oldest data point """ n = self._T.shape[0] l = n - self._m + 1 @@ -269,25 +293,33 @@ def _update(self, t): core.apply_exclusion_zone(D, D.shape[0] - 1, self._excl_zone, np.inf) - update_idx = np.argwhere(D[:l] < self._P[:l]).flatten() - self._I[update_idx] = l - self._P[update_idx] = D[update_idx] + update_idx = np.argwhere(D[:l] < self._P[:l, -1]).flatten() + for i in update_idx: + pos = np.searchsorted(self._P[i], D[i], side="right") + core._shift_insert_at_index(self._P[i], pos, D[i]) + core._shift_insert_at_index(self._I[i], pos, l) - I_last = np.argmin(D) - if np.isinf(D[I_last]): - I_new = np.append(self._I, -1) - P_new = np.append(self._P, np.inf) - else: - I_new = np.append(self._I, I_last) - P_new = np.append(self._P, D[I_last]) - left_I_new = np.append(self._left_I, I_last) - left_P_new = np.append(self._left_P, D[I_last]) + # Calculating top-k and left matrix profile for new subsequence whose + # distance profie is D + + # O(Nlog(K)) time complexity for obtaining top-k + P_new = np.full((1, self._k), np.inf, dtype=np.float64) + I_new = np.full((1, self._k), -1, dtype=np.int64) + + for i, d in enumerate(D): + if d < P_new[-1]: # maximum value 
in sorted array P_new + pos = np.searchsorted(P_new, d, side="right") + core._shift_insert_at_index(P_new, pos, d) + core._shift_insert_at_index(I_new, pos, i) + + left_I_new = P_new[0, 0] + left_P_new = I_new[0, 0] self._T = T_new - self._P = P_new - self._I = I_new - self._left_I = left_I_new - self._left_P = left_P_new + self._P = np.append(self._P, P_new, axis=0) + self._I = np.append(self._I, I_new, axis=0) + self._left_P = np.append(self._left_P, left_P_new) + self._left_I = np.append(self._left_I, left_I_new) self._QT = QT_new self._M_T = M_T_new self._Σ_T = Σ_T_new @@ -295,28 +327,28 @@ def _update(self, t): @property def P_(self): """ - Get the matrix profile + Get the (top-k) matrix profile """ return self._P.astype(np.float64) @property def I_(self): """ - Get the matrix profile indices + Get the (top-k) matrix profile indices """ return self._I.astype(np.int64) @property def left_P_(self): """ - Get the left matrix profile + Get the (top-1) left matrix profile """ return self._left_P.astype(np.float64) @property def left_I_(self): """ - Get the left matrix profile indices + Get the (top-1) sleft matrix profile indices """ return self._left_I.astype(np.int64) From c7ffdac6b94a5615c8c4bd2a5c9818af2d6af6c1 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 25 Jun 2022 12:21:13 -0600 Subject: [PATCH 251/416] Update existing test functions --- tests/test_stumpi.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/test_stumpi.py b/tests/test_stumpi.py index 0fa2c3066..e67446d0a 100644 --- a/tests/test_stumpi.py +++ b/tests/test_stumpi.py @@ -33,8 +33,8 @@ def test_stumpi_self_join(): comp_left_I = stream.left_I_ ref_mp = naive.stump(stream.T_, m, exclusion_zone=zone, row_wise=True) - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] + ref_P = ref_mp[:, 0].reshape(-1, 1) + ref_I = ref_mp[:, 1].reshape(-1, 1) ref_left_P = np.empty(ref_P.shape) ref_left_P[:] = np.inf ref_left_I = ref_mp[:, 2] @@ -211,8 +211,8 @@ def test_stumpi_init_nan_inf_self_join(substitute, substitution_locations): stream.T_[substitution_location] = substitute ref_mp = naive.stump(stream.T_, m, exclusion_zone=zone, row_wise=True) - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] + ref_P = ref_mp[:, 0].reshape(-1, 1) + ref_I = ref_mp[:, 1].reshape(-1, 1) naive.replace_inf(ref_P) naive.replace_inf(comp_P) @@ -386,8 +386,8 @@ def test_stumpi_stream_nan_inf_self_join(substitute, substitution_locations): stream.T_[30:][substitution_location] = substitute ref_mp = naive.stump(stream.T_, m, exclusion_zone=zone, row_wise=True) - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] + ref_P = ref_mp[:, 0].reshape(-1, 1) + ref_I = ref_mp[:, 1].reshape(-1, 1) naive.replace_inf(ref_P) naive.replace_inf(comp_P) @@ -547,7 +547,7 @@ def test_stumpi_constant_subsequence_self_join(): # comp_I = stream.I_ ref_mp = naive.stump(stream.T_, m, exclusion_zone=zone, row_wise=True) - ref_P = ref_mp[:, 0] + ref_P = ref_mp[:, 0].reshape(-1, 1) # ref_I = ref_mp[:, 1] naive.replace_inf(ref_P) @@ -702,7 +702,7 @@ def test_stumpi_identical_subsequence_self_join(): # comp_I = stream.I_ ref_mp = naive.stump(stream.T_, m, exclusion_zone=zone, row_wise=True) - ref_P = ref_mp[:, 0] + ref_P = ref_mp[:, 0].reshape(-1, 1) # ref_I = ref_mp[:, 1] naive.replace_inf(ref_P) From 34941c2e6eb602d1f1e02a87056870abcd36e8ca Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 25 Jun 2022 12:23:23 -0600 Subject: [PATCH 252/416] Correct format --- stumpy/stumpi.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/stumpy/stumpi.py b/stumpy/stumpi.py index 63c6ff42e..c147f7a1c 100644 --- a/stumpy/stumpi.py +++ b/stumpy/stumpi.py @@ -119,9 +119,9 @@ def __init__(self, T, m, egress=True, normalize=True, p=2.0, k=1): ignored when `normalize == True`. k : int, default 1 - The number of top `k` smallest distances used to construct the matrix profile. - Note that this will increase the total computational time and memory usage - when k > 1. + The number of top `k` smallest distances used to construct the matrix + profile. Note that this will increase the total computational time and + memory usage when `k > 1`. """ self._T = core._preprocess(T) core.check_window_size(m, max_size=self._T.shape[-1]) From 8cbe3081fea873cce2caea20bba62662e4183630 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 25 Jun 2022 12:32:54 -0600 Subject: [PATCH 253/416] Fix shape of array --- stumpy/stumpi.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/stumpy/stumpi.py b/stumpy/stumpi.py index c147f7a1c..4482c23e4 100644 --- a/stumpy/stumpi.py +++ b/stumpy/stumpi.py @@ -303,8 +303,8 @@ def _update(self, t): # distance profie is D # O(Nlog(K)) time complexity for obtaining top-k - P_new = np.full((1, self._k), np.inf, dtype=np.float64) - I_new = np.full((1, self._k), -1, dtype=np.int64) + P_new = np.full(self._k, np.inf, dtype=np.float64) + I_new = np.full(self._k, -1, dtype=np.int64) for i, d in enumerate(D): if d < P_new[-1]: # maximum value in sorted array P_new @@ -312,12 +312,12 @@ def _update(self, t): core._shift_insert_at_index(P_new, pos, d) core._shift_insert_at_index(I_new, pos, i) - left_I_new = P_new[0, 0] - left_P_new = I_new[0, 0] + left_I_new = P_new[0] + left_P_new = I_new[0] self._T = T_new - self._P = np.append(self._P, P_new, axis=0) - self._I = np.append(self._I, I_new, axis=0) + self._P = np.append(self._P, P_new.reshape(1, -1), axis=0) + self._I = np.append(self._I, I_new.reshape(1, -1), axis=0) self._left_P = np.append(self._left_P, left_P_new) self._left_I = np.append(self._left_I, left_I_new) self._QT = QT_new From 2b5038dcc6fc3099db6ba1d4f07b1029ad4e3967 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 25 Jun 2022 12:35:43 -0600 Subject: [PATCH 254/416] Fix shape of array --- tests/test_stumpi.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_stumpi.py b/tests/test_stumpi.py index e67446d0a..b43a6e8d1 100644 --- a/tests/test_stumpi.py +++ b/tests/test_stumpi.py @@ -35,9 +35,8 @@ def test_stumpi_self_join(): ref_mp = naive.stump(stream.T_, m, exclusion_zone=zone, row_wise=True) ref_P = ref_mp[:, 0].reshape(-1, 1) ref_I = ref_mp[:, 1].reshape(-1, 1) - ref_left_P = np.empty(ref_P.shape) - ref_left_P[:] = np.inf ref_left_I = ref_mp[:, 2] + ref_left_P = np.full_like(ref_left_I, np.inf, dtype=np.float64) for i, j in enumerate(ref_left_I): if j >= 0: D = core.mass(stream.T_[i : i + m], stream.T_[j : j + m]) From 9cc800a90e8057a1d8e7978a8ffdbad5ff09cd29 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 25 Jun 2022 12:42:35 -0600 Subject: [PATCH 255/416] Add kind keyword for sorting --- tests/naive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/naive.py b/tests/naive.py index ab8d251cb..721e89aa2 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -848,7 +848,7 @@ def update(self, t): ) # update top-k for the last, newly-updated index - I_last_topk = np.argsort(D)[: self._k] + I_last_topk = np.argsort(D, kind="mergesort")[: self._k] self.P_[-1] = D[I_last_topk] self.I_[-1] = I_last_topk + self._n_appended 
self.I_[-1][self.P_[-1] == np.inf] = -1 From 5b0b3fefbbde49ad6c417819c29706cfb647fadf Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 25 Jun 2022 14:29:23 -0600 Subject: [PATCH 256/416] Fix bugs --- stumpy/stumpi.py | 4 ++-- tests/naive.py | 2 +- tests/test_stumpi.py | 23 +++++++++++++---------- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/stumpy/stumpi.py b/stumpy/stumpi.py index 4482c23e4..727c9c64b 100644 --- a/stumpy/stumpi.py +++ b/stumpy/stumpi.py @@ -312,8 +312,8 @@ def _update(self, t): core._shift_insert_at_index(P_new, pos, d) core._shift_insert_at_index(I_new, pos, i) - left_I_new = P_new[0] - left_P_new = I_new[0] + left_I_new = I_new[0] + left_P_new = P_new[0] self._T = T_new self._P = np.append(self._P, P_new.reshape(1, -1), axis=0) diff --git a/tests/naive.py b/tests/naive.py index 721e89aa2..5b404f50c 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -845,7 +845,7 @@ def update(self, t): self.P_[j] = np.insert(self.P_[j], pos, D[j])[:-1] self.I_[j] = np.insert( self.I_[j], pos, D.shape[0] - 1 + self._n_appended - ) + )[:-1] # update top-k for the last, newly-updated index I_last_topk = np.argsort(D, kind="mergesort")[: self._k] diff --git a/tests/test_stumpi.py b/tests/test_stumpi.py index b43a6e8d1..262d88495 100644 --- a/tests/test_stumpi.py +++ b/tests/test_stumpi.py @@ -35,7 +35,7 @@ def test_stumpi_self_join(): ref_mp = naive.stump(stream.T_, m, exclusion_zone=zone, row_wise=True) ref_P = ref_mp[:, 0].reshape(-1, 1) ref_I = ref_mp[:, 1].reshape(-1, 1) - ref_left_I = ref_mp[:, 2] + ref_left_I = ref_mp[:, 2].astype(np.int64) ref_left_P = np.full_like(ref_left_I, np.inf, dtype=np.float64) for i, j in enumerate(ref_left_I): if j >= 0: @@ -860,21 +860,24 @@ def test_stumpi_profile_index_match(): T_stream = T_full[:warm_start].copy() stream = stumpi(T_stream, m, egress=True) - P = np.full(stream.P_.shape, np.inf) - left_P = np.full(stream.left_P_.shape, np.inf) + P = np.full_like(stream.P_, np.inf, dtype=np.float64) + left_P = np.full_like(stream.left_P_, np.inf, dtype=np.float64) n = 0 for i in range(len(T_stream), len(T_full)): t = T_full[i] stream.update(t) - P[:] = np.inf - idx = np.argwhere(stream.I_ >= 0).flatten() - P[idx] = naive.distance( - naive.z_norm(T_full_subseq[idx + n + 1], axis=1), - naive.z_norm(T_full_subseq[stream.I_[idx]], axis=1), - axis=1, - ) + P[:, :] = np.inf + mask = stream.I_ >= 0 + + for j in range(P.shape[1]): # `j` as j-th nearest neighbor + IDX = np.flatnonzero(mask[:, j]) + P[IDX, j] = naive.distance( + naive.z_norm(T_full_subseq[IDX + n + 1], axis=1), + naive.z_norm(T_full_subseq[stream.I_[IDX, j]], axis=1), + axis=1, + ) left_P[:] = np.inf idx = np.argwhere(stream.left_I_ >= 0).flatten() From 0335294ae9ee6d195ec3017a82cee6a66d59f699 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 25 Jun 2022 21:49:48 -0600 Subject: [PATCH 257/416] Remove ineffective inner prange --- stumpy/stump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 80923f33c..59be1e61d 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -493,7 +493,7 @@ def _stump( p_norm_R = np.abs(2 * m * (1 - ρR[0, :])) for i in prange(p_norm.shape[0]): - for j in prange(p_norm.shape[1]): + for j in range(p_norm.shape[1]): if p_norm[i, j] < config.STUMPY_P_NORM_THRESHOLD: p_norm[i, j] = 0.0 From 6c05e30794cddc1a7f6e3c3c4579071c152fed20 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 25 Jun 2022 22:28:50 -0600 Subject: [PATCH 258/416] Temporarily added parameter k to avoid decorator failure --- 
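(A minimal standalone sketch of the sorted top-k insertion pattern that the naive `update` above relies on, assuming plain NumPy and an ascending `P_row` padded with `np.inf`; this is an illustration, not the library code itself.)

    import numpy as np

    def insert_topk(P_row, I_row, d, idx):
        # Insert distance `d` (whose neighbor index is `idx`) into the ascending
        # top-k buffers, then drop the now-evicted worst (last) entry
        if d < P_row[-1]:
            pos = np.searchsorted(P_row, d, side="right")
            P_row[:] = np.insert(P_row, pos, d)[:-1]
            I_row[:] = np.insert(I_row, pos, idx)[:-1]

    P_row = np.array([1.0, 2.5, np.inf])
    I_row = np.array([7, 3, -1])
    insert_topk(P_row, I_row, 2.0, 11)
    print(P_row)  # [1.  2.  2.5]
    print(I_row)  # [ 7 11  3]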
stumpy/aampi.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/stumpy/aampi.py b/stumpy/aampi.py index de7c24126..938b5b488 100644 --- a/stumpy/aampi.py +++ b/stumpy/aampi.py @@ -8,6 +8,7 @@ class aampi: + # needs to be enhanced to support top-k matrix profile """ Compute an incremental non-normalized (i.e., without z-normalization) matrix profile for streaming data @@ -28,6 +29,11 @@ class aampi: p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. + Attributes ---------- P_ : numpy.ndarray @@ -62,7 +68,7 @@ class aampi: Note that we have extended this algorithm for AB-joins as well. """ - def __init__(self, T, m, egress=True, p=2.0): + def __init__(self, T, m, egress=True, p=2.0, k=1): """ Initialize the `stumpi` object @@ -81,6 +87,12 @@ def __init__(self, T, m, egress=True, p=2.0): p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. + + + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix + profile. Note that this will increase the total computational time and + memory usage when k > 1. """ self._T = core._preprocess(T) core.check_window_size(m, max_size=self._T.shape[-1]) From 23a54ba827af4913fbad9b91660c189c3f983ca2 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 25 Jun 2022 22:31:29 -0600 Subject: [PATCH 259/416] Improve comments --- stumpy/scrump.py | 4 ++-- tests/naive.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 9d1f69d3e..0dd4b25dd 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -197,8 +197,8 @@ def _compute_PI( ) core._shift_insert_at_index(I[thread_idx, j - g], pos, i - g) - # In the case of a self-join, the calculated distances can also be used - # to refine the top-k for all non-trivial subsequences + # In the case of a self-join, the calculated distance profile can also be + # used to refine the top-k for all non-trivial subsequences if excl_zone is not None: # Note that the squared distance, `squared_distance_profile[j]`, # between subsequences `S_i = T[i : i + m]` and `S_j = T[j : j + m]` diff --git a/tests/naive.py b/tests/naive.py index 5b404f50c..f4c39a172 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1457,7 +1457,8 @@ def prescrump(T_A, m, T_B, s, exclusion_zone=None, k=1): P[j - g] = np.insert(P[j - g], pos, d)[:-1] I[j - g] = np.insert(I[j - g], pos, i - g)[:-1] - # self-join only + # In the case of a self-join, the calculated distance profile can also be + # used to refine the top-k for all non-trivial subsequences if exclusion_zone is not None: for idx in np.flatnonzero(distance_profile < P[:, -1]): pos = np.searchsorted(P[idx], distance_profile[idx], side="right") From 5af6ec00b320cec630eb9a61341c851db93bb610 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 25 Jun 2022 22:40:19 -0600 Subject: [PATCH 260/416] Improve comments --- tests/naive.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index f4c39a172..f257dc4a0 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1803,25 +1803,24 @@ def merge_topk_PI(PA, PB, IA, IB): def merge_topk_ρI(ρA, ρB, IA, IB): # this is to merge two pearson profiles `ρA` and `ρB`, where each is a 2D array - # and each row is sorted ascendingly. 
Smaller distance corresponds to larger - # pearson values. Therefore, we want to keep top-k largest values in merging - # row `ρA[i]` and `ρB[i]`. The priority is with `ρA (from right to left)` and - # then `ρB (from right to left)`. + # and each row is sorted ascendingly. we want to keep top-k largest values in + # merging row `ρA[i]` and `ρB[i]`. - # Example: - # note: the prime symbol below is to distinguish two elements with same value + # In case of ties between `ρA` and `ρB`, the priority is with `ρA`. In case + # of ties within `ρA, the priority is with an element with greater index. + # Example + # note: the prime symbol is to distinguish two elements with same value # ρA = [0, 0', 1], and ρB = [0, 1, 1']. # merging outcome: [1_B, 1'_B, 1_A] # Naive Implementation: - # keeping top-k largest with the aforementioned priority rule is the same as - # sorting ascendingly while prioritizing `ρB` (from left to right) over `ρA` - # (from left to right), and then keep the second half of merged array. - - # In our example, it would be like this: - # merging `ρB` and `ρA` (prioritizing smaller values in `ρB`): - # [0_B, 0_A, 0'_A, 1_B, 1'_B, 1_A], and we just need to keep the second half - # of this array (and discard the first half) + # keeping top-k largest with the aforementioned priority rules is the same as + # `merge_topk_PI` but with swapping `ρA` and `ρB` + + # For the same example: + # merging `ρB` and `ρA` ascendingly while choosing `ρB` over `ρA` in case of + # ties: [0_B, 0_A, 0'_A, 1_B, 1'_B, 1_A], and we just need to keep the second + # half of this array, and discard the first half. profile = np.column_stack((ρB, ρA)) indices = np.column_stack((IB, IA)) From 26cec6e1a2ba8d11d562483e16421584cdc2bf5e Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 25 Jun 2022 22:48:51 -0600 Subject: [PATCH 261/416] Improve docstring --- stumpy/core.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 92d4cbde3..74df988b2 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2623,11 +2623,16 @@ def _merge_topk_PI(PA, PB, IA, IB): @njit(parallel=True) def _merge_topk_ρI(ρA, ρB, IA, IB): """ - Merge two top-k pearson profiles ρA and ρB, and update ρA (in place). The priority - is with ρA (from right to left) and then ρB (from right to left). + Merge two top-k pearson profiles `ρA` and `ρB`, and update `ρA` (in place) by + keeping the top-k largest values in merging two `top-k` rows `ρA[i]` and `ρB[i]`, + each sorted ascendingly. + + from right to left of the merged array: In case of ties between `ρA[i]` and + `ρB[i]`, the priority is with `ρA[i]`, and in case of ties within `ρA[i]`, + the priority is with element with greater index. Example: - note: the prime symbol below is to distinguish two elements with same value + note: the prime symbol is to distinguish two elements with same value ρA = [0, 0', 1], and ρB = [0, 1, 1']. 
merging outcome: [1_B, 1'_B, 1_A] From b177c84eccd3d01ed72a7b1c80f1988725342afa Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 25 Jun 2022 22:59:16 -0600 Subject: [PATCH 262/416] Add KNN test function for stumpi --- tests/test_stumpi.py | 172 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 172 insertions(+) diff --git a/tests/test_stumpi.py b/tests/test_stumpi.py index 262d88495..2d00ab525 100644 --- a/tests/test_stumpi.py +++ b/tests/test_stumpi.py @@ -891,3 +891,175 @@ def test_stumpi_profile_index_match(): npt.assert_almost_equal(stream.left_P_, left_P) n += 1 + + +def test_stumpi_self_join_KNN(): + m = 3 + zone = int(np.ceil(m / 4)) + + for k in range(2, 4): + seed = np.random.randint(100000) + np.random.seed(seed) + + T = np.random.rand(30) + stream = stumpi(T, m, egress=False, k=k) + for i in range(34): + t = np.random.rand() + stream.update(t) + + comp_P = stream.P_ + comp_I = stream.I_ + comp_left_P = stream.left_P_ + comp_left_I = stream.left_I_ + + ref_mp = naive.stump(stream.T_, m, exclusion_zone=zone, row_wise=True, k=k) + ref_P = ref_mp[:, 0].reshape(-1, 1) + ref_I = ref_mp[:, 1].reshape(-1, 1) + ref_left_I = ref_mp[:, 2].astype(np.int64) + ref_left_P = np.full_like(ref_left_I, np.inf, dtype=np.float64) + for i, j in enumerate(ref_left_I): + if j >= 0: + D = core.mass(stream.T_[i : i + m], stream.T_[j : j + m]) + ref_left_P[i] = D[0] + + naive.replace_inf(ref_P) + naive.replace_inf(ref_left_P) + naive.replace_inf(comp_P) + naive.replace_inf(comp_left_P) + + npt.assert_almost_equal(ref_P, comp_P) + npt.assert_almost_equal(ref_I, comp_I) + npt.assert_almost_equal(ref_left_P, comp_left_P) + npt.assert_almost_equal(ref_left_I, comp_left_I) + + np.random.seed(seed) + T = np.random.rand(30) + T = pd.Series(T) + stream = stumpi(T, m, egress=False, k=k) + for i in range(34): + t = np.random.rand() + stream.update(t) + + comp_P = stream.P_ + comp_I = stream.I_ + comp_left_P = stream.left_P_ + comp_left_I = stream.left_I_ + + naive.replace_inf(comp_P) + naive.replace_inf(comp_left_P) + + npt.assert_almost_equal(ref_P, comp_P) + npt.assert_almost_equal(ref_I, comp_I) + npt.assert_almost_equal(ref_left_P, comp_left_P) + npt.assert_almost_equal(ref_left_I, comp_left_I) + + +def test_stumpi_self_join_egress_KNN(): + m = 3 + + for k in range(2, 4): + seed = np.random.randint(100000) + np.random.seed(seed) + n = 30 + T = np.random.rand(n) + + ref_mp = naive.stumpi_egress(T, m, k=k) + ref_P = ref_mp.P_.copy() + ref_I = ref_mp.I_ + ref_left_P = ref_mp.left_P_.copy() + ref_left_I = ref_mp.left_I_ + + stream = stumpi(T, m, egress=True, k=k) + + comp_P = stream.P_.copy() + comp_I = stream.I_ + comp_left_P = stream.left_P_.copy() + comp_left_I = stream.left_I_ + + naive.replace_inf(ref_P) + naive.replace_inf(ref_left_P) + naive.replace_inf(comp_P) + naive.replace_inf(comp_left_P) + + npt.assert_almost_equal(ref_P, comp_P) + npt.assert_almost_equal(ref_I, comp_I) + npt.assert_almost_equal(ref_left_P, comp_left_P) + npt.assert_almost_equal(ref_left_I, comp_left_I) + + for i in range(34): + t = np.random.rand() + ref_mp.update(t) + stream.update(t) + + comp_P = stream.P_.copy() + comp_I = stream.I_ + comp_left_P = stream.left_P_.copy() + comp_left_I = stream.left_I_ + + ref_P = ref_mp.P_.copy() + ref_I = ref_mp.I_ + ref_left_P = ref_mp.left_P_.copy() + ref_left_I = ref_mp.left_I_ + + naive.replace_inf(ref_P) + naive.replace_inf(ref_left_P) + naive.replace_inf(comp_P) + naive.replace_inf(comp_left_P) + + npt.assert_almost_equal(ref_P, comp_P) + npt.assert_almost_equal(ref_I, comp_I) + 
npt.assert_almost_equal(ref_left_P, comp_left_P) + npt.assert_almost_equal(ref_left_I, comp_left_I) + + np.random.seed(seed) + T = np.random.rand(n) + T = pd.Series(T) + + ref_mp = naive.stumpi_egress(T, m, k=k) + ref_P = ref_mp.P_.copy() + ref_I = ref_mp.I_ + ref_left_P = ref_mp.left_P_.copy() + ref_left_I = ref_mp.left_I_ + + stream = stumpi(T, m, egress=True, k=k) + + comp_P = stream.P_.copy() + comp_I = stream.I_ + comp_left_P = stream.left_P_.copy() + comp_left_I = stream.left_I_ + + naive.replace_inf(ref_P) + naive.replace_inf(ref_left_P) + naive.replace_inf(comp_P) + naive.replace_inf(comp_left_P) + + npt.assert_almost_equal(ref_P, comp_P) + npt.assert_almost_equal(ref_I, comp_I) + npt.assert_almost_equal(ref_left_P, comp_left_P) + npt.assert_almost_equal(ref_left_I, comp_left_I) + + for i in range(34): + t = np.random.rand() + t = np.random.rand() + ref_mp.update(t) + stream.update(t) + + comp_P = stream.P_.copy() + comp_I = stream.I_ + comp_left_P = stream.left_P_.copy() + comp_left_I = stream.left_I_ + + ref_P = ref_mp.P_.copy() + ref_I = ref_mp.I_ + ref_left_P = ref_mp.left_P_.copy() + ref_left_I = ref_mp.left_I_ + + naive.replace_inf(ref_P) + naive.replace_inf(ref_left_P) + naive.replace_inf(comp_P) + naive.replace_inf(comp_left_P) + + npt.assert_almost_equal(ref_P, comp_P) + npt.assert_almost_equal(ref_I, comp_I) + npt.assert_almost_equal(ref_left_P, comp_left_P) + npt.assert_almost_equal(ref_left_I, comp_left_I) From c5d23452b08e76227a050dfc0cdf5632472706b8 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 25 Jun 2022 23:06:20 -0600 Subject: [PATCH 263/416] Fix shape of output for KNN test --- tests/test_stumpi.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_stumpi.py b/tests/test_stumpi.py index 2d00ab525..13c4a6ee4 100644 --- a/tests/test_stumpi.py +++ b/tests/test_stumpi.py @@ -913,9 +913,9 @@ def test_stumpi_self_join_KNN(): comp_left_I = stream.left_I_ ref_mp = naive.stump(stream.T_, m, exclusion_zone=zone, row_wise=True, k=k) - ref_P = ref_mp[:, 0].reshape(-1, 1) - ref_I = ref_mp[:, 1].reshape(-1, 1) - ref_left_I = ref_mp[:, 2].astype(np.int64) + ref_P = ref_mp[:, :k] + ref_I = ref_mp[:, k : 2 * k] + ref_left_I = ref_mp[:, 2 * k].astype(np.int64) ref_left_P = np.full_like(ref_left_I, np.inf, dtype=np.float64) for i, j in enumerate(ref_left_I): if j >= 0: From cdc11c8a194b290766ed0cb9e8a0a7c266e0c8d6 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 26 Jun 2022 07:09:48 -0600 Subject: [PATCH 264/416] Full test and coverage 1 hr From fa7fa4a620727201652565cc8b366cc696e5d071 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 26 Jun 2022 23:30:55 -0600 Subject: [PATCH 265/416] Avoid using searchsort when k is 1 --- stumpy/gpu_stump.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index ecd8434b9..8a13686d8 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -284,13 +284,17 @@ def _compute_and_update_PI_kernel( indices_R[j] = i if p_norm < profile[j, -1]: - idx = _gpu_searchsorted_right(profile[j], p_norm, bfs, nlevel) - for g in range(k - 1, idx, -1): - profile[j, g] = profile[j, g - 1] - indices[j, g] = indices[j, g - 1] + if k == 1: + profile[j, 0] = p_norm + indices[j, 0] = i + else: + idx = _gpu_searchsorted_right(profile[j], p_norm, bfs, nlevel) + for g in range(k - 1, idx, -1): + profile[j, g] = profile[j, g - 1] + indices[j, g] = indices[j, g - 1] - profile[j, idx] = p_norm - indices[j, idx] = i + profile[j, idx] = p_norm + indices[j, idx] 
= i def _gpu_stump( From d41a2e96dd062daedadee82c78b28f5d14ebcb86 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 28 Jun 2022 22:27:02 -0600 Subject: [PATCH 266/416] Revise code according to top k matrix profile structure --- stumpy/stumpi.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/stumpy/stumpi.py b/stumpy/stumpi.py index 0dc4a039c..619ae8f72 100644 --- a/stumpy/stumpi.py +++ b/stumpy/stumpi.py @@ -143,15 +143,15 @@ def __init__(self, T, m, egress=True, normalize=True, p=2.0, k=1): self._T, self._M_T, self._Σ_T = core.preprocess(self._T, self._m) # Retrieve the left matrix profile values - # Since each matrix profile value is the minimum between the left and right - # matrix profile values, we can save time by re-computing only the left matrix - # profile value when the matrix profile index is equal to the right matrix - # profile index. - mask = self._left_I == self._I - self._left_P[mask] = self._P[mask] + # Since each (top-1) matrix profile value is the minimum between the left + # and right matrix profile values, we can save time by re-computing only + # the left matrix profile value when the (top-1) matrix profile index is + # equal to the right matrix profile index. + mask = self._left_I == self._I[:, 0] + self._left_P[mask] = self._P[mask, 0] # Only re-compute the `i`-th left matrix profile value, `self._left_P[i]`, - # when `self._I[i] != self._left_I[i]` + # when `self._I[i, 0] != self._left_I[i]` for i in np.flatnonzero(self._left_I >= 0 & ~mask): j = self._left_I[i] QT = np.dot(self._T[i : i + self._m], self._T[j : j + self._m]) From 38f4c1def82fa62a5004d82192be1ea6425ad328 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 30 Jun 2022 00:15:54 -0600 Subject: [PATCH 267/416] Remove if condition --- stumpy/gpu_stump.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index 8a13686d8..ecd8434b9 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -284,17 +284,13 @@ def _compute_and_update_PI_kernel( indices_R[j] = i if p_norm < profile[j, -1]: - if k == 1: - profile[j, 0] = p_norm - indices[j, 0] = i - else: - idx = _gpu_searchsorted_right(profile[j], p_norm, bfs, nlevel) - for g in range(k - 1, idx, -1): - profile[j, g] = profile[j, g - 1] - indices[j, g] = indices[j, g - 1] + idx = _gpu_searchsorted_right(profile[j], p_norm, bfs, nlevel) + for g in range(k - 1, idx, -1): + profile[j, g] = profile[j, g - 1] + indices[j, g] = indices[j, g - 1] - profile[j, idx] = p_norm - indices[j, idx] = i + profile[j, idx] = p_norm + indices[j, idx] = i def _gpu_stump( From e4fd875e8560b8099cb6c5d227b0e63f21844570 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 30 Jun 2022 23:05:50 -0600 Subject: [PATCH 268/416] Improve dosctrings --- stumpy/core.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 74df988b2..26949a52a 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2571,6 +2571,12 @@ def _merge_topk_PI(PA, PB, IA, IB): Unlike `_merge_topk_ρI`, where `top-k` largest values are kept, this function keeps `top-k` smallest values. + `PA` and `PB` are 2D arrays, with each row sorted ascendingly. To update `PA[i]`, + the array `PB[i]` is traversed forward from index `0` to its last index, and + will update `PA[i]` if its element is smaller than `PA[i, -1]`, i.e. the greatest + value in `PA[i]`. 
In case of tied value `v`, it will be inserted to the right side + of the greatest index in `PA[i]` whose value is `v`. + Parameters ---------- PA : numpy.ndarray @@ -2627,18 +2633,15 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): keeping the top-k largest values in merging two `top-k` rows `ρA[i]` and `ρB[i]`, each sorted ascendingly. - from right to left of the merged array: In case of ties between `ρA[i]` and - `ρB[i]`, the priority is with `ρA[i]`, and in case of ties within `ρA[i]`, - the priority is with element with greater index. - - Example: - note: the prime symbol is to distinguish two elements with same value - ρA = [0, 0', 1], and ρB = [0, 1, 1']. - merging outcome: [1_B, 1'_B, 1_A] - Unlike `_merge_topk_PI`, where `top-k` smallest values are kept, this function keeps `top-k` largest values. + `ρA` and `ρB` are 2D arrays, with each row sorted ascendingly. To update `ρA[i]`, + the array `ρB[i]` is traversed backward from its last index to index 0, and will + update `ρA[i]` if its element is greater than `ρA[i, 0]`, i.e. the smallest value + in `ρA[i]`. In case of tied value `v`, it will be inserted to the left side of the + lowest index in `ρA[i]` whose value is `v`. + Parameters ---------- ρA : numpy.ndarray @@ -2705,8 +2708,8 @@ def _shift_insert_at_index(a, idx, v, shift="right"): idx: int The index at which the value `v` should be inserted. This can be any - integer number from `0` to `len(a)`. When `idx=0` and `shift="right"`, - OR when `idx=len(a)` and `shift != "right"`, then no change will occur on + integer number from `0` to `len(a)`. When `idx=len(a)` and `shift="right"`, + OR when `idx=0` and `shift != "right"`, then no change will occur on the input array `a`. v: float From 0afb3ec2903e3616683897bb7e7b3f7b2091a507 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 30 Jun 2022 23:07:04 -0600 Subject: [PATCH 269/416] Avoid allocating new memory --- stumpy/scrump.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 0dd4b25dd..a28a99f34 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -743,13 +743,13 @@ def update(self): # update left matrix profile and indices cond = PL < self._PL - self._PL = np.where(cond, PL, self._PL) - self._IL = np.where(cond, IL, self._IL) + self._PL[:] = np.where(cond, PL, self._PL) + self._IL[:] = np.where(cond, IL, self._IL) # update right matrix profile and indices cond = PR < self._PR - self._PR = np.where(cond, PR, self._PR) - self._IR = np.where(cond, IR, self._IR) + self._PR[:] = np.where(cond, PR, self._PR) + self._IR[:] = np.where(cond, IR, self._IR) self._chunk_idx += 1 From 13da458fd22e060c778417ba41ea9fd03a492389 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 30 Jun 2022 23:19:44 -0600 Subject: [PATCH 270/416] Avoid allocating new memory --- stumpy/gpu_stump.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index ecd8434b9..f3e03348c 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -528,9 +528,9 @@ def _gpu_stump( indices_L = device_indices_L.copy_to_host() indices_R = device_indices_R.copy_to_host() - profile = np.sqrt(profile) - profile_L = np.sqrt(profile_L) - profile_R = np.sqrt(profile_R) + profile[:, :] = np.sqrt(profile) + profile_L[:] = np.sqrt(profile_L) + profile_R[:] = np.sqrt(profile_R) profile_fname = core.array_to_temp_file(profile) profile_L_fname = core.array_to_temp_file(profile_L) From 93b5708ed05cceeef962ff6ebc19f31dde4070dc Mon Sep 17 00:00:00 2001 From: ninimama 
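(A naive NumPy-only reference for the row-wise top-k merge described above: concatenate the two ascending rows, stable-sort, and keep the k smallest so that ties favor the first argument; this is only an illustrative sketch, not the njit `_merge_topk_PI` routine itself.)

    import numpy as np

    def merge_topk_rows(PA, PB, IA, IB, k):
        # Row-wise: keep the k smallest values (and their indices) from the two
        # ascending rows; the stable sort lets PA win ties because it comes first
        P = np.column_stack((PA, PB))
        I = np.column_stack((IA, IB))
        order = np.argsort(P, axis=1, kind="stable")[:, :k]
        rows = np.arange(P.shape[0])[:, None]
        return P[rows, order], I[rows, order]

    PA = np.array([[0.0, 1.0, 2.0]])
    IA = np.array([[10, 11, 12]])
    PB = np.array([[0.5, 1.0, 3.0]])
    IB = np.array([[20, 21, 22]])
    P, I = merge_topk_rows(PA, PB, IA, IB, k=3)
    print(P)  # [[0.  0.5 1. ]]
    print(I)  # [[10 20 11]]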
Date: Wed, 6 Jul 2022 01:31:19 -0600 Subject: [PATCH 271/416] Improve comments --- stumpy/arimp_stump.py | 98 +++++++++++++++++++++++++++++++++++++++++++ stumpy/gpu_stump.py | 2 +- stumpy/scrump.py | 4 +- stumpy/stump.py | 8 ++-- stumpy/stumpi.py | 13 +++--- 5 files changed, 111 insertions(+), 14 deletions(-) create mode 100644 stumpy/arimp_stump.py diff --git a/stumpy/arimp_stump.py b/stumpy/arimp_stump.py new file mode 100644 index 000000000..6951bd5bd --- /dev/null +++ b/stumpy/arimp_stump.py @@ -0,0 +1,98 @@ +# naive +def arimp_naive(T_A, m, exclusion_zone=None, row_wise=False): + """ + Traverse distance matrix diagonally and update the matrix profile and + matrix profile indices if the parameter `row_wise` is set to `False`. + If the parameter `row_wise` is set to `True`, it is a row-wise traversal. + """ + + distance_matrix = np.array( + [distance_profile(Q, T_A, m) for Q in core.rolling_window(T_A, m)] + ) + T_B = T_A.copy() + + distance_matrix[np.isnan(distance_matrix)] = np.inf + + n_A = T_A.shape[0] + n_B = T_B.shape[0] + l = n_A - m + 1 + if exclusion_zone is None: + exclusion_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) + + SL = list([np.inf] for _ in range(l)) + SLI = list([-1] for _ in range(l)) + + SR = list([np.inf] for _ in range(l)) + ISR = list([-1] for _ in range(l)) + + RL = list([np.inf] for _ in range(l)) + RLI = list([-1] for _ in range(l)) + + LR = list([np.inf] for _ in range(l)) + LRI = list([-1] for _ in range(l)) + + if row_wise: + for i in range(l): + apply_exclusion_zone(distance_matrix[i], i, exclusion_zone, np.inf) + + for i, D in enumerate(distance_matrix): + # self-join / AB-join: matrix proifle and indices + idx = np.argmin(D) + P[i, 0] = D[idx] + if P[i, 0] == np.inf: + idx = -1 + I[i, 0] = idx + + # self-join: left matrix profile + if ignore_trivial and i > 0: + IL = np.argmin(D[:i]) + if D[IL] == np.inf: + IL = -1 + I[i, 1] = IL + + # self-join: right matrix profile + if ignore_trivial and i < D.shape[0]: + IR = i + np.argmin(D[i:]) # shift argmin by `i` to get true index + if D[IR] == np.inf: + IR = -1 + I[i, 2] = IR + + else: # diagonal traversal + if ignore_trivial: + diags = np.arange(exclusion_zone + 1, n_A - m + 1) + else: + diags = np.arange(-(n_A - m + 1) + 1, n_B - m + 1) + + for k in diags: + if k >= 0: + iter_range = range(0, min(n_A - m + 1, n_B - m + 1 - k)) + else: + iter_range = range(-k, min(n_A - m + 1, n_B - m + 1 - k)) + + for i in iter_range: + D = distance_matrix[i, i + k] + if D < P[i, 0]: + P[i, 0] = D + I[i, 0] = i + k + + if ignore_trivial: # Self-joins only + if D < P[i + k, 0]: + P[i + k, 0] = D + I[i + k, 0] = i + + if i < i + k: + # Left matrix profile and left matrix profile index + if D < P[i + k, 1]: + P[i + k, 1] = D + I[i + k, 1] = i + + if D < P[i, 2]: + # right matrix profile and right matrix profile index + P[i, 2] = D + I[i, 2] = i + k + + result = np.empty((l, 4), dtype=object) + result[:, 0] = P[:, 0] + result[:, 1:4] = I[:, :] + + return result diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index f3e03348c..e236d3b05 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -432,7 +432,7 @@ def _gpu_stump( device_bfs = cuda.to_device(core._bfs_indices(k, fill_value=-1)) nlevel = np.floor(np.log2(k) + 1).astype(np.int64) - # number of levels in binary seearch tree from which `bfs` is constructed. + # number of levels in binary search tree from which `bfs` is constructed. 
with cuda.gpus[device_id]: device_T_A = cuda.to_device(T_A) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index a28a99f34..fced2d043 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -758,7 +758,7 @@ def P_(self): """ Get the updated (top-k) matrix profile. When k=1 (default), the first (and only) column in this 2D array consists of the matrix profile. When k > 1, the output - has exactly k columns consist of the top-k matrix profile. + has exactly k columns consisting of the top-k matrix profile. """ return self._P.astype(np.float64) @@ -767,7 +767,7 @@ def I_(self): """ Get the updated (top-k) matrix profile indices. When k=1 (default), the first (and only) column in this 2D array consists of the matrix profile - indices. When k > 1, the output has exactly k columns consist of the top-k + indices. When k > 1, the output has exactly k columns consisting of the top-k matrix profile indices. """ return self._I.astype(np.int64) diff --git a/stumpy/stump.py b/stumpy/stump.py index 59be1e61d..ed726bdd6 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -212,10 +212,10 @@ def _compute_diagonal( if T_B_subseq_isconstant[i + g] and T_A_subseq_isconstant[i]: pearson = 1.0 - # ρ[thread_idx, i, :] is sorted ascendingly. It MUST be updated - # when the newly-calculated pearson value becomes greater than the - # first (i.e. smallest) element of this array. (Reminder: higher - # pearson value means lower distance, which is of our interest) + # `ρ[thread_idx, i, :]` is sorted ascendingly and MUST be updated + # when the newly-calculated `pearson` value becomes greater than the + # first (i.e. smallest) element in this array. Note that a higher + # pearson value corresponds to a lower distance. if pearson > ρ[thread_idx, i, 0]: pos = np.searchsorted(ρ[thread_idx, i], pearson) core._shift_insert_at_index( diff --git a/stumpy/stumpi.py b/stumpy/stumpi.py index 619ae8f72..8c0f0335a 100644 --- a/stumpy/stumpi.py +++ b/stumpy/stumpi.py @@ -256,16 +256,18 @@ def _update_egress(self, t): ) # D.shape[0] is base-1 - # O(Nlog(K)) time complexity + # Calculate the (top-k) matrix profile values/indices for the last subsequence + # by using its corresponding distance profile `D` self._P[-1] = np.inf self._I[-1] = -1 for i, d in enumerate(D): - if d < self._P[-1, -1]: # mean last index, maximum value (k-th value) + if d < self._P[-1, -1]: pos = np.searchsorted(self._P[-1], d, side="right") core._shift_insert_at_index(self._P[-1], pos, d) core._shift_insert_at_index(self._I[-1], pos, i + self._n_appended) - # for last index, the left matrix profile is basically `self._P[-1, 0]` + # All neighbors of the last subsequence are on its left. So, its matrix profile + # value/index and its left matrix profile value/index must be equal.
self._left_P[-1] = self._P[-1, 0] self._left_I[-1] = self._I[-1, 0] @@ -318,12 +320,9 @@ def _update(self, t): core._shift_insert_at_index(self._I[i], pos, l) # Calculating top-k and left matrix profile for new subsequence whose - # distance profie is D - - # O(Nlog(K)) time complexity for obtaining top-k + # distance profie is `D` P_new = np.full(self._k, np.inf, dtype=np.float64) I_new = np.full(self._k, -1, dtype=np.int64) - for i, d in enumerate(D): if d < P_new[-1]: # maximum value in sorted array P_new pos = np.searchsorted(P_new, d, side="right") From 114c0cca27db0dd0db1aa51c5377363e4e54bd56 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 6 Jul 2022 02:09:18 -0600 Subject: [PATCH 272/416] Remove numpy.where to avoid copying unchanged values --- stumpy/gpu_stump.py | 12 ++++++------ stumpy/scrump.py | 12 ++++++------ stumpy/stump.py | 12 ++++++------ stumpy/stumped.py | 12 ++++++------ 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py index e236d3b05..99343db66 100644 --- a/stumpy/gpu_stump.py +++ b/stumpy/gpu_stump.py @@ -854,14 +854,14 @@ def gpu_stump( core._merge_topk_PI(profile[0], profile[i], indices[0], indices[i]) # Update (top-1) left matrix profile and matrix profile indices - cond = profile_L[0] < profile_L[i] - profile_L[0] = np.where(cond, profile_L[0], profile_L[i]) - indices_L[0] = np.where(cond, indices_L[0], indices_L[i]) + mask = profile_L[0] < profile_L[i] + profile_L[0][mask] = profile_L[i][mask] + indices_L[0][mask] = indices_L[i][mask] # Update (top-1) right matrix profile and matrix profile indices - cond = profile_R[0] < profile_R[i] - profile_R[0] = np.where(cond, profile_R[0], profile_R[i]) - indices_R[0] = np.where(cond, indices_R[0], indices_R[i]) + mask = profile_R[0] < profile_R[i] + profile_R[0][mask] = profile_R[i][mask] + indices_R[0][mask] = indices_R[i][mask] out = np.empty((w, 2 * k + 2), dtype=object) # last two columns are to store # (top-1) left/right matrix profile indices diff --git a/stumpy/scrump.py b/stumpy/scrump.py index fced2d043..3f535b11e 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -742,14 +742,14 @@ def update(self): core._merge_topk_PI(self._P, P, self._I, I) # update left matrix profile and indices - cond = PL < self._PL - self._PL[:] = np.where(cond, PL, self._PL) - self._IL[:] = np.where(cond, IL, self._IL) + mask = PL < self._PL + self._PL[mask] = PL[mask] + self._IL[mask] = IL[mask] # update right matrix profile and indices - cond = PR < self._PR - self._PR[:] = np.where(cond, PR, self._PR) - self._IR[:] = np.where(cond, IR, self._IR) + mask = PR < self._PR + self._PR[mask] = PR[mask] + self._IR[mask] = IR[mask] self._chunk_idx += 1 diff --git a/stumpy/stump.py b/stumpy/stump.py index ed726bdd6..8f035c268 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -475,14 +475,14 @@ def _stump( core._merge_topk_ρI(ρ[0], ρ[thread_idx], I[0], I[thread_idx]) # update left matrix profile and matrix profile indices - cond = ρL[0] < ρL[thread_idx] - ρL[0] = np.where(cond, ρL[thread_idx], ρL[0]) - IL[0] = np.where(cond, IL[thread_idx], IL[0]) + mask = ρL[0] < ρL[thread_idx] + ρL[0, mask] = ρL[thread_idx, mask] + IL[0, mask] = IL[thread_idx, mask] # update right matrix profile and matrix profile indices - cond = ρR[0] < ρR[thread_idx] - ρR[0] = np.where(cond, ρR[thread_idx], ρR[0]) - IR[0] = np.where(cond, IR[thread_idx], IR[0]) + mask = ρR[0] < ρR[thread_idx] + ρR[0, mask] = ρR[thread_idx, mask] + IR[0, mask] = IR[thread_idx, mask] # Reverse top-k rho (and its associated 
I) to be in descending order and # then convert from Pearson correlations to Euclidean distances (ascending order) diff --git a/stumpy/stumped.py b/stumpy/stumped.py index 1246bbb2c..d991c1304 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -273,14 +273,14 @@ def stumped( core._merge_topk_PI(profile, P, indices, I) # Update top-1 left matrix profile and matrix profile index - cond = PL < profile_L - profile_L[:] = np.where(cond, PL, profile_L) - indices_L[:] = np.where(cond, IL, indices_L) + mask = PL < profile_L + profile_L[mask] = PL[mask] + indices_L[mask] = IL[mask] # Update top-1 right matrix profile and matrix profile index - cond = PR < profile_R - profile_R[:] = np.where(cond, PR, profile_R) - indices_R[:] = np.where(cond, IR, indices_R) + mask = PR < profile_R + profile_R[mask] = PR[mask] + indices_R[mask] = IR[mask] out = np.empty((l, 2 * k + 2), dtype=object) out[:, :k] = profile From 719aefd2ed15c4cdb6d8f97aae87ebaeaa401a31 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 6 Jul 2022 02:13:23 -0600 Subject: [PATCH 273/416] Remove unnecessary trailing colon --- stumpy/stumpi.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stumpy/stumpi.py b/stumpy/stumpi.py index 8c0f0335a..3a24cad92 100644 --- a/stumpy/stumpi.py +++ b/stumpy/stumpi.py @@ -210,8 +210,8 @@ def _update_egress(self, t): t_drop = self._T[l - 1] self._T_isfinite[:-1] = self._T_isfinite[1:] - self._I[:-1, :] = self._I[1:, :] - self._P[:-1, :] = self._P[1:, :] + self._I[:-1] = self._I[1:] + self._P[:-1] = self._P[1:] self._left_I[:-1] = self._left_I[1:] self._left_P[:-1] = self._left_P[1:] From 528bf12f0f322e44688cf7c90b984e056a097774 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 6 Jul 2022 02:15:15 -0600 Subject: [PATCH 274/416] Replace negative np.inf with np.NINF --- stumpy/stump.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 8f035c268..4c025db25 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -407,13 +407,13 @@ def _stump( l = n_A - m + 1 n_threads = numba.config.NUMBA_NUM_THREADS - ρ = np.full((n_threads, l, k), -np.inf, dtype=np.float64) + ρ = np.full((n_threads, l, k), np.NINF, dtype=np.float64) I = np.full((n_threads, l, k), -1, dtype=np.int64) - ρL = np.full((n_threads, l), -np.inf, dtype=np.float64) + ρL = np.full((n_threads, l), np.NINF, dtype=np.float64) IL = np.full((n_threads, l), -1, dtype=np.int64) - ρR = np.full((n_threads, l), -np.inf, dtype=np.float64) + ρR = np.full((n_threads, l), np.NINF, dtype=np.float64) IR = np.full((n_threads, l), -1, dtype=np.int64) ndist_counts = core._count_diagonal_ndist(diags, m, n_A, n_B) From ce58a5908778a46996b3a3547bb16aa99fef2d62 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 6 Jul 2022 02:39:49 -0600 Subject: [PATCH 275/416] delete a wrong file --- stumpy/arimp_stump.py | 98 ------------------------------------------- 1 file changed, 98 deletions(-) delete mode 100644 stumpy/arimp_stump.py diff --git a/stumpy/arimp_stump.py b/stumpy/arimp_stump.py deleted file mode 100644 index 6951bd5bd..000000000 --- a/stumpy/arimp_stump.py +++ /dev/null @@ -1,98 +0,0 @@ -# naive -def arimp_naive(T_A, m, exclusion_zone=None, row_wise=False): - """ - Traverse distance matrix diagonally and update the matrix profile and - matrix profile indices if the parameter `row_wise` is set to `False`. - If the parameter `row_wise` is set to `True`, it is a row-wise traversal. 
- """ - - distance_matrix = np.array( - [distance_profile(Q, T_A, m) for Q in core.rolling_window(T_A, m)] - ) - T_B = T_A.copy() - - distance_matrix[np.isnan(distance_matrix)] = np.inf - - n_A = T_A.shape[0] - n_B = T_B.shape[0] - l = n_A - m + 1 - if exclusion_zone is None: - exclusion_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) - - SL = list([np.inf] for _ in range(l)) - SLI = list([-1] for _ in range(l)) - - SR = list([np.inf] for _ in range(l)) - ISR = list([-1] for _ in range(l)) - - RL = list([np.inf] for _ in range(l)) - RLI = list([-1] for _ in range(l)) - - LR = list([np.inf] for _ in range(l)) - LRI = list([-1] for _ in range(l)) - - if row_wise: - for i in range(l): - apply_exclusion_zone(distance_matrix[i], i, exclusion_zone, np.inf) - - for i, D in enumerate(distance_matrix): - # self-join / AB-join: matrix proifle and indices - idx = np.argmin(D) - P[i, 0] = D[idx] - if P[i, 0] == np.inf: - idx = -1 - I[i, 0] = idx - - # self-join: left matrix profile - if ignore_trivial and i > 0: - IL = np.argmin(D[:i]) - if D[IL] == np.inf: - IL = -1 - I[i, 1] = IL - - # self-join: right matrix profile - if ignore_trivial and i < D.shape[0]: - IR = i + np.argmin(D[i:]) # shift argmin by `i` to get true index - if D[IR] == np.inf: - IR = -1 - I[i, 2] = IR - - else: # diagonal traversal - if ignore_trivial: - diags = np.arange(exclusion_zone + 1, n_A - m + 1) - else: - diags = np.arange(-(n_A - m + 1) + 1, n_B - m + 1) - - for k in diags: - if k >= 0: - iter_range = range(0, min(n_A - m + 1, n_B - m + 1 - k)) - else: - iter_range = range(-k, min(n_A - m + 1, n_B - m + 1 - k)) - - for i in iter_range: - D = distance_matrix[i, i + k] - if D < P[i, 0]: - P[i, 0] = D - I[i, 0] = i + k - - if ignore_trivial: # Self-joins only - if D < P[i + k, 0]: - P[i + k, 0] = D - I[i + k, 0] = i - - if i < i + k: - # Left matrix profile and left matrix profile index - if D < P[i + k, 1]: - P[i + k, 1] = D - I[i + k, 1] = i - - if D < P[i, 2]: - # right matrix profile and right matrix profile index - P[i, 2] = D - I[i, 2] = i + k - - result = np.empty((l, 4), dtype=object) - result[:, 0] = P[:, 0] - result[:, 1:4] = I[:, :] - - return result From ba4986b6dd8c2b715773a508ae3ff5a7e5e5fd88 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 6 Jul 2022 02:54:56 -0600 Subject: [PATCH 276/416] Avoid advance indexing by using chain slicing so it can be run by njit --- stumpy/stump.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 4c025db25..d64fd3532 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -476,13 +476,13 @@ def _stump( # update left matrix profile and matrix profile indices mask = ρL[0] < ρL[thread_idx] - ρL[0, mask] = ρL[thread_idx, mask] - IL[0, mask] = IL[thread_idx, mask] + ρL[0][mask] = ρL[thread_idx][mask] + IL[0][mask] = IL[thread_idx][mask] # update right matrix profile and matrix profile indices mask = ρR[0] < ρR[thread_idx] - ρR[0, mask] = ρR[thread_idx, mask] - IR[0, mask] = IR[thread_idx, mask] + ρR[0][mask] = ρR[thread_idx][mask] + IR[0][mask] = IR[thread_idx][mask] # Reverse top-k rho (and its associated I) to be in descending order and # then convert from Pearson correlations to Euclidean distances (ascending order) From 2f0f53c013b63d9d67b36a3e6f7a8a8c9df984b0 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 6 Jul 2022 03:44:33 -0600 Subject: [PATCH 277/416] Improve docstring --- stumpy/stump.py | 3 ++- stumpy/stumped.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/stumpy/stump.py 
b/stumpy/stump.py index d64fd3532..a21454aaa 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -548,7 +548,8 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1): k : int, default 1 The number of top `k` smallest distances used to construct the matrix profile. Note that this will increase the total computational time and memory usage - when k > 1. + when k > 1. If you have access to a GPU device, then you may be able to + leverage `gpu_stump` for better performance and scalability. Returns ------- diff --git a/stumpy/stumped.py b/stumpy/stumped.py index d991c1304..299f5c8a2 100644 --- a/stumpy/stumped.py +++ b/stumpy/stumped.py @@ -59,7 +59,8 @@ def stumped( k : int, default 1 The number of top `k` smallest distances used to construct the matrix profile. Note that this will increase the total computational time and memory usage - when k > 1. + when k > 1. If you have access to a GPU device, then you may be able to + leverage `gpu_stump` for better performance and scalability. Returns ------- From 6b49de867ae146c501bb9926c4cc79c8ebaf76d3 Mon Sep 17 00:00:00 2001 From: Sean Law Date: Wed, 6 Jul 2022 12:51:27 -0400 Subject: [PATCH 278/416] Added gpu_searchsorted checks when GPUs unavailable --- stumpy/core.py | 14 ++++++++++++++ test.sh | 4 +++- tests/test_gpu_stump.py | 11 ++++++++++- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 26949a52a..fe231cd06 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -209,6 +209,20 @@ def _gpu_aamp_stimp_driver_not_found(*args, **kwargs): # pragma: no cover driver_not_found() +def _gpu_searchsorted_left_driver_not_found(*args, **kwargs): # pragma: no cover + """ + Dummy function to raise CudaSupportError driver not found error. + """ + driver_not_found() + + +def _gpu_searchsorted_right_driver_not_found(*args, **kwargs): # pragma: no cover + """ + Dummy function to raise CudaSupportError driver not found error. + """ + driver_not_found() + + def get_pkg_name(): # pragma: no cover """ Return package name. diff --git a/test.sh b/test.sh index 2db67e061..5fd98468c 100755 --- a/test.sh +++ b/test.sh @@ -32,7 +32,7 @@ done check_errs() { # Function. Parameter 1 is the return code - if [[ $1 -ne "0" ]]; then + if [[ $1 -ne "0" && $1 -ne "5" ]]; then echo "Error: pytest encountered exit code $1" # as a bonus, make our script exit with the right error code. exit $1 @@ -119,6 +119,7 @@ test_unit() { echo "Testing Numba JIT Compiled Functions" pytest -rsx -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_gpu_stump.py + check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_core.py check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_config.py @@ -148,6 +149,7 @@ test_unit() check_errs $? # aamp pytest -rsx -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_gpu_aamp.py + check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_aamp.py tests/test_maamp.py tests/test_scraamp.py tests/test_aampi.py check_errs $? 
pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_scraamp.py diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index 4d3093c99..3f86ab03f 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -3,10 +3,19 @@ import numpy.testing as npt import pandas as pd from stumpy import core, gpu_stump -from stumpy.gpu_stump import _gpu_searchsorted_left, _gpu_searchsorted_right from stumpy import config from numba import cuda +if cuda.is_available(): + from stumpy.gpu_stump import _gpu_searchsorted_left, _gpu_searchsorted_right +else: # pragma: no cover + from stumpy.core import ( + _gpu_searchsorted_left_driver_not_found as _gpu_searchsorted_left, + ) + from stumpy.core import ( + _gpu_searchsorted_right_driver_not_found as _gpu_searchsorted_right, + ) + try: from numba.errors import NumbaPerformanceWarning except ModuleNotFoundError: From 0d1e482e6748dbe1fc23f07911a5ab3e19495a3d Mon Sep 17 00:00:00 2001 From: Sean Law Date: Wed, 6 Jul 2022 13:40:35 -0400 Subject: [PATCH 279/416] Added error checks and pytest ignore warning --- test.sh | 6 ++++++ tests/test_non_normalized_decorator.py | 1 + 2 files changed, 7 insertions(+) diff --git a/test.sh b/test.sh index 5fd98468c..373e03f17 100755 --- a/test.sh +++ b/test.sh @@ -135,6 +135,7 @@ test_unit() pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_ostinato.py check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_gpu_ostinato.py + check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_mpdist.py check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_motifs.py @@ -142,9 +143,11 @@ test_unit() pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_mmotifs.py check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_gpu_mpdist.py + check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_snippets.py check_errs $? pytest -rsx -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_gpu_stimp.py + check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_stimp.py check_errs $? # aamp @@ -161,6 +164,7 @@ test_unit() pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_aamp_ostinato.py check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_gpu_aamp_ostinato.py + check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_aampdist.py check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_aamp_motifs.py @@ -168,9 +172,11 @@ test_unit() pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_aamp_mmotifs.py check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_gpu_aampdist.py + check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_aampdist_snippets.py check_errs $? pytest -rsx -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_gpu_aamp_stimp.py + check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_aamp_stimp.py check_errs $? 
pytest -x -W ignore::DeprecationWarning tests/test_non_normalized_decorator.py diff --git a/tests/test_non_normalized_decorator.py b/tests/test_non_normalized_decorator.py index 2c6447d09..8da9354c8 100644 --- a/tests/test_non_normalized_decorator.py +++ b/tests/test_non_normalized_decorator.py @@ -340,6 +340,7 @@ def test_mmotifs(T, m): npt.assert_almost_equal(ref_distances, cmp_distances) +@pytest.mark.filterwarnings("ignore:All-NaN slice encountered") def test_snippets(): T = np.random.rand(64) m = 10 From f72ca7a6f71b0c41aa87893bc66faac6cf4af3ab Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 7 Jul 2022 13:20:21 -0600 Subject: [PATCH 280/416] Improve docstrings --- stumpy/core.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index fe231cd06..4e87ffe68 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2587,9 +2587,9 @@ def _merge_topk_PI(PA, PB, IA, IB): `PA` and `PB` are 2D arrays, with each row sorted ascendingly. To update `PA[i]`, the array `PB[i]` is traversed forward from index `0` to its last index, and - will update `PA[i]` if its element is smaller than `PA[i, -1]`, i.e. the greatest - value in `PA[i]`. In case of tied value `v`, it will be inserted to the right side - of the greatest index in `PA[i]` whose value is `v`. + if its element is smaller than `PA[i, -1]`, i.e. the greatest value in `PA[i]`, + then `PA[i]` will be updatd. In case of tied value `v`, it will be inserted to + the right side of the greatest index in `PA[i]` whose value is `v`. Parameters ---------- @@ -2651,10 +2651,10 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): keeps `top-k` largest values. `ρA` and `ρB` are 2D arrays, with each row sorted ascendingly. To update `ρA[i]`, - the array `ρB[i]` is traversed backward from its last index to index 0, and will - update `ρA[i]` if its element is greater than `ρA[i, 0]`, i.e. the smallest value - in `ρA[i]`. In case of tied value `v`, it will be inserted to the left side of the - lowest index in `ρA[i]` whose value is `v`. + the array `ρB[i]` is traversed backward from its last index to index 0, and if + its element is greater than `ρA[i, 0]`, i.e. the smallest value in `ρA[i]`, then + `ρA[i]` will be updated. In case of tied value `v`, it will be inserted to the + left side of the lowest index in `ρA[i]` whose value is `v`. Parameters ---------- From af40906f0f761b18337e38797c89c68ec656cd7d Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 7 Jul 2022 13:34:30 -0600 Subject: [PATCH 281/416] minor changes in if-block and dosctring --- stumpy/core.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 4e87ffe68..2ca026ee9 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2709,11 +2709,14 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): @njit def _shift_insert_at_index(a, idx, v, shift="right"): """ - If `shift=right`, all elements in `a[idx:]` are shifted to the right by one element - and the last element is discarded. If `shift=left` (or any string value other - than "right") all elements in `a[:idx]` are shifted to the left by one element - and the first element is discarded. In both cases, the length of `a` remains - unchanged. + If `shift=right`(default), all elements in `a[idx:]` are shifted to the right by + one element and the last element is discarded. If `shift=left`, all elements in + `a[:idx]` are shifted to the left by one element and the first element is discarded. 
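As an illustrative aside, a minimal sketch of the shifting behavior described in this docstring, assuming the `core._shift_insert_at_index` helper added on this branch can be imported via `from stumpy import core`:

import numpy as np
from stumpy import core  # assumption: the top-k branch in these patches

a = np.array([1.0, 3.0, 5.0, 7.0])
core._shift_insert_at_index(a, 2, 4.0, shift="right")
# a is now [1.0, 3.0, 4.0, 5.0]: a[2:] shifted right, 7.0 (the last element) discarded

b = np.array([1.0, 3.0, 5.0, 7.0])
core._shift_insert_at_index(b, 2, 4.0, shift="left")
# b is now [3.0, 4.0, 5.0, 7.0]: b[:2] shifted left, 1.0 (the first element) discarded,
# and the new value lands at index idx - 1 == 1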
+ In both cases, the length of `a` remains unchanged. + + Note that for any other string value for parameter `shift`, the parameter will be + reset to `shift="right"`. + Parameters ---------- @@ -2723,7 +2726,7 @@ def _shift_insert_at_index(a, idx, v, shift="right"): idx: int The index at which the value `v` should be inserted. This can be any integer number from `0` to `len(a)`. When `idx=len(a)` and `shift="right"`, - OR when `idx=0` and `shift != "right"`, then no change will occur on + OR when `idx=0` and `shift="left"`, then no change will occur on the input array `a`. v: float @@ -2731,21 +2734,21 @@ def _shift_insert_at_index(a, idx, v, shift="right"): shift: str, default "right" The value that indicates whether the shifting of elements should be to the - right or to the left. If "right" (default), all elements in `a[idx:]` are - shifted to right by one element. For any other string value, all elements + right or to the left. If `shift="right"` (default), all elements in `a[idx:]` + are shifted to the right by one element. If `shift="left"`, all elements in `a[:idx]` are shifted to the left by one element. Returns ------- None """ - if shift == "right": - if 0 <= idx < len(a): - a[idx + 1 :] = a[idx:-1] - a[idx] = v - - else: + if shift == "left": if 0 < idx <= len(a): a[: idx - 1] = a[1:idx] - # elements were shifted to left, thus the insertion index becomes `idx-1` + # elements were shifted to the left, thus the insertion index becomes `idx-1` a[idx - 1] = v + + else: + if 0 <= idx < len(a): + a[idx + 1 :] = a[idx:-1] + a[idx] = v From 42ec617c0b7d71946c0624662027bdc3b0862525 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 7 Jul 2022 13:42:58 -0600 Subject: [PATCH 282/416] Improve docstrings --- stumpy/scrump.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 3f535b11e..6394599ea 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -295,12 +295,12 @@ def _prescrump( out1 : numpy.ndarray The (top-k) Matrix profile. When k=1 (default), the first (and only) column in this 2D array consists of the matrix profile. When k > 1, the output - has exactly k columns consist of the top-k matrix profile. + has exactly `k` columns consisting of the top-k matrix profile. out2 : numpy.ndarray The (top-k) Matrix profile indices. When k=1 (default), the first (and only) column in this 2D array consists of the matrix profile indices. When k > 1, - the output has exactly k columns consist of the top-k matrix profile. + the output has exactly `k` columns consisting of the top-k matrix profile indices. Notes ----- @@ -381,14 +381,14 @@ def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0, k=1): Returns ------- P : numpy.ndarray - The (top-k) Matrix profile. When k = 1 (default), the first and only column - consists of the matrix profile. When k > 1, the output has exactly k columns - consist of the top-k matrix profile. + The (top-k) Matrix profile. When k = 1 (default), the first (and only) column + in this 2D array consists of the matrix profile. When k > 1, the output has + exactly `k` columns consisting of the top-k matrix profile. I : numpy.ndarray - The (top-k) Matrix profile. When k = 1 (default), the first and only column - consists of the matrix profile indices. When k > 1, the output has exactly - k columns consist of the top-k matrix profile indices. + The (top-k) Matrix profile indices. 
When k = 1 (default), the first (and only) + column in this 2D array consists of the matrix profile indices. When k > 1, + the output has exactly `k` columns consisting of the top-k matrix profile indices. Notes ----- @@ -489,10 +489,14 @@ class scrump: Attributes ---------- P_ : numpy.ndarray - The updated (top-k) matrix profile + The updated (top-k) matrix profile. When k=1 (default), the first (and only) + column in this 2D array consists of the matrix profile. When k > 1, the output + has exactly k columns consisting of the top-k matrix profile. I_ : numpy.ndarray - The updated (top-k) matrix profile indices + The updated (top-k) matrix profile indices. When k=1 (default), the first (and only) + column in this 2D array consists of the matrix profile indices. When k > 1, + the output has exactly k columns consisting of the top-k matrix profile indices. left_I_ : numpy.ndarray The updated left (top-1) matrix profile indices @@ -758,7 +762,7 @@ def P_(self): """ Get the updated (top-k) matrix profile. When k=1 (default), the first (and only) column in this 2D array consists of the matrix profile. When k > 1, the output - has exactly k columns consisting of the top-k matrix profile. + has exactly `k` columns consisting of the top-k matrix profile. """ return self._P.astype(np.float64) @@ -767,7 +771,7 @@ def I_(self): """ Get the updated (top-k) matrix profile indices. When k=1 (default), the first (and only) column in this 2D array consists of the matrix profile - indices. When k > 1, the output has exactly k columns consisting of the top-k + indices. When k > 1, the output has exactly `k` columns consisting of the top-k matrix profile indices. """ return self._I.astype(np.int64) From 0d8f5de06f3c429b55486d3e1dfa3fe19df0036f Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 7 Jul 2022 13:45:50 -0600 Subject: [PATCH 283/416] Improve comments --- stumpy/stumpi.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stumpy/stumpi.py b/stumpy/stumpi.py index 3a24cad92..e059676c3 100644 --- a/stumpy/stumpi.py +++ b/stumpy/stumpi.py @@ -151,7 +151,7 @@ def __init__(self, T, m, egress=True, normalize=True, p=2.0, k=1): self._left_P[mask] = self._P[mask, 0] # Only re-compute the `i`-th left matrix profile value, `self._left_P[i]`, - # when `self._I[i, 0] != self._left_I[i]` + # when `self._left_I[i] != self._I[i, 0]` for i in np.flatnonzero(self._left_I >= 0 & ~mask): j = self._left_I[i] QT = np.dot(self._T[i : i + self._m], self._T[j : j + self._m]) @@ -319,8 +319,8 @@ def _update(self, t): core._shift_insert_at_index(self._P[i], pos, D[i]) core._shift_insert_at_index(self._I[i], pos, l) - # Calculating top-k and left matrix profile for new subsequence whose - # distance profie is `D` + # Calculating top-k matrix profile and (top-1) left matrix profile (and thier + # corresponding indices) for new subsequence whose distance profie is `D` P_new = np.full(self._k, np.inf, dtype=np.float64) I_new = np.full(self._k, -1, dtype=np.int64) for i, d in enumerate(D): From fe9c4dba2cddac5e38cef5e6421733c9d7c1798b Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 7 Jul 2022 13:51:54 -0600 Subject: [PATCH 284/416] minor changes --- tests/naive.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index f257dc4a0..8a2ae3a0e 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -169,10 +169,9 @@ def searchsorted_right(a, v): def stump(T_A, m, T_B=None, exclusion_zone=None, row_wise=False, k=1): """ - Traverse distance matrix 
diagonally and update the top-k nearest neighbor - matrix profile and matrix profile indices if the parameter `row_wise` is - set to `False`. If the parameter `row_wise` is set to `True`, - it is a row-wise traversal. + Traverse distance matrix diagonally and update the top-k matrix profile and + matrix profile indices if the parameter `row_wise` is set to `False`. If the + parameter `row_wise` is set to `True`, it is a row-wise traversal. """ if T_B is None: # self-join: ignore_trivial = True @@ -194,7 +193,7 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, row_wise=False, k=1): if exclusion_zone is None: exclusion_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) - P = np.full((l, k + 2), np.inf) + P = np.full((l, k + 2), np.inf, dtype=np.float64) I = np.full((l, k + 2), -1, dtype=np.int64) # two more columns are to store # ... left and right top-1 matrix profile indices @@ -720,8 +719,10 @@ def __init__(self, T, m, excl_zone=None, p=2.0): self._T_isfinite = np.isfinite(self._T) self._m = m self._p = p - if excl_zone is None: # see stumpi, and make similar changes here - self._excl_zone = int(np.ceil(self._m / config.STUMPY_EXCL_ZONE_DENOM)) + + if excl_zone is None: # apply similar changes in naive `class stumpi_egress` + excl_zone = int(np.ceil(self._m / config.STUMPY_EXCL_ZONE_DENOM)) + self._excl_zone = excl_zone self._l = self._T.shape[0] - m + 1 mp = aamp(T, m, p=p) From 9aba6d2ce7889eb91184d237f9042194168f1be8 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 7 Jul 2022 13:57:50 -0600 Subject: [PATCH 285/416] Correct format --- stumpy/core.py | 3 ++- stumpy/scrump.py | 13 ++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 2ca026ee9..879c3ebdf 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2745,7 +2745,8 @@ def _shift_insert_at_index(a, idx, v, shift="right"): if shift == "left": if 0 < idx <= len(a): a[: idx - 1] = a[1:idx] - # elements were shifted to the left, thus the insertion index becomes `idx-1` + # elements were shifted to the left, thus the insertion index becomes + # `idx-1` a[idx - 1] = v else: diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 6394599ea..8d265dc11 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -300,7 +300,8 @@ def _prescrump( out2 : numpy.ndarray The (top-k) Matrix profile indices. When k=1 (default), the first (and only) column in this 2D array consists of the matrix profile indices. When k > 1, - the output has exactly `k` columns consisting of the top-k matrix profile indices. + the output has exactly `k` columns consisting of the top-k matrix profile + indices. Notes ----- @@ -388,7 +389,8 @@ def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0, k=1): I : numpy.ndarray The (top-k) Matrix profile indices. When k = 1 (default), the first (and only) column in this 2D array consists of the matrix profile indices. When k > 1, - the output has exactly `k` columns consisting of the top-k matrix profile indices. + the output has exactly `k` columns consisting of the top-k matrix profile + indices. Notes ----- @@ -494,9 +496,10 @@ class scrump: has exactly k columns consisting of the top-k matrix profile. I_ : numpy.ndarray - The updated (top-k) matrix profile indices. When k=1 (default), the first (and only) - column in this 2D array consists of the matrix profile indices. When k > 1, - the output has exactly k columns consisting of the top-k matrix profile indices. + The updated (top-k) matrix profile indices. 
When k=1 (default), the first + (and only) column in this 2D array consists of the matrix profile indices. + When k > 1, the output has exactly k columns consisting of the top-k matrix + profile indices. left_I_ : numpy.ndarray The updated left (top-1) matrix profile indices From 2565c91eb09b3d63db21c48bacf4a3f3b41b63e3 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 7 Jul 2022 14:07:12 -0600 Subject: [PATCH 286/416] Improve docstrings --- stumpy/scrump.py | 4 ++-- stumpy/stumpi.py | 13 +++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 8d265dc11..1c1286e4e 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -513,9 +513,9 @@ class scrump: update() Update the matrix profile and the matrix profile indices by computing additional new distances (limited by `percentage`) that make up the full - distance matrix. The outputs are (top-k) matrix profile, (top-1) left + distance matrix. It updates the (top-k) matrix profile, (top-1) left matrix profile, (top-1) right matrix profile, (top-k) matrix profile indices, - (top-1) left matrix profile indices, (top-1) right matrix profile indices. + (top-1) left matrix profile indices, and (top-1) right matrix profile indices. See Also -------- diff --git a/stumpy/stumpi.py b/stumpy/stumpi.py index e059676c3..465c39db5 100644 --- a/stumpy/stumpi.py +++ b/stumpy/stumpi.py @@ -46,13 +46,13 @@ class stumpi: P_ : numpy.ndarray The updated (top-k) matrix profile for `T`. When `k=1` (default), the first (and only) column in this 2D array consists of the matrix profile. When - `k > 1`, the output has exactly `k` columns consist of the top-k matrix + `k > 1`, the output has exactly `k` columns consisting of the top-k matrix profile. I_ : numpy.ndarray The updated (top-k) matrix profile indices for `T`. When `k=1` (default), the first (and only) column in this 2D array consists of the matrix profile - indices. When `k > 1`, the output has exactly `k` columns consist of the + indices. When `k > 1`, the output has exactly `k` columns consisting of the top-k matrix profile indices. left_P_ : numpy.ndarray @@ -344,14 +344,19 @@ def _update(self, t): @property def P_(self): """ - Get the (top-k) matrix profile + Get the (top-k) matrix profile. When `k=1` (default), the first (and only) + column in this 2D array consists of the matrix profile. When `k > 1`, the + output has exactly `k` columns consisting of the top-k matrix profile. """ return self._P.astype(np.float64) @property def I_(self): """ - Get the (top-k) matrix profile indices + Get the (top-k) matrix profile indices. When `k=1` (default), the first + (and only) column in this 2D array consists of the matrix profile indices. + When `k > 1`, the output has exactly `k` columns consisting of the top-k + matrix profile indices. 
""" return self._I.astype(np.int64) From a89e21434a72b0f5ce841836530b613f17a78736 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 10 Jul 2022 15:25:23 -0600 Subject: [PATCH 287/416] optimize functions --- stumpy/core.py | 94 ++++++++++++++++++++------------------------------ 1 file changed, 38 insertions(+), 56 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 05745ba97..1c0e48311 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2599,7 +2599,7 @@ def _check_P(P, threshold=1e-6): logger.warning("For a self-join, try setting `ignore_trivial=True`.") -@njit(parallel=True) +@njit def _merge_topk_PI(PA, PB, IA, IB): """ Merge two top-k matrix profiles PA and PB, and update PA (in place) while @@ -2635,36 +2635,27 @@ def _merge_topk_PI(PA, PB, IA, IB): ------- None """ - for i in prange(PB.shape[0]): - start = 0 - stop = np.searchsorted(PA[i], PB[i, -1], side="right") - - if stop == 0: - # means `PB[i, -1] < PA[i, 0]`, i.e. the maximum value in `PB[i]` is - # less than smallest value in `PA[i]`. So, we should replace `PA[i]` - # with `PB[i]` so that we have the top-k smallest. - PA[i] = PB[i] - IA[i] = IB[i] - continue - - for j in range(PB.shape[1]): - if PB[i, j] >= PA[i, -1]: - # `PB[i]` is sorted ascaendingly. - # Hence, in next iteration: `PB[i, j+1] >= PB[i, j] >= PA[i, -1]` - break - - # `PB[i, j]` is less than `PA[i, -1]`, the maximum value in `PA[i]`. - # so, we must update `PA[i]` to have the top-k smallest values. - idx = np.searchsorted(PA[i, start:stop], PB[i, j], side="right") + start - - _shift_insert_at_index(PA[i], idx, PB[i, j], shift="right") - _shift_insert_at_index(IA[i], idx, IB[i, j], shift="right") + tmp_P = np.empty(PA.shape[1], dtype=np.float64) + tmp_I = np.empty(PA.shape[1], dtype=np.int64) + for i in range(len(PA)): + tmp_P[:] = np.empty(PA.shape[1], dtype=np.float64) + tmp_I[:] = np.empty(PA.shape[1], dtype=np.int64) + aj, bj = 0, 0 + for k in range(len(tmp_P)): + if PB[i, bj] < PA[i, aj]: + tmp_P[k] = PB[i, bj] + tmp_I[k] = IB[i, bj] + bj += 1 + else: + tmp_P[k] = PA[i, aj] + tmp_I[k] = IA[i, aj] + aj += 1 - start = idx - stop += 1 # because of shifting elements to the right by one + PA[i] = tmp_P + IA[i] = tmp_I -@njit(parallel=True) +@njit def _merge_topk_ρI(ρA, ρB, IA, IB): """ Merge two top-k pearson profiles `ρA` and `ρB`, and update `ρA` (in place) by @@ -2700,34 +2691,25 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): ------- None """ - for i in prange(ρB.shape[0]): - start = np.searchsorted(ρA[i], ρB[i, 0], side="left") - stop = ρB.shape[1] - - if start == ρB.shape[1]: - # means `ρB[i, 0] > ρA[i, -1]`, i.e. the minimum value in `ρB[i]` is - # greater than greatest value in `ρA[i]`. So, we should replace `ρA[i]` - # with `ρB[i]` so that we have top-k largest values - ρA[i] = ρB[i] - IA[i] = IB[i] - continue - - for j in range(ρB.shape[1] - 1, -1, -1): - if ρB[i, j] <= ρA[i, 0]: - # `ρB[i]` is sorted ascaendingly. - # Hence, in the next iteration: `ρB[i, j-1] <= ρB[i, j] <= ρA[i, 0]` - break - - # `ρB[i, j]` is greater than `ρA[i, 0]`, the minimum value in `ρA[i]`. - # so, we must update `ρA[i]` to make sure we have top-k largest values. 
- idx = np.searchsorted(ρA[i, start:stop], ρB[i, j], side="left") + start - - _shift_insert_at_index(ρA[i], idx, ρB[i, j], shift="left") - _shift_insert_at_index(IA[i], idx, IB[i, j], shift="left") - - stop = idx - if start > 0: - start -= 1 # because of shifting elements to the left by one + tmp_ρ = np.empty(ρA.shape[1], dtype=np.float64) + tmp_I = np.empty(ρA.shape[1], dtype=np.int64) + last_idx = len(tmp_ρ) - 1 + for i in range(len(ρA)): + tmp_ρ[:] = np.empty(ρA.shape[1], dtype=np.float64) + tmp_I[:] = np.empty(ρA.shape[1], dtype=np.int64) + aj, bj = last_idx, last_idx + for k in range(last_idx, -1, -1): + if ρB[i, bj] > ρA[i, aj]: + tmp_ρ[k] = ρB[i, bj] + tmp_I[k] = IB[i, bj] + bj -= 1 + else: + tmp_ρ[k] = ρA[i, aj] + tmp_I[k] = IA[i, aj] + aj -= 1 + + ρA[i] = tmp_ρ + IA[i] = tmp_I @njit From 9b845b1e6e77ff2e3bb86243c943029ff688e8dd Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 10 Jul 2022 15:30:35 -0600 Subject: [PATCH 288/416] Remove redundant import --- stumpy/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/core.py b/stumpy/core.py index 1c0e48311..f2eafe5d6 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -7,7 +7,7 @@ import inspect import numpy as np -from numba import njit, prange +from numba import njit from scipy.signal import convolve from scipy.ndimage import maximum_filter1d, minimum_filter1d from scipy import linalg From 85f1226ed70722f11b601bbac47ba82fed722a31 Mon Sep 17 00:00:00 2001 From: ninimama Date: Sun, 10 Jul 2022 18:51:19 -0600 Subject: [PATCH 289/416] minor change --- stumpy/core.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index f2eafe5d6..eb3d33c92 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2635,11 +2635,9 @@ def _merge_topk_PI(PA, PB, IA, IB): ------- None """ - tmp_P = np.empty(PA.shape[1], dtype=np.float64) - tmp_I = np.empty(PA.shape[1], dtype=np.int64) for i in range(len(PA)): - tmp_P[:] = np.empty(PA.shape[1], dtype=np.float64) - tmp_I[:] = np.empty(PA.shape[1], dtype=np.int64) + tmp_P = np.empty(PA.shape[1], dtype=np.float64) + tmp_I = np.empty(PA.shape[1], dtype=np.int64) aj, bj = 0, 0 for k in range(len(tmp_P)): if PB[i, bj] < PA[i, aj]: @@ -2691,12 +2689,10 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): ------- None """ - tmp_ρ = np.empty(ρA.shape[1], dtype=np.float64) - tmp_I = np.empty(ρA.shape[1], dtype=np.int64) - last_idx = len(tmp_ρ) - 1 + last_idx = ρA.shape[1] - 1 for i in range(len(ρA)): - tmp_ρ[:] = np.empty(ρA.shape[1], dtype=np.float64) - tmp_I[:] = np.empty(ρA.shape[1], dtype=np.int64) + tmp_ρ = np.empty(ρA.shape[1], dtype=np.float64) + tmp_I = np.empty(ρA.shape[1], dtype=np.int64) aj, bj = last_idx, last_idx for k in range(last_idx, -1, -1): if ρB[i, bj] > ρA[i, aj]: From fb6ed07cc7f829db0ef0fb9b8ada64838f11ab1a Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 11 Jul 2022 00:07:44 -0600 Subject: [PATCH 290/416] Revise docstrings --- stumpy/core.py | 50 ++++++++++++++++++++------------------------------ 1 file changed, 20 insertions(+), 30 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index eb3d33c92..394842076 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2602,34 +2602,29 @@ def _check_P(P, threshold=1e-6): @njit def _merge_topk_PI(PA, PB, IA, IB): """ - Merge two top-k matrix profiles PA and PB, and update PA (in place) while - always choosing values of PA over values of PB in case of ties. Also, update - IA accordingly. 
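For clarity, here is a minimal, self-contained sketch of the single-pass merge idea behind the rewritten `_merge_topk_PI` above (plain NumPy, not the library's njit code); ties favor `PA` because the comparison is a strict less-than:

import numpy as np

# One row of two sorted top-k profiles and their indices (k = 3); values are illustrative
PA_row, IA_row = np.array([0.2, 0.7, 1.5]), np.array([3, 11, 8])
PB_row, IB_row = np.array([0.4, 0.9, 2.0]), np.array([21, 35, 40])

tmp_P = np.empty(3)
tmp_I = np.empty(3, dtype=np.int64)
aj = bj = 0
for q in range(3):
    if PB_row[bj] < PA_row[aj]:  # strict '<' keeps tied values from PA first
        tmp_P[q], tmp_I[q] = PB_row[bj], IB_row[bj]
        bj += 1
    else:
        tmp_P[q], tmp_I[q] = PA_row[aj], IA_row[aj]
        aj += 1
# tmp_P -> [0.2, 0.4, 0.7], tmp_I -> [3, 21, 11]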
+ Merge two top-k matrix profiles `PA` and `PB`, and update `PA` (in place) while + always prioritizing the values of `PA` over the values of `PB` in case of ties. + (i.e., values from `PB` are always inserted to the right of values from `PA`). + Also, update `IA` accordingly. Unlike `_merge_topk_ρI`, where `top-k` largest values are kept, this function keeps `top-k` smallest values. - `PA` and `PB` are 2D arrays, with each row sorted ascendingly. To update `PA[i]`, - the array `PB[i]` is traversed forward from index `0` to its last index, and - if its element is smaller than `PA[i, -1]`, i.e. the greatest value in `PA[i]`, - then `PA[i]` will be updatd. In case of tied value `v`, it will be inserted to - the right side of the greatest index in `PA[i]` whose value is `v`. - Parameters ---------- PA : numpy.ndarray - A (top-k) matrix profile, with ndim of 2, where values in each row are - sorted in ascending order. + A (top-k) matrix profile where values in each row are sorted in ascending + order. `PA` must be 2-dimensional. PB : numpy.ndarray - A (top-k) matrix profile, with ndim of 2, where values in each row are - sorted in ascending order. `PB` must have the same shape as `PA`. + A (top-k) matrix profile where values in each row are sorted in ascending + order. `PB` must have the same shape as `PA`. IA : numpy.ndarray - A (top-k) matrix profile indices corresponding to PA + A (top-k) matrix profile indices corresponding to `PA` IB : numpy.ndarray - A (top-k) matrix profile indices corresponding to PB + A (top-k) matrix profile indices corresponding to `PB` Returns ------- @@ -2656,34 +2651,29 @@ def _merge_topk_PI(PA, PB, IA, IB): @njit def _merge_topk_ρI(ρA, ρB, IA, IB): """ - Merge two top-k pearson profiles `ρA` and `ρB`, and update `ρA` (in place) by - keeping the top-k largest values in merging two `top-k` rows `ρA[i]` and `ρB[i]`, - each sorted ascendingly. + Merge two top-k pearson profiles `ρA` and `ρB`, and update `ρA` (in place) while + always prioritizing the values of `ρA` over the values of `ρB` in case of ties. + (i.e., values from `ρB` are always inserted to the left of values from `ρA`). + Also, update `IA` accordingly. Unlike `_merge_topk_PI`, where `top-k` smallest values are kept, this function keeps `top-k` largest values. - `ρA` and `ρB` are 2D arrays, with each row sorted ascendingly. To update `ρA[i]`, - the array `ρB[i]` is traversed backward from its last index to index 0, and if - its element is greater than `ρA[i, 0]`, i.e. the smallest value in `ρA[i]`, then - `ρA[i]` will be updated. In case of tied value `v`, it will be inserted to the - left side of the lowest index in `ρA[i]` whose value is `v`. - Parameters ---------- ρA : numpy.ndarray - A (top-k) pearson profile, with ndim of 2, where values in each row are - sorted in ascending order. + A (top-k) pearson profile where values in each row are sorted in ascending + order. `ρA` must be 2-dimensional. ρB : numpy.ndarray - A (top-k) pearson profile, with ndim of 2, where values in each row are - sorted in ascending order. `ρB` must have the same shape as `ρA`. + A (top-k) pearson profile, where values in each row are sorted in ascending + order. `ρB` must have the same shape as `ρA`. 
IA : numpy.ndarray - A (top-k) matrix profile indices corresponding to ρA + A (top-k) matrix profile indices corresponding to `ρA` IB : numpy.ndarray - A (top-k) matrix profile indices corresponding to ρB + A (top-k) matrix profile indices corresponding to `ρB` Returns ------- From 9f7b6d83f9377bb78fe540d3f6682e048d3df3c6 Mon Sep 17 00:00:00 2001 From: Sean Law Date: Mon, 11 Jul 2022 07:42:18 -0400 Subject: [PATCH 291/416] Fixed black formatting after conflict resolution --- stumpy/core.py | 1 - tests/test_core.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 573edd1da..7c54b9bb5 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2777,4 +2777,3 @@ def _check_P(P, threshold=1e-6): if are_distances_too_small(P, threshold=threshold): # pragma: no cover logger.warning(f"A large number of values in `P` are smaller than {threshold}.") logger.warning("For a self-join, try setting `ignore_trivial=True`.") - diff --git a/tests/test_core.py b/tests/test_core.py index 76ba34c67..2ce86a4bb 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1153,4 +1153,4 @@ def test_shift_insert_at_index(): def test_check_P(): with pytest.raises(ValueError): - core._check_P(np.random.rand(10).reshape(2, 5)) \ No newline at end of file + core._check_P(np.random.rand(10).reshape(2, 5)) From 54d1d1fecd58f2537046223cdeb7686e8b2914b7 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 11 Jul 2022 08:08:23 -0600 Subject: [PATCH 292/416] Correct docstring --- stumpy/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/core.py b/stumpy/core.py index 8d0bcafe9..5330e6046 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2547,7 +2547,7 @@ def _select_P_ABBA_value(P_ABBA, k, custom_func=None): k : int Specify the `k`th value in the concatenated matrix profiles to return. This - parameter is ignored when `k_func` is not None. + parameter is ignored when `custom_func` is not None. custom_func : object, default None A custom user defined function for selecting the desired value from the From 67351671865ba38fcb5759274dde873b12d56d1d Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 15 Jul 2022 18:43:39 -0600 Subject: [PATCH 293/416] Revise docstrings --- stumpy/core.py | 2 +- stumpy/scrump.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 5330e6046..167502fc3 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2680,7 +2680,7 @@ def _shift_insert_at_index(a, idx, v, shift="right"): If `shift=right`(default), all elements in `a[idx:]` are shifted to the right by one element and the last element is discarded. If `shift=left`, all elements in `a[:idx]` are shifted to the left by one element and the first element is discarded. - In both cases, the length of `a` remains unchanged. + In both cases, `a` is updated in palce and its length remains unchanged. Note that for any other string value for parameter `shift`, the parameter will be reset to `shift="right"`. diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 1c1286e4e..ad4c2f4df 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -293,12 +293,12 @@ def _prescrump( Returns ------- out1 : numpy.ndarray - The (top-k) Matrix profile. When k=1 (default), the first (and only) column + The (top-k) matrix profile. When k=1 (default), the first (and only) column in this 2D array consists of the matrix profile. When k > 1, the output has exactly `k` columns consisting of the top-k matrix profile. 
out2 : numpy.ndarray - The (top-k) Matrix profile indices. When k=1 (default), the first (and only) + The (top-k) matrix profile indices. When k=1 (default), the first (and only) column in this 2D array consists of the matrix profile indices. When k > 1, the output has exactly `k` columns consisting of the top-k matrix profile indices. @@ -382,12 +382,12 @@ def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0, k=1): Returns ------- P : numpy.ndarray - The (top-k) Matrix profile. When k = 1 (default), the first (and only) column + The (top-k) matrix profile. When k = 1 (default), the first (and only) column in this 2D array consists of the matrix profile. When k > 1, the output has exactly `k` columns consisting of the top-k matrix profile. I : numpy.ndarray - The (top-k) Matrix profile indices. When k = 1 (default), the first (and only) + The (top-k) matrix profile indices. When k = 1 (default), the first (and only) column in this 2D array consists of the matrix profile indices. When k > 1, the output has exactly `k` columns consisting of the top-k matrix profile indices. From 0112989f76ce163c5ae2fdac6800f13fdf726226 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 15 Jul 2022 18:46:46 -0600 Subject: [PATCH 294/416] minor change --- stumpy/scrump.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index ad4c2f4df..5d191555a 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -684,9 +684,9 @@ def __init__( if pre_scrump: if self._ignore_trivial: - P, I = prescrump(T_A, m, s=s, k=k) + P, I = prescrump(T_A, m, s=s, k=self._k) else: - P, I = prescrump(T_A, m, T_B=T_B, s=s, k=k) + P, I = prescrump(T_A, m, T_B=T_B, s=s, k=self._k) core._merge_topk_PI(self._P, P, self._I, I) From ff322a0279adea92c78e2e08096f8ac74cb192dc Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 15 Jul 2022 18:58:26 -0600 Subject: [PATCH 295/416] Revise comments --- stumpy/stumpi.py | 5 +++-- tests/naive.py | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/stumpy/stumpi.py b/stumpy/stumpi.py index 465c39db5..8d30f3319 100644 --- a/stumpy/stumpi.py +++ b/stumpy/stumpi.py @@ -266,8 +266,9 @@ def _update_egress(self, t): core._shift_insert_at_index(self._P[-1], pos, d) core._shift_insert_at_index(self._I[-1], pos, i + self._n_appended) - # All neighbors of the last subsequence are on its left. So, its matrix profile - # value/index and its left matrix profile value/index must be equal. + # All neighbors of the last subsequence are on its left. So, its (top-1) + # matrix profile value/index and its left matrix profile value/index must + # be equal. self._left_P[-1] = self._P[-1, 0] self._left_I[-1] = self._I[-1, 0] diff --git a/tests/naive.py b/tests/naive.py index 8a2ae3a0e..928dee9d8 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -218,7 +218,7 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, row_wise=False, k=1): # self-join: right matrix profile index (top-1) if ignore_trivial and i < D.shape[0]: - IR = i + np.argmin(D[i:]) # shift arg by `i` to get true index + IR = i + np.argmin(D[i:]) # offset by `i` to get true index if D[IR] == np.inf: IR = -1 I[i, k + 1] = IR @@ -239,7 +239,7 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, row_wise=False, k=1): d = distance_matrix[i, i + g] if d < P[i, k - 1]: idx = searchsorted_right(P[i], d) - # to keep the top-k, we must get rid of the last element. + # to keep the top-k, we must discard the last element. 
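The searchsorted/insert/drop-last idiom used throughout these naive reference changes can be shown in isolation (a minimal sketch with made-up values, not part of the patch):

import numpy as np

# Hypothetical row of a sorted top-k matrix profile (k = 3) and its indices
P_row = np.array([0.5, 1.2, np.inf])
I_row = np.array([7, 42, -1])
d, j = 0.9, 13  # new candidate distance and neighbor index
if d < P_row[-1]:  # only update if it beats the current k-th best
    idx = np.searchsorted(P_row, d, side="right")
    P_row = np.insert(P_row, idx, d)[:-1]  # insert, then drop the largest
    I_row = np.insert(I_row, idx, j)[:-1]
# P_row -> [0.5, 0.9, 1.2], I_row -> [7, 13, 42]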
P[i, :k] = np.insert(P[i, :k], idx, d)[:-1] I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] @@ -854,8 +854,8 @@ def update(self, t): self.I_[-1] = I_last_topk + self._n_appended self.I_[-1][self.P_[-1] == np.inf] = -1 - # for last indx, the left matrix profile value is self.P_[-1, 0] - # and the same goes for left matrix profile index + # for the last index, the left matrix profile value is self.P_[-1, 0] + # and the same goes for the left matrix profile index self.left_P_[-1] = self.P_[-1, 0] self.left_I_[-1] = self.I_[-1, 0] From 598fcf40c3a25e4aa3728e59b77670dce896268b Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 15 Jul 2022 19:05:42 -0600 Subject: [PATCH 296/416] Avoid redundant allocation of memory --- stumpy/core.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 167502fc3..1350d6a6d 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2606,9 +2606,9 @@ def _merge_topk_PI(PA, PB, IA, IB): ------- None """ + tmp_P = np.empty(PA.shape[1], dtype=np.float64) + tmp_I = np.empty(PA.shape[1], dtype=np.int64) for i in range(len(PA)): - tmp_P = np.empty(PA.shape[1], dtype=np.float64) - tmp_I = np.empty(PA.shape[1], dtype=np.int64) aj, bj = 0, 0 for k in range(len(tmp_P)): if PB[i, bj] < PA[i, aj]: @@ -2655,10 +2655,10 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): ------- None """ - last_idx = ρA.shape[1] - 1 + tmp_ρ = np.empty(ρA.shape[1], dtype=np.float64) + tmp_I = np.empty(ρA.shape[1], dtype=np.int64) + last_idx = len(tmp_ρ) - 1 for i in range(len(ρA)): - tmp_ρ = np.empty(ρA.shape[1], dtype=np.float64) - tmp_I = np.empty(ρA.shape[1], dtype=np.int64) aj, bj = last_idx, last_idx for k in range(last_idx, -1, -1): if ρB[i, bj] > ρA[i, aj]: From 54643e21d57b91899f9a5c2dd48d807afd2a8994 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 15 Jul 2022 19:12:00 -0600 Subject: [PATCH 297/416] Revise docstrings and comments --- stumpy/core.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 1350d6a6d..655a4ef86 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2677,13 +2677,13 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): @njit def _shift_insert_at_index(a, idx, v, shift="right"): """ - If `shift=right`(default), all elements in `a[idx:]` are shifted to the right by - one element and the last element is discarded. If `shift=left`, all elements in - `a[:idx]` are shifted to the left by one element and the first element is discarded. - In both cases, `a` is updated in palce and its length remains unchanged. + If `shift=right` (default), all elements in `a[idx:]` are shifted to the right by + one element, `v` in inserted at index `idx` and the last element is discarded. + If `shift=left`, all elements in `a[:idx]` are shifted to the left by one element, + `v` in inserted at index `idx-1`, and the first element is discarded. In both cases, + `a` is updated in place and its length remains unchanged. - Note that for any other string value for parameter `shift`, the parameter will be - reset to `shift="right"`. + Note that all unrecognized `shift` inputs will default to `shift=right`. Parameters @@ -2701,8 +2701,8 @@ def _shift_insert_at_index(a, idx, v, shift="right"): The value that should be inserted into array `a` at index `idx` shift: str, default "right" - The value that indicates whether the shifting of elements should be to the - right or to the left. 
If `shift="right"` (default), all elements in `a[idx:]` + The value that indicates whether the shifting of elements should be towards + the right or left. If `shift="right"` (default), all elements in `a[idx:]` are shifted to the right by one element. If `shift="left"`, all elements in `a[:idx]` are shifted to the left by one element. @@ -2716,7 +2716,6 @@ def _shift_insert_at_index(a, idx, v, shift="right"): # elements were shifted to the left, thus the insertion index becomes # `idx-1` a[idx - 1] = v - else: if 0 <= idx < len(a): a[idx + 1 :] = a[idx:-1] From 9433499e382cb6037e638b0c8d6ad6f7ae8a763e Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 15 Jul 2022 19:20:29 -0600 Subject: [PATCH 298/416] rename variables --- stumpy/scrump.py | 34 +++++++++++++++++----------------- stumpy/stump.py | 12 ++++++------ stumpy/stumpi.py | 24 ++++++++++++------------ 3 files changed, 35 insertions(+), 35 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 5d191555a..97b0799e9 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -148,22 +148,22 @@ def _compute_PI( σ_Q[j + g], ) if D_squared < P_squared[thread_idx, i + g, -1]: - pos = np.searchsorted( + idx = np.searchsorted( P_squared[thread_idx, i + g], D_squared, side="right" ) core._shift_insert_at_index( - P_squared[thread_idx, i + g], pos, D_squared + P_squared[thread_idx, i + g], idx, D_squared ) - core._shift_insert_at_index(I[thread_idx, i + g], pos, j + g) + core._shift_insert_at_index(I[thread_idx, i + g], idx, j + g) if D_squared < P_squared[thread_idx, j + g, -1]: - pos = np.searchsorted( + idx = np.searchsorted( P_squared[thread_idx, j + g], D_squared, side="right" ) core._shift_insert_at_index( - P_squared[thread_idx, j + g], pos, D_squared + P_squared[thread_idx, j + g], idx, D_squared ) - core._shift_insert_at_index(I[thread_idx, j + g], pos, i + g) + core._shift_insert_at_index(I[thread_idx, j + g], idx, i + g) QT_j = QT_j_prime # Update top-k for both subsequences `S[i-g] = T[i-g:i-g+m]` and @@ -180,22 +180,22 @@ def _compute_PI( σ_Q[j - g], ) if D_squared < P_squared[thread_idx, i - g, -1]: - pos = np.searchsorted( + idx = np.searchsorted( P_squared[thread_idx, i - g], D_squared, side="right" ) core._shift_insert_at_index( - P_squared[thread_idx, i - g], pos, D_squared + P_squared[thread_idx, i - g], idx, D_squared ) - core._shift_insert_at_index(I[thread_idx, i - g], pos, j - g) + core._shift_insert_at_index(I[thread_idx, i - g], idx, j - g) if D_squared < P_squared[thread_idx, j - g, -1]: - pos = np.searchsorted( + idx = np.searchsorted( P_squared[thread_idx, j - g], D_squared, side="right" ) core._shift_insert_at_index( - P_squared[thread_idx, j - g], pos, D_squared + P_squared[thread_idx, j - g], idx, D_squared ) - core._shift_insert_at_index(I[thread_idx, j - g], pos, i - g) + core._shift_insert_at_index(I[thread_idx, j - g], idx, i - g) # In the case of a self-join, the calculated distance profile can also be # used to refine the top-k for all non-trivial subsequences @@ -205,17 +205,17 @@ def _compute_PI( # can be used to update the top-k for BOTH subsequence `i` and # subsequence `j`. We update the latter here. 
- idx = np.flatnonzero( + indices = np.flatnonzero( squared_distance_profile < P_squared[thread_idx, :, -1] ) - for j in idx: - pos = np.searchsorted( + for j in indices: + idx = np.searchsorted( P_squared[thread_idx, j], squared_distance_profile[j], side="right" ) core._shift_insert_at_index( - P_squared[thread_idx, j], pos, squared_distance_profile[j] + P_squared[thread_idx, j], idx, squared_distance_profile[j] ) - core._shift_insert_at_index(I[thread_idx, j], pos, i) + core._shift_insert_at_index(I[thread_idx, j], idx, i) @njit( diff --git a/stumpy/stump.py b/stumpy/stump.py index 1144015af..3b8c74bed 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -217,22 +217,22 @@ def _compute_diagonal( # first (i.e. smallest) element in this array. Note that a higher # pearson value corresponds to a lower distance. if pearson > ρ[thread_idx, i, 0]: - pos = np.searchsorted(ρ[thread_idx, i], pearson) + idx = np.searchsorted(ρ[thread_idx, i], pearson) core._shift_insert_at_index( - ρ[thread_idx, i], pos, pearson, shift="left" + ρ[thread_idx, i], idx, pearson, shift="left" ) core._shift_insert_at_index( - I[thread_idx, i], pos, i + g, shift="left" + I[thread_idx, i], idx, i + g, shift="left" ) if ignore_trivial: # self-joins only if pearson > ρ[thread_idx, i + g, 0]: - pos = np.searchsorted(ρ[thread_idx, i + g], pearson) + idx = np.searchsorted(ρ[thread_idx, i + g], pearson) core._shift_insert_at_index( - ρ[thread_idx, i + g], pos, pearson, shift="left" + ρ[thread_idx, i + g], idx, pearson, shift="left" ) core._shift_insert_at_index( - I[thread_idx, i + g], pos, i, shift="left" + I[thread_idx, i + g], idx, i, shift="left" ) if i < i + g: diff --git a/stumpy/stumpi.py b/stumpy/stumpi.py index 8d30f3319..22bc9b122 100644 --- a/stumpy/stumpi.py +++ b/stumpy/stumpi.py @@ -249,10 +249,10 @@ def _update_egress(self, t): update_idx = np.argwhere(D < self._P[:, -1]).flatten() for i in update_idx: - pos = np.searchsorted(self._P[i], D[i], side="right") - core._shift_insert_at_index(self._P[i], pos, D[i]) + idx = np.searchsorted(self._P[i], D[i], side="right") + core._shift_insert_at_index(self._P[i], idx, D[i]) core._shift_insert_at_index( - self._I[i], pos, D.shape[0] + self._n_appended - 1 + self._I[i], idx, D.shape[0] + self._n_appended - 1 ) # D.shape[0] is base-1 @@ -262,9 +262,9 @@ def _update_egress(self, t): self._I[-1] = -1 for i, d in enumerate(D): if d < self._P[-1, -1]: - pos = np.searchsorted(self._P[-1], d, side="right") - core._shift_insert_at_index(self._P[-1], pos, d) - core._shift_insert_at_index(self._I[-1], pos, i + self._n_appended) + idx = np.searchsorted(self._P[-1], d, side="right") + core._shift_insert_at_index(self._P[-1], idx, d) + core._shift_insert_at_index(self._I[-1], idx, i + self._n_appended) # All neighbors of the last subsequence are on its left. 
So, its (top-1) # matrix profile value/index and its left matrix profile value/index must @@ -316,9 +316,9 @@ def _update(self, t): update_idx = np.argwhere(D[:l] < self._P[:l, -1]).flatten() for i in update_idx: - pos = np.searchsorted(self._P[i], D[i], side="right") - core._shift_insert_at_index(self._P[i], pos, D[i]) - core._shift_insert_at_index(self._I[i], pos, l) + idx = np.searchsorted(self._P[i], D[i], side="right") + core._shift_insert_at_index(self._P[i], idx, D[i]) + core._shift_insert_at_index(self._I[i], idx, l) # Calculating top-k matrix profile and (top-1) left matrix profile (and thier # corresponding indices) for new subsequence whose distance profie is `D` @@ -326,9 +326,9 @@ def _update(self, t): I_new = np.full(self._k, -1, dtype=np.int64) for i, d in enumerate(D): if d < P_new[-1]: # maximum value in sorted array P_new - pos = np.searchsorted(P_new, d, side="right") - core._shift_insert_at_index(P_new, pos, d) - core._shift_insert_at_index(I_new, pos, i) + idx = np.searchsorted(P_new, d, side="right") + core._shift_insert_at_index(P_new, idx, d) + core._shift_insert_at_index(I_new, idx, i) left_I_new = I_new[0] left_P_new = P_new[0] From 902d7ab4f9aa6ed4e33720efb2070c6b5bf26194 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 15 Jul 2022 19:26:59 -0600 Subject: [PATCH 299/416] minor correction --- tests/test_scrump.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_scrump.py b/tests/test_scrump.py index 3bd43b423..b4ea27153 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -384,7 +384,7 @@ def test_scrump_plus_plus_self_join(T_A, T_B, percentages): comp_I = approx.I_ naive.replace_inf(ref_P) - naive.replace_inf(comp_I) + naive.replace_inf(comp_P) npt.assert_almost_equal(ref_P, comp_P) npt.assert_almost_equal(ref_I, comp_I) @@ -810,7 +810,7 @@ def test_scrump_plus_plus_self_join_KNN(T_A, T_B, percentages): comp_I = approx.I_ naive.replace_inf(ref_P) - naive.replace_inf(comp_I) + naive.replace_inf(comp_P) npt.assert_almost_equal(ref_P, comp_P) npt.assert_almost_equal(ref_I, comp_I) From 3c16d3316d08e6ee6d6b6cf5f2260d872322ed7f Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 18 Jul 2022 08:07:05 -0600 Subject: [PATCH 300/416] Fix indexing --- tests/naive.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 928dee9d8..2690da75d 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1442,7 +1442,7 @@ def prescrump(T_A, m, T_B, s, exclusion_zone=None, k=1): pos = np.searchsorted(P[i + g], d, side="right") P[i + g] = np.insert(P[i + g], pos, d)[:-1] I[i + g] = np.insert(I[i + g], pos, j + g)[:-1] - if d < P[j + g]: + if d < P[j + g, -1]: pos = np.searchsorted(P[j + g], d, side="right") P[j + g] = np.insert(P[j + g], pos, d)[:-1] I[j + g] = np.insert(I[j + g], pos, i + g)[:-1] @@ -1453,7 +1453,7 @@ def prescrump(T_A, m, T_B, s, exclusion_zone=None, k=1): pos = np.searchsorted(P[i - g], d, side="right") P[i - g] = np.insert(P[i - g], pos, d)[:-1] I[i - g] = np.insert(I[i - g], pos, j - g)[:-1] - if d < P[j - g]: + if d < P[j - g, -1]: pos = np.searchsorted(P[j - g], d, side="right") P[j - g] = np.insert(P[j - g], pos, d)[:-1] I[j - g] = np.insert(I[j - g], pos, i - g)[:-1] From 9bb8b16fd0f88d6dd6347d57d06836d42126b5a3 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 18 Jul 2022 08:17:18 -0600 Subject: [PATCH 301/416] Add new test function --- tests/test_scrump.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/test_scrump.py 
b/tests/test_scrump.py index b4ea27153..f5797ad16 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -814,3 +814,23 @@ def test_scrump_plus_plus_self_join_KNN(T_A, T_B, percentages): npt.assert_almost_equal(ref_P, comp_P) npt.assert_almost_equal(ref_I, comp_I) + + +@pytest.mark.parametrize("T_A, T_B", test_data) +def test_prescrump_self_join_larger_window_m_5_k_5(T_A, T_B): + m = 5 + k = 5 + zone = int(np.ceil(m / 4)) + + if len(T_B) > m: + for s in range(1, zone + 1): + seed = np.random.randint(100000) + + np.random.seed(seed) + ref_P, ref_I = naive.prescrump(T_B, m, T_B, s=s, exclusion_zone=zone) + + np.random.seed(seed) + comp_P, comp_I = prescrump(T_B, m, s=s) + + npt.assert_almost_equal(ref_P, comp_P) + npt.assert_almost_equal(ref_I, comp_I) From 33c611231cfdfcfb15b2c29b2b6b2f2c77c2ed4d Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 18 Jul 2022 08:23:40 -0600 Subject: [PATCH 302/416] Modify test function --- tests/test_scrump.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_scrump.py b/tests/test_scrump.py index f5797ad16..3eb298bf4 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -827,10 +827,10 @@ def test_prescrump_self_join_larger_window_m_5_k_5(T_A, T_B): seed = np.random.randint(100000) np.random.seed(seed) - ref_P, ref_I = naive.prescrump(T_B, m, T_B, s=s, exclusion_zone=zone) + ref_P, ref_I = naive.prescrump(T_B, m, T_B, s=s, exclusion_zone=zone, k=k) np.random.seed(seed) - comp_P, comp_I = prescrump(T_B, m, s=s) + comp_P, comp_I = prescrump(T_B, m, s=s, k=k) npt.assert_almost_equal(ref_P, comp_P) npt.assert_almost_equal(ref_I, comp_I) From 30f4bcff39e1a41d985d75d367038464f9b76bd6 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 19 Jul 2022 11:50:02 -0600 Subject: [PATCH 303/416] Avoid dumplicate in naive prescrump --- tests/naive.py | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 625e09320..9bc7513bb 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1426,10 +1426,15 @@ def prescrump(T_A, m, T_B, s, exclusion_zone=None, k=1): if exclusion_zone is not None: apply_exclusion_zone(distance_profile, i, exclusion_zone, np.inf) - I[i, 1:] = I[i, :-1] - I[i, 0] = np.argmin(distance_profile) - P[i, 1:] = P[i, :-1] - P[i, 0] = distance_profile[I[i, 0]] + idx = np.argmin(distance_profile) + if idx not in I[i]: + I[i, 1:] = I[i, :-1] + I[i, 0] = idx + P[i, 1:] = P[i, :-1] + P[i, 0] = distance_profile[I[i, 0]] + + # else: the idx, i.e. 1NN of `i`, was already obtained (it maynot be stored + # at the first index of array I[i] though!) 
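A minimal sketch of the duplicate guard introduced in this patch (illustrative values only, mirroring the `not in I[...]` checks above): a candidate neighbor is skipped when its index already appears among the better entries of the row:

import numpy as np

P_row = np.array([0.5, 1.2, np.inf])  # sorted top-k distances (k = 3), made-up values
I_row = np.array([7, 42, -1])         # corresponding neighbor indices
d, j = 0.9, 7                         # neighbor 7 is already the 1-NN of this row
idx = np.searchsorted(P_row, d, side="right")
if j not in I_row[:idx]:              # False here, so no duplicate of index 7 is added
    P_row = np.insert(P_row, idx, d)[:-1]
    I_row = np.insert(I_row, idx, j)[:-1]
# P_row and I_row are unchanged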
if P[i, 0] == np.inf: I[i, 0] = -1 @@ -1440,31 +1445,36 @@ def prescrump(T_A, m, T_B, s, exclusion_zone=None, k=1): d = dist_matrix[i + g, j + g] if d < P[i + g, -1]: pos = np.searchsorted(P[i + g], d, side="right") - P[i + g] = np.insert(P[i + g], pos, d)[:-1] - I[i + g] = np.insert(I[i + g], pos, j + g)[:-1] + if (j + g) not in I[i + g, :pos]: + P[i + g] = np.insert(P[i + g], pos, d)[:-1] + I[i + g] = np.insert(I[i + g], pos, j + g)[:-1] if exclusion_zone is not None and d < P[j + g, -1]: pos = np.searchsorted(P[j + g], d, side="right") - P[j + g] = np.insert(P[j + g], pos, d)[:-1] - I[j + g] = np.insert(I[j + g], pos, i + g)[:-1] + if (i + g) not in I[j + g, :pos]: + P[j + g] = np.insert(P[j + g], pos, d)[:-1] + I[j + g] = np.insert(I[j + g], pos, i + g)[:-1] for g in range(1, min(s, i + 1, j + 1)): d = dist_matrix[i - g, j - g] if d < P[i - g, -1]: pos = np.searchsorted(P[i - g], d, side="right") - P[i - g] = np.insert(P[i - g], pos, d)[:-1] - I[i - g] = np.insert(I[i - g], pos, j - g)[:-1] + if (j - g) not in I[i - g, :pos]: + P[i - g] = np.insert(P[i - g], pos, d)[:-1] + I[i - g] = np.insert(I[i - g], pos, j - g)[:-1] if exclusion_zone is not None and d < P[j - g, -1]: pos = np.searchsorted(P[j - g], d, side="right") - P[j - g] = np.insert(P[j - g], pos, d)[:-1] - I[j - g] = np.insert(I[j - g], pos, i - g)[:-1] + if (i - g) not in I[j - g, :pos]: + P[j - g] = np.insert(P[j - g], pos, d)[:-1] + I[j - g] = np.insert(I[j - g], pos, i - g)[:-1] # In the case of a self-join, the calculated distance profile can also be # used to refine the top-k for all non-trivial subsequences if exclusion_zone is not None: for idx in np.flatnonzero(distance_profile < P[:, -1]): pos = np.searchsorted(P[idx], distance_profile[idx], side="right") - P[idx] = np.insert(P[idx], pos, distance_profile[idx])[:-1] - I[idx] = np.insert(I[idx], pos, i)[:-1] + if i not in I[idx, :pos]: + P[idx] = np.insert(P[idx], pos, distance_profile[idx])[:-1] + I[idx] = np.insert(I[idx], pos, i)[:-1] return P, I From 77e56f7335937d162530a26c53898ce166730d97 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 19 Jul 2022 12:09:17 -0600 Subject: [PATCH 304/416] Add parameter assume_unique to handle duplicates --- tests/naive.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 9bc7513bb..87127dd13 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1800,16 +1800,29 @@ def _total_diagonal_ndists(tile_lower_diag, tile_upper_diag, tile_height, tile_w return total_ndists -def merge_topk_PI(PA, PB, IA, IB): +def merge_topk_PI(PA, PB, IA, IB, assume_unique=True): + k = PA.shape[1] profile = np.column_stack((PA, PB)) indices = np.column_stack((IA, IB)) - idx = np.argsort(profile, axis=1) - profile[:, :] = np.take_along_axis(profile, idx, axis=1) - indices[:, :] = np.take_along_axis(indices, idx, axis=1) + IDX = np.argsort(profile, axis=1) + if assume_unique: + profile[:, :] = np.take_along_axis(profile, IDX, axis=1) + indices[:, :] = np.take_along_axis(indices, IDX, axis=1) - PA[:, :] = profile[:, : PA.shape[1]] - IA[:, :] = indices[:, : PA.shape[1]] + PA[:, :] = profile[:, :k] + IA[:, :] = indices[:, :k] + else: + # avoid duplicates while merging IA[i] and IB[i] + IDX_merged = np.full_like(PA, -1, dtype=np.int64) + for i, idx in enumerate(IDX): + _, arg_unique = np.unique(indices[i, idx], return_index=True) + arg_unique = np.sort(arg_unique)[:k] # preserving order of their appearence + idx = idx[arg_unique] + IDX_merged[i, : len(idx)] = idx + + PA[:, :] = 
np.take_along_axis(profile, IDX_merged, axis=1) + IA[:, :] = np.take_along_axis(indices, IDX_merged, axis=1) def merge_topk_ρI(ρA, ρB, IA, IB): From 7a93a7c957e1844dfa44c661779086f5c2eb4720 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 19 Jul 2022 12:23:33 -0600 Subject: [PATCH 305/416] Add test function to test for duplicates in topk_merge --- tests/test_core.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tests/test_core.py b/tests/test_core.py index 2ce86a4bb..703f7e6f7 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1062,6 +1062,7 @@ def test_select_P_ABBA_val_inf(): def test_merge_topk_PI(): + # `assume_unique = True` n = 50 for k in range(1, 6): PA = np.random.rand(n * k).reshape(n, k) @@ -1088,6 +1089,41 @@ def test_merge_topk_PI(): npt.assert_array_equal(ref_P, comp_P) npt.assert_array_equal(ref_I, comp_I) + # `assume_unique = False` + n = 50 + for k in range(1, 6): + PA = np.random.rand(n * k).reshape(n, k) + PB = np.random.rand(n * k).reshape(n, k) + + IA = np.arange(n * k).reshape(n, k) + IB = IA + n * k + + col_idx_A = np.random.randint(0, k, size=n) + col_idx_B = np.random.randint(0, k, size=n) + for i in range(n): # creating random duplicates between A and B + PB[i, col_idx_B[i]] = PA[i, col_idx_A[i]] + np.random.rand(1) * 1e-8 + IB[i, col_idx_B[i]] = IA[i, col_idx_A[i]] + + IDX = np.argsort(PA, axis=1) + PA[:, :] = np.take_along_axis(PA, IDX, axis=1) + IA[:, :] = np.take_along_axis(IA, IDX, axis=1) + + IDX = np.argsort(PB, axis=1) + PB[:, :] = np.take_along_axis(PB, IDX, axis=1) + IB[:, :] = np.take_along_axis(IB, IDX, axis=1) + + ref_P = PA.copy() + ref_I = IA.copy() + + comp_P = PA.copy() + comp_I = IA.copy() + + naive.merge_topk_PI(ref_P, PB, ref_I, IB, assume_unique=False) + core._merge_topk_PI(comp_P, PB, comp_I, IB) + + npt.assert_array_equal(ref_P, comp_P) + npt.assert_array_equal(ref_I, comp_I) + def test_merge_topk_ρI(): n = 50 From fefcaa9b2163c4793706668b65a2cd183cac815b Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 19 Jul 2022 12:38:31 -0600 Subject: [PATCH 306/416] Add parameter assume_unique to performant merge_topk --- stumpy/core.py | 51 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 655a4ef86..1dc4f47b3 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2576,7 +2576,7 @@ def _select_P_ABBA_value(P_ABBA, k, custom_func=None): @njit -def _merge_topk_PI(PA, PB, IA, IB): +def _merge_topk_PI(PA, PB, IA, IB, assume_unique=True): """ Merge two top-k matrix profiles `PA` and `PB`, and update `PA` (in place) while always prioritizing the values of `PA` over the values of `PB` in case of ties. @@ -2602,22 +2602,55 @@ def _merge_topk_PI(PA, PB, IA, IB): IB : numpy.ndarray A (top-k) matrix profile indices corresponding to `PB` + assume_unique : bool, default True + If True (default), each row of IA and its corresponding row in IB have no + duplicates. False otherwise. + Returns ------- None """ - tmp_P = np.empty(PA.shape[1], dtype=np.float64) - tmp_I = np.empty(PA.shape[1], dtype=np.int64) - for i in range(len(PA)): + k = PA.shape[1] + tmp_P = np.empty(k, dtype=np.float64) + tmp_I = np.empty(k, dtype=np.int64) + for i in range(PA.shape[0]): aj, bj = 0, 0 - for k in range(len(tmp_P)): + idx = 0 + prev_val = np.inf + for _ in range(2 * k): # 2 * k to traverse both A and B + if idx >= k: + break + if aj >= k: # PA is already fully traversed. 
+ tmp_P[idx:] = PB[i, bj : bj + k - idx] + tmp_I[idx:] = IB[i, bj : bj + k - idx] + break + if bj >= k: # PB is already fully traversed. + tmp_P[idx:] = PA[i, aj : aj + k - idx] + tmp_I[idx:] = IA[i, aj : aj + k - idx] + break + if PB[i, bj] < PA[i, aj]: - tmp_P[k] = PB[i, bj] - tmp_I[k] = IB[i, bj] + if ( + assume_unique + or abs(PB[i, bj] - prev_val) > 1e-6 + or IB[i, bj] not in tmp_I[:idx] + ): + tmp_P[idx] = PB[i, bj] + tmp_I[idx] = IB[i, bj] + prev_val = tmp_P[idx] + idx += 1 bj += 1 + else: - tmp_P[k] = PA[i, aj] - tmp_I[k] = IA[i, aj] + if ( + assume_unique + or abs(PB[i, bj] - prev_val) > 1e-6 + or IB[i, bj] not in tmp_I[:idx] + ): + tmp_P[k] = PA[i, aj] + tmp_I[k] = IA[i, aj] + prev_val = tmp_P[idx] + idx += 1 aj += 1 PA[i] = tmp_P From 5e9c5fccf6d60b5aa9123c1c7c3f2e4a02f1b20b Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 19 Jul 2022 12:40:10 -0600 Subject: [PATCH 307/416] fix test function --- tests/test_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_core.py b/tests/test_core.py index 703f7e6f7..4b448ce37 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1119,7 +1119,7 @@ def test_merge_topk_PI(): comp_I = IA.copy() naive.merge_topk_PI(ref_P, PB, ref_I, IB, assume_unique=False) - core._merge_topk_PI(comp_P, PB, comp_I, IB) + core._merge_topk_PI(comp_P, PB, comp_I, IB, assume_unique=False) npt.assert_array_equal(ref_P, comp_P) npt.assert_array_equal(ref_I, comp_I) From 9685e440733580dec6830e39ce85bc34f4f3e76f Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 19 Jul 2022 13:03:56 -0600 Subject: [PATCH 308/416] Fix bug --- stumpy/core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 1dc4f47b3..eebd6930e 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2644,11 +2644,11 @@ def _merge_topk_PI(PA, PB, IA, IB, assume_unique=True): else: if ( assume_unique - or abs(PB[i, bj] - prev_val) > 1e-6 + or abs(PA[i, bj] - prev_val) > 1e-6 or IB[i, bj] not in tmp_I[:idx] ): - tmp_P[k] = PA[i, aj] - tmp_I[k] = IA[i, aj] + tmp_P[idx] = PA[i, aj] + tmp_I[idx] = IA[i, aj] prev_val = tmp_P[idx] idx += 1 aj += 1 From 9dd452be2d563d877d9ef3aa4d5aec9e54094229 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 19 Jul 2022 13:09:27 -0600 Subject: [PATCH 309/416] Revise prescrump to avoid duplicates --- stumpy/scrump.py | 62 +++++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 63737f597..4c4ce4dc7 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -115,10 +115,11 @@ def _compute_PI( core._apply_exclusion_zone(squared_distance_profile, i, excl_zone, np.inf) nn_idx = np.argmin(squared_distance_profile) - core._shift_insert_at_index( - P_squared[thread_idx, i], 0, squared_distance_profile[nn_idx] - ) - core._shift_insert_at_index(I[thread_idx, i], 0, nn_idx) + if nn_idx not in I[thread_idx, i]: + core._shift_insert_at_index( + P_squared[thread_idx, i], 0, squared_distance_profile[nn_idx] + ) + core._shift_insert_at_index(I[thread_idx, i], 0, nn_idx) if P_squared[thread_idx, i, 0] == np.inf: # pragma: no cover I[thread_idx, i, 0] = -1 @@ -151,19 +152,21 @@ def _compute_PI( idx = np.searchsorted( P_squared[thread_idx, i + g], D_squared, side="right" ) - core._shift_insert_at_index( - P_squared[thread_idx, i + g], idx, D_squared - ) - core._shift_insert_at_index(I[thread_idx, i + g], idx, j + g) + if (j + g) not in I[thread_idx, i + g, :idx]: + core._shift_insert_at_index( + 
P_squared[thread_idx, i + g], idx, D_squared + ) + core._shift_insert_at_index(I[thread_idx, i + g], idx, j + g) if excl_zone is not None and D_squared < P_squared[thread_idx, j + g, -1]: idx = np.searchsorted( P_squared[thread_idx, j + g], D_squared, side="right" ) - core._shift_insert_at_index( - P_squared[thread_idx, j + g], idx, D_squared - ) - core._shift_insert_at_index(I[thread_idx, j + g], idx, i + g) + if (i + g) not in I[thread_idx, j + g, :idx]: + core._shift_insert_at_index( + P_squared[thread_idx, j + g], idx, D_squared + ) + core._shift_insert_at_index(I[thread_idx, j + g], idx, i + g) QT_j = QT_j_prime # Update top-k for both subsequences `S[i-g] = T[i-g:i-g+m]` and @@ -183,19 +186,21 @@ def _compute_PI( idx = np.searchsorted( P_squared[thread_idx, i - g], D_squared, side="right" ) - core._shift_insert_at_index( - P_squared[thread_idx, i - g], idx, D_squared - ) - core._shift_insert_at_index(I[thread_idx, i - g], idx, j - g) + if (j - g) not in I[thread_idx, i - g, :idx]: + core._shift_insert_at_index( + P_squared[thread_idx, i - g], idx, D_squared + ) + core._shift_insert_at_index(I[thread_idx, i - g], idx, j - g) if excl_zone is not None and D_squared < P_squared[thread_idx, j - g, -1]: idx = np.searchsorted( P_squared[thread_idx, j - g], D_squared, side="right" ) - core._shift_insert_at_index( - P_squared[thread_idx, j - g], idx, D_squared - ) - core._shift_insert_at_index(I[thread_idx, j - g], idx, i - g) + if (i - g) not in I[thread_idx, j - g, :idx]: + core._shift_insert_at_index( + P_squared[thread_idx, j - g], idx, D_squared + ) + core._shift_insert_at_index(I[thread_idx, j - g], idx, i - g) # In the case of a self-join, the calculated distance profile can also be # used to refine the top-k for all non-trivial subsequences @@ -212,10 +217,11 @@ def _compute_PI( idx = np.searchsorted( P_squared[thread_idx, j], squared_distance_profile[j], side="right" ) - core._shift_insert_at_index( - P_squared[thread_idx, j], idx, squared_distance_profile[j] - ) - core._shift_insert_at_index(I[thread_idx, j], idx, i) + if i not in I[thread_idx, j, :idx]: + core._shift_insert_at_index( + P_squared[thread_idx, j], idx, squared_distance_profile[j] + ) + core._shift_insert_at_index(I[thread_idx, j], idx, i) @njit( @@ -337,7 +343,13 @@ def _prescrump( ) for thread_idx in range(1, n_threads): - core._merge_topk_PI(P_squared[0], P_squared[thread_idx], I[0], I[thread_idx]) + core._merge_topk_PI( + P_squared[0], + P_squared[thread_idx], + I[0], + I[thread_idx], + assume_unique=False, + ) return np.sqrt(P_squared[0]), I[0] From 3d68ae70c1c0606ea8d2c7102261c57f62a072e5 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 19 Jul 2022 13:12:08 -0600 Subject: [PATCH 310/416] Avoid duplocates in scrump --- stumpy/scrump.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 4c4ce4dc7..dee399334 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -700,7 +700,7 @@ def __init__( else: P, I = prescrump(T_A, m, T_B=T_B, s=s, k=self._k) - core._merge_topk_PI(self._P, P, self._I, I) + core._merge_topk_PI(self._P, P, self._I, I, assume_unique=False) if self._ignore_trivial: self._diags = np.random.permutation( @@ -758,7 +758,7 @@ def update(self): ) # Update (top-k) matrix profile and indices - core._merge_topk_PI(self._P, P, self._I, I) + core._merge_topk_PI(self._P, P, self._I, I, assume_unique=False) # update left matrix profile and indices mask = PL < self._PL From 4c171198e74858ac4ef9f32b317341cc465b562b Mon Sep 17 00:00:00 2001 From: 
ninimama Date: Tue, 19 Jul 2022 13:14:23 -0600 Subject: [PATCH 311/416] Revise test function to consider new parameter --- tests/test_scrump.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/test_scrump.py b/tests/test_scrump.py index 3eb298bf4..7b0ca6393 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -373,7 +373,7 @@ def test_scrump_plus_plus_self_join(T_A, T_B, percentages): ref_P_aux, ref_I_aux, _, _ = naive.scrump( T_B, m, T_B, percentage, zone, True, s ) - naive.merge_topk_PI(ref_P, ref_P_aux, ref_I, ref_I_aux) + naive.merge_topk_PI(ref_P, ref_P_aux, ref_I, ref_I_aux, assume_unique=False) np.random.seed(seed) approx = scrump( @@ -406,7 +406,7 @@ def test_scrump_plus_plus_A_B_join(T_A, T_B, percentages): ref_P_aux, ref_I_aux, ref_left_I_aux, ref_right_I_aux = naive.scrump( T_A, m, T_B, percentage, None, False, None ) - naive.merge_topk_PI(ref_P, ref_P_aux, ref_I, ref_I_aux) + naive.merge_topk_PI(ref_P, ref_P_aux, ref_I, ref_I_aux, assume_unique=False) ref_left_I = ref_left_I_aux ref_right_I = ref_right_I_aux @@ -793,7 +793,9 @@ def test_scrump_plus_plus_self_join_KNN(T_A, T_B, percentages): ref_P_aux, ref_I_aux, _, _ = naive.scrump( T_B, m, T_B, percentage, zone, True, s, k=k ) - naive.merge_topk_PI(ref_P, ref_P_aux, ref_I, ref_I_aux) + naive.merge_topk_PI( + ref_P, ref_P_aux, ref_I, ref_I_aux, assume_unique=False + ) np.random.seed(seed) approx = scrump( From 2c662a93e47d66caf9dac4eb6b0fe0ed16f09460 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 19 Jul 2022 13:27:22 -0600 Subject: [PATCH 312/416] Fix bug --- stumpy/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index eebd6930e..20f8d2853 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2644,8 +2644,8 @@ def _merge_topk_PI(PA, PB, IA, IB, assume_unique=True): else: if ( assume_unique - or abs(PA[i, bj] - prev_val) > 1e-6 - or IB[i, bj] not in tmp_I[:idx] + or abs(PA[i, aj] - prev_val) > 1e-6 + or IA[i, aj] not in tmp_I[:idx] ): tmp_P[idx] = PA[i, aj] tmp_I[idx] = IA[i, aj] From 3a0f4dacf750474cdee8f98fb95e265d356b27e2 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 19 Jul 2022 13:30:27 -0600 Subject: [PATCH 313/416] Revise naive scrump to avoid duplicates --- tests/naive.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 87127dd13..9a6070ef3 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1518,13 +1518,15 @@ def scrump(T_A, m, T_B, percentage, exclusion_zone, pre_scrump, s, k=1): d = dist_matrix[i, j] if d < P[i, -1]: # update TopK of P[i] idx = searchsorted_right(P[i], d) - P[i] = np.insert(P[i], idx, d)[:-1] - I[i] = np.insert(I[i], idx, i + g)[:-1] + if (i + g) not in I[i, :idx]: + P[i] = np.insert(P[i], idx, d)[:-1] + I[i] = np.insert(I[i], idx, i + g)[:-1] if exclusion_zone is not None and d < P[i + g, -1]: idx = searchsorted_right(P[i + g], d) - P[i + g] = np.insert(P[i + g], idx, d)[:-1] - I[i + g] = np.insert(I[i + g], idx, i)[:-1] + if i not in I[i + g, :idx]: + P[i + g] = np.insert(P[i + g], idx, d)[:-1] + I[i + g] = np.insert(I[i + g], idx, i)[:-1] # left matrix profile and left matrix profile indices if exclusion_zone is not None and i < i + g and d < PL[i + g]: From d8728c98702efd9b67d9eee4edf6ab746dd22180 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 19 Jul 2022 13:41:18 -0600 Subject: [PATCH 314/416] Add comment --- stumpy/scrump.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/stumpy/scrump.py b/stumpy/scrump.py 
index dee399334..387787f16 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -116,6 +116,8 @@ def _compute_PI( nn_idx = np.argmin(squared_distance_profile) if nn_idx not in I[thread_idx, i]: + # It is more than likely that the top-k values for the `i`-th subsequence + # will be already populated. So, we must shift-insert here core._shift_insert_at_index( P_squared[thread_idx, i], 0, squared_distance_profile[nn_idx] ) From 561b4281f73ab35102ff9bd2b64033e305113651 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 19 Jul 2022 13:42:06 -0600 Subject: [PATCH 315/416] minor optimization --- stumpy/core.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 20f8d2853..30ffabd69 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2633,7 +2633,8 @@ def _merge_topk_PI(PA, PB, IA, IB, assume_unique=True): if ( assume_unique or abs(PB[i, bj] - prev_val) > 1e-6 - or IB[i, bj] not in tmp_I[:idx] + or IB[i, bj] not in tmp_I[:idx][::-1] # traverse in reverse to + # find duplicate in shorter time ): tmp_P[idx] = PB[i, bj] tmp_I[idx] = IB[i, bj] @@ -2645,7 +2646,8 @@ def _merge_topk_PI(PA, PB, IA, IB, assume_unique=True): if ( assume_unique or abs(PA[i, aj] - prev_val) > 1e-6 - or IA[i, aj] not in tmp_I[:idx] + or IA[i, aj] not in tmp_I[:idx][::-1] # traverse in reverse to + # find duplicate in shorter time ): tmp_P[idx] = PA[i, aj] tmp_I[idx] = IA[i, aj] From 44b85a826b1b4854e099e38da0a4893776ad3d73 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 19 Jul 2022 13:43:30 -0600 Subject: [PATCH 316/416] Correct style --- stumpy/scrump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 387787f16..a224f2a1c 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -117,7 +117,7 @@ def _compute_PI( nn_idx = np.argmin(squared_distance_profile) if nn_idx not in I[thread_idx, i]: # It is more than likely that the top-k values for the `i`-th subsequence - # will be already populated. So, we must shift-insert here + # will be already populated. 
So, we must shift-insert here core._shift_insert_at_index( P_squared[thread_idx, i], 0, squared_distance_profile[nn_idx] ) From dbdc7c9fc39670d65e079adf37a5e02a9a9e6bea Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 19 Jul 2022 13:48:09 -0600 Subject: [PATCH 317/416] Correct style --- stumpy/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 30ffabd69..bbb4524fd 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2633,7 +2633,7 @@ def _merge_topk_PI(PA, PB, IA, IB, assume_unique=True): if ( assume_unique or abs(PB[i, bj] - prev_val) > 1e-6 - or IB[i, bj] not in tmp_I[:idx][::-1] # traverse in reverse to + or IB[i, bj] not in tmp_I[:idx][::-1] # traverse in reverse to # find duplicate in shorter time ): tmp_P[idx] = PB[i, bj] @@ -2646,7 +2646,7 @@ def _merge_topk_PI(PA, PB, IA, IB, assume_unique=True): if ( assume_unique or abs(PA[i, aj] - prev_val) > 1e-6 - or IA[i, aj] not in tmp_I[:idx][::-1] # traverse in reverse to + or IA[i, aj] not in tmp_I[:idx][::-1] # traverse in reverse to # find duplicate in shorter time ): tmp_P[idx] = PA[i, aj] From 19129ab7c2a0add61d8cb5a8e905b3baa528fecf Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 19 Jul 2022 13:56:33 -0600 Subject: [PATCH 318/416] increase threshold --- stumpy/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index bbb4524fd..e1db51fd3 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2632,7 +2632,7 @@ def _merge_topk_PI(PA, PB, IA, IB, assume_unique=True): if PB[i, bj] < PA[i, aj]: if ( assume_unique - or abs(PB[i, bj] - prev_val) > 1e-6 + or abs(PB[i, bj] - prev_val) > 1e-3 or IB[i, bj] not in tmp_I[:idx][::-1] # traverse in reverse to # find duplicate in shorter time ): @@ -2645,7 +2645,7 @@ def _merge_topk_PI(PA, PB, IA, IB, assume_unique=True): else: if ( assume_unique - or abs(PA[i, aj] - prev_val) > 1e-6 + or abs(PA[i, aj] - prev_val) > 1e-3 or IA[i, aj] not in tmp_I[:idx][::-1] # traverse in reverse to # find duplicate in shorter time ): From 5d96bbdd5a194efb3ed952ece98f341ce9650c7e Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 19 Jul 2022 16:35:08 -0600 Subject: [PATCH 319/416] Specifiy kind in sort --- tests/naive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/naive.py b/tests/naive.py index 9a6070ef3..466ebadf6 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1807,7 +1807,7 @@ def merge_topk_PI(PA, PB, IA, IB, assume_unique=True): profile = np.column_stack((PA, PB)) indices = np.column_stack((IA, IB)) - IDX = np.argsort(profile, axis=1) + IDX = np.argsort(profile, axis=1, kind="mergesort") if assume_unique: profile[:, :] = np.take_along_axis(profile, IDX, axis=1) indices[:, :] = np.take_along_axis(indices, IDX, axis=1) From d3a9b3175e9224d7802610040da8ab9694265723 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 19 Jul 2022 17:32:44 -0600 Subject: [PATCH 320/416] minor change --- tests/test_scrump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_scrump.py b/tests/test_scrump.py index 7b0ca6393..5add152cc 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -677,8 +677,8 @@ def test_prescrump_self_join_KNN(T_A, T_B): np.random.seed(seed) comp_P, comp_I = prescrump(T_B, m, s=s, k=k) - npt.assert_almost_equal(ref_P, comp_P) npt.assert_almost_equal(ref_I, comp_I) + npt.assert_almost_equal(ref_P, comp_P) @pytest.mark.parametrize("T_A, T_B", test_data) From 970efc7a0b881adf4b076c9b30e412424c9f0c42 Mon Sep 17 00:00:00 
2001 From: ninimama Date: Wed, 20 Jul 2022 12:40:30 -0600 Subject: [PATCH 321/416] specify kind in sort --- tests/naive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/naive.py b/tests/naive.py index 466ebadf6..d602c9bf5 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1850,7 +1850,7 @@ def merge_topk_ρI(ρA, ρB, IA, IB): profile = np.column_stack((ρB, ρA)) indices = np.column_stack((IB, IA)) - idx = np.argsort(profile, axis=1) + idx = np.argsort(profile, axis=1, kind="mergesort") profile[:, :] = np.take_along_axis(profile, idx, axis=1) indices[:, :] = np.take_along_axis(indices, idx, axis=1) From cd7fe1a4d792d8028db3a2192e57c93ebef6bc46 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 20 Jul 2022 12:44:22 -0600 Subject: [PATCH 322/416] minor changes --- tests/naive.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index d602c9bf5..c23e36a3e 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1426,10 +1426,10 @@ def prescrump(T_A, m, T_B, s, exclusion_zone=None, k=1): if exclusion_zone is not None: apply_exclusion_zone(distance_profile, i, exclusion_zone, np.inf) - idx = np.argmin(distance_profile) - if idx not in I[i]: + nn_idx = np.argmin(distance_profile) + if nn_idx not in I[i]: I[i, 1:] = I[i, :-1] - I[i, 0] = idx + I[i, 0] = nn_idx P[i, 1:] = P[i, :-1] P[i, 0] = distance_profile[I[i, 0]] @@ -1440,17 +1440,17 @@ def prescrump(T_A, m, T_B, s, exclusion_zone=None, k=1): I[i, 0] = -1 continue - j = I[i, 0] # index of 1st NN + j = nn_idx for g in range(1, min(s, l - max(i, j))): d = dist_matrix[i + g, j + g] if d < P[i + g, -1]: pos = np.searchsorted(P[i + g], d, side="right") - if (j + g) not in I[i + g, :pos]: + if (j + g) not in I[i + g, :]: P[i + g] = np.insert(P[i + g], pos, d)[:-1] I[i + g] = np.insert(I[i + g], pos, j + g)[:-1] if exclusion_zone is not None and d < P[j + g, -1]: pos = np.searchsorted(P[j + g], d, side="right") - if (i + g) not in I[j + g, :pos]: + if (i + g) not in I[j + g, :]: P[j + g] = np.insert(P[j + g], pos, d)[:-1] I[j + g] = np.insert(I[j + g], pos, i + g)[:-1] @@ -1458,12 +1458,12 @@ def prescrump(T_A, m, T_B, s, exclusion_zone=None, k=1): d = dist_matrix[i - g, j - g] if d < P[i - g, -1]: pos = np.searchsorted(P[i - g], d, side="right") - if (j - g) not in I[i - g, :pos]: + if (j - g) not in I[i - g, :]: P[i - g] = np.insert(P[i - g], pos, d)[:-1] I[i - g] = np.insert(I[i - g], pos, j - g)[:-1] if exclusion_zone is not None and d < P[j - g, -1]: pos = np.searchsorted(P[j - g], d, side="right") - if (i - g) not in I[j - g, :pos]: + if (i - g) not in I[j - g, :]: P[j - g] = np.insert(P[j - g], pos, d)[:-1] I[j - g] = np.insert(I[j - g], pos, i - g)[:-1] @@ -1472,7 +1472,7 @@ def prescrump(T_A, m, T_B, s, exclusion_zone=None, k=1): if exclusion_zone is not None: for idx in np.flatnonzero(distance_profile < P[:, -1]): pos = np.searchsorted(P[idx], distance_profile[idx], side="right") - if i not in I[idx, :pos]: + if i not in I[idx, :]: P[idx] = np.insert(P[idx], pos, distance_profile[idx])[:-1] I[idx] = np.insert(I[idx], pos, i)[:-1] @@ -1518,13 +1518,13 @@ def scrump(T_A, m, T_B, percentage, exclusion_zone, pre_scrump, s, k=1): d = dist_matrix[i, j] if d < P[i, -1]: # update TopK of P[i] idx = searchsorted_right(P[i], d) - if (i + g) not in I[i, :idx]: + if (i + g) not in I[i, :]: P[i] = np.insert(P[i], idx, d)[:-1] I[i] = np.insert(I[i], idx, i + g)[:-1] if exclusion_zone is not None and d < P[i + g, -1]: idx = searchsorted_right(P[i + g], d) - if i 
not in I[i + g, :idx]: + if i not in I[i + g, :]: P[i + g] = np.insert(P[i + g], idx, d)[:-1] I[i + g] = np.insert(I[i + g], idx, i)[:-1] From 6ca36d01abe376098f92e4b049ca7b8d56e7c198 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 20 Jul 2022 12:59:04 -0600 Subject: [PATCH 323/416] De-otpimize if condition Due to numerical erorrs, we need to avoid partial traversal of array --- stumpy/scrump.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index a224f2a1c..a94d46280 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -127,7 +127,7 @@ def _compute_PI( I[thread_idx, i, 0] = -1 continue - j = I[thread_idx, i, 0] + j = nn_idx # Given the squared distance, work backwards and compute QT QT_j = (m - P_squared[thread_idx, i, 0] / 2.0) * (Σ_T[j] * σ_Q[i]) + ( m * M_T[j] * μ_Q[i] @@ -154,7 +154,7 @@ def _compute_PI( idx = np.searchsorted( P_squared[thread_idx, i + g], D_squared, side="right" ) - if (j + g) not in I[thread_idx, i + g, :idx]: + if (j + g) not in I[thread_idx, i + g]: core._shift_insert_at_index( P_squared[thread_idx, i + g], idx, D_squared ) @@ -164,7 +164,7 @@ def _compute_PI( idx = np.searchsorted( P_squared[thread_idx, j + g], D_squared, side="right" ) - if (i + g) not in I[thread_idx, j + g, :idx]: + if (i + g) not in I[thread_idx, j + g]: core._shift_insert_at_index( P_squared[thread_idx, j + g], idx, D_squared ) @@ -188,7 +188,7 @@ def _compute_PI( idx = np.searchsorted( P_squared[thread_idx, i - g], D_squared, side="right" ) - if (j - g) not in I[thread_idx, i - g, :idx]: + if (j - g) not in I[thread_idx, i - g]: core._shift_insert_at_index( P_squared[thread_idx, i - g], idx, D_squared ) @@ -198,7 +198,7 @@ def _compute_PI( idx = np.searchsorted( P_squared[thread_idx, j - g], D_squared, side="right" ) - if (i - g) not in I[thread_idx, j - g, :idx]: + if (i - g) not in I[thread_idx, j - g]: core._shift_insert_at_index( P_squared[thread_idx, j - g], idx, D_squared ) @@ -219,7 +219,7 @@ def _compute_PI( idx = np.searchsorted( P_squared[thread_idx, j], squared_distance_profile[j], side="right" ) - if i not in I[thread_idx, j, :idx]: + if i not in I[thread_idx, j]: core._shift_insert_at_index( P_squared[thread_idx, j], idx, squared_distance_profile[j] ) From 5d930b236ac2331e3b7efab65e14a010572c134f Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 20 Jul 2022 13:20:45 -0600 Subject: [PATCH 324/416] Update scrump --- stumpy/scrump.py | 80 ++++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index a94d46280..484a1867b 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -19,10 +19,10 @@ def _compute_PI( T_A, T_B, m, - M_T, - Σ_T, μ_Q, σ_Q, + M_T, + Σ_T, indices, start, stop, @@ -49,18 +49,17 @@ def _compute_PI( m : int Window size - M_T : numpy.ndarray - Sliding window mean for T_A - - Σ_T : numpy.ndarray - Sliding window standard deviation for T_A - μ_Q : numpy.ndarray - Mean of the query sequence, `Q`, relative to the current sliding window in `T_B` + Sliding window mean for `T_A` σ_Q : numpy.ndarray - Standard deviation of the query sequence, `Q`, relative to the current - sliding window in `T_B` + Sliding window standard deviation for `T_A` + + M_T : numpy.ndarray + Sliding window mean for `T_B` + + Σ_T : numpy.ndarray + Sliding window standard deviation for `T_B` indices : numpy.ndarray The subsequence indices to compute `prescrump` for @@ -103,9 +102,10 @@ def _compute_PI( See Algorithm 2 """ - l = 
T_B.shape[0] - m + 1 - squared_distance_profile = np.empty(l) - QT = np.empty(l, dtype=np.float64) + l = T_A.shape[0] - m + 1 # length of matrix profile + w = T_B.shape[0] - m + 1 # length of distance profile + squared_distance_profile = np.empty(w) + QT = np.empty(w, dtype=np.float64) for i in indices[start:stop]: Q = T_A[i : i + m] QT[:] = core._sliding_dot_product(Q, T_B) @@ -136,19 +136,19 @@ def _compute_PI( # Update top-k for both subsequences `S[i+g] = T[i+g:i+g+m]`` and # `S[j+g] = T[j+g:j+g+m]` (i.e., the right neighbors of `T[i : i+m]` and # `T[j:j+m]`) by using the distance between `S[i+g]` and `S[j+g]` - for g in range(1, min(s, l - max(i, j))): + for g in range(1, min(s, l - l - i, w - j)): QT_j = ( QT_j - - T_B[i + g - 1] * T_A[j + g - 1] - + T_B[i + g + m - 1] * T_A[j + g + m - 1] + - T_B[j + g - 1] * T_A[i + g - 1] + + T_B[j + g + m - 1] * T_A[i + g + m - 1] ) D_squared = core._calculate_squared_distance( m, QT_j, - M_T[i + g], - Σ_T[i + g], - μ_Q[j + g], - σ_Q[j + g], + M_T[j + g], + Σ_T[j + g], + μ_Q[i + g], + σ_Q[i + g], ) if D_squared < P_squared[thread_idx, i + g, -1]: idx = np.searchsorted( @@ -175,14 +175,14 @@ def _compute_PI( # `S[j-g] = T[j-g:j-g+m]` (i.e., the left neighbors of `T[i : i+m]` and # `T[j:j+m]`) by using the distance between `S[i-g]` and `S[j-g]` for g in range(1, min(s, i + 1, j + 1)): - QT_j = QT_j - T_B[i - g + m] * T_A[j - g + m] + T_B[i - g] * T_A[j - g] + QT_j = QT_j - T_B[j - g + m] * T_A[i - g + m] + T_B[j - g] * T_A[i - g] D_squared = core._calculate_squared_distance( m, QT_j, - M_T[i - g], - Σ_T[i - g], - μ_Q[j - g], - σ_Q[j - g], + M_T[j - g], + Σ_T[j - g], + μ_Q[i - g], + σ_Q[i - g], ) if D_squared < P_squared[thread_idx, i - g, -1]: idx = np.searchsorted( @@ -236,10 +236,10 @@ def _prescrump( T_A, T_B, m, - M_T, - Σ_T, μ_Q, σ_Q, + M_T, + Σ_T, indices, s, excl_zone=None, @@ -260,18 +260,18 @@ def _prescrump( m : int Window size + μ_Q : numpy.ndarray + Sliding window mean for `T_A` + + σ_Q : numpy.ndarray + Sliding window standard deviation for `T_A` + M_T : numpy.ndarray - Sliding window mean for T_A + Sliding window mean for `T_B` Σ_T : numpy.ndarray - Sliding window standard deviation for T_A + Sliding window standard deviation for `T_B` - μ_Q : numpy.ndarray - Mean of the query sequence, `Q`, relative to the current sliding window in `T_B` - - σ_Q : numpy.ndarray - Standard deviation of the query sequence, `Q`, relative to the current - sliding window in `T_B` indices : numpy.ndarray The subsequence indices to compute `prescrump` for @@ -329,10 +329,10 @@ def _prescrump( T_A, T_B, m, - M_T, - Σ_T, μ_Q, σ_Q, + M_T, + Σ_T, indices, idx_ranges[thread_idx, 0], idx_ranges[thread_idx, 1], @@ -433,10 +433,10 @@ def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0, k=1): T_A, T_B, m, - M_T, - Σ_T, μ_Q, σ_Q, + M_T, + Σ_T, indices, s, excl_zone, From 4b5876512142fc0e921d295e6ee58ea683b1f548 Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 20 Jul 2022 13:32:43 -0600 Subject: [PATCH 325/416] minor changes --- stumpy/scrump.py | 2 +- tests/test_scrump.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 484a1867b..ea2aca927 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -136,7 +136,7 @@ def _compute_PI( # Update top-k for both subsequences `S[i+g] = T[i+g:i+g+m]`` and # `S[j+g] = T[j+g:j+g+m]` (i.e., the right neighbors of `T[i : i+m]` and # `T[j:j+m]`) by using the distance between `S[i+g]` and `S[j+g]` - for g in range(1, min(s, l - l - i, w - j)): + for g in range(1, 
min(s, l - i, w - j)): QT_j = ( QT_j - T_B[j + g - 1] * T_A[i + g - 1] diff --git a/tests/test_scrump.py b/tests/test_scrump.py index 3a635a07b..6f9ee719c 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -662,6 +662,7 @@ def test_scrump_nan_zero_mean_self_join(percentages): npt.assert_almost_equal(ref_left_I, comp_left_I) npt.assert_almost_equal(ref_right_I, comp_right_I) + @pytest.mark.parametrize("T_A, T_B", test_data) def test_prescrump_A_B_join_larger_window(T_A, T_B): m = 5 @@ -679,6 +680,7 @@ def test_prescrump_A_B_join_larger_window(T_A, T_B): npt.assert_almost_equal(ref_P, comp_P) npt.assert_almost_equal(ref_I, comp_I) + @pytest.mark.parametrize("T_A, T_B", test_data) def test_prescrump_self_join_KNN(T_A, T_B): m = 3 From aaa8ff7bbe07bf4205e1d04476e11c95d01d038b Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 20 Jul 2022 13:39:55 -0600 Subject: [PATCH 326/416] add new test function --- tests/test_scrump.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/test_scrump.py b/tests/test_scrump.py index 6f9ee719c..8b19b40a9 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -854,3 +854,22 @@ def test_prescrump_self_join_larger_window_m_5_k_5(T_A, T_B): npt.assert_almost_equal(ref_P, comp_P) npt.assert_almost_equal(ref_I, comp_I) + + +@pytest.mark.parametrize("T_A, T_B", test_data) +def test_prescrump_A_B_join_larger_window_m_5_k_5(T_A, T_B): + m = 5 + k = 5 + zone = int(np.ceil(m / 4)) + if len(T_A) > m and len(T_B) > m: + for s in range(1, zone + 1): + seed = np.random.randint(100000) + + np.random.seed(seed) + ref_P, ref_I = naive.prescrump(T_A, m, T_B, s=s, k=k) + + np.random.seed(seed) + comp_P, comp_I = prescrump(T_A, m, T_B, s=s, k=k) + + npt.assert_almost_equal(ref_P, comp_P) + npt.assert_almost_equal(ref_I, comp_I) From 6c8eddcd2d8dfed25abca15761303018d059640c Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 20 Jul 2022 14:52:36 -0600 Subject: [PATCH 327/416] optimize if condition --- stumpy/scrump.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index ea2aca927..d5fe898cc 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -154,7 +154,13 @@ def _compute_PI( idx = np.searchsorted( P_squared[thread_idx, i + g], D_squared, side="right" ) - if (j + g) not in I[thread_idx, i + g]: + # Due to numerical error, it is possible that the element that is + # about to insert at idx is identical to an element of array located + # at idx, idx + 1, .... Hence, we should traverse full array. + # This is optimized in the if conditon. 
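A side note on the pattern used in this hunk: each top-k row is kept sorted, a new distance is positioned with np.searchsorted, and everything to its right is shifted down by one. Because round-off can put a recomputed distance on either side of an existing entry for the same index, the duplicate check has to look at the whole index row rather than only the part left of the insertion point. A small self-contained illustration follows; the shift_insert helper is an out-of-place stand-in, not the library's _shift_insert_at_index.

import numpy as np

def shift_insert(arr, idx, value):
    # Stand-in for a sorted top-k update: insert `value` at position `idx`
    # and drop the last (now surplus) element.
    return np.concatenate((arr[:idx], [value], arr[idx:-1]))

P = np.array([1.0, 1.0000001, 5.0])   # sorted top-3 distances (near-tie)
I = np.array([11, 42, 19])            # their subsequence indices

# A recomputed distance to index 42 differs only by round-off, and
# searchsorted places it to the RIGHT of the existing entry for 42, so the
# duplicate check must scan the whole index row, not just I[:pos].
d, j = 1.0000002, 42
pos = np.searchsorted(P, d, side="right")
if d < P[-1] and j not in I:
    P, I = shift_insert(P, pos, d), shift_insert(I, pos, j)

# A genuinely new neighbor is inserted and the worst entry falls off.
d, j = 0.5, 27
pos = np.searchsorted(P, d, side="right")
if d < P[-1] and j not in I:
    P, I = shift_insert(P, pos, d), shift_insert(I, pos, j)

# P is now [0.5, 1.0, 1.0000001] and I is [27, 11, 42]
print(P, I)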
+ if ((j + g) not in I[thread_idx, i + g, :idx][::-1]) and ( + (j + g) not in I[thread_idx, i + g, idx:] + ): core._shift_insert_at_index( P_squared[thread_idx, i + g], idx, D_squared ) @@ -164,7 +170,9 @@ def _compute_PI( idx = np.searchsorted( P_squared[thread_idx, j + g], D_squared, side="right" ) - if (i + g) not in I[thread_idx, j + g]: + if ((i + g) not in I[thread_idx, j + g, :idx][::-1]) and ( + (i + g) not in I[thread_idx, j + g, idx:] + ): core._shift_insert_at_index( P_squared[thread_idx, j + g], idx, D_squared ) @@ -188,7 +196,9 @@ def _compute_PI( idx = np.searchsorted( P_squared[thread_idx, i - g], D_squared, side="right" ) - if (j - g) not in I[thread_idx, i - g]: + if ((j - g) not in I[thread_idx, i - g, :idx][::-1]) and ( + (j - g) not in I[thread_idx, i - g, idx:] + ): core._shift_insert_at_index( P_squared[thread_idx, i - g], idx, D_squared ) @@ -198,7 +208,9 @@ def _compute_PI( idx = np.searchsorted( P_squared[thread_idx, j - g], D_squared, side="right" ) - if (i - g) not in I[thread_idx, j - g]: + if ((i - g) not in I[thread_idx, j - g, :idx][::-1]) and ( + (i - g) not in I[thread_idx, j - g, idx:] + ): core._shift_insert_at_index( P_squared[thread_idx, j - g], idx, D_squared ) @@ -219,7 +231,9 @@ def _compute_PI( idx = np.searchsorted( P_squared[thread_idx, j], squared_distance_profile[j], side="right" ) - if i not in I[thread_idx, j]: + if (i not in I[thread_idx, j, :idx][::-1]) and ( + i not in I[thread_idx, j, idx:] + ): core._shift_insert_at_index( P_squared[thread_idx, j], idx, squared_distance_profile[j] ) From 5bb6879a7e464b9cffb76952c893730c97d243d6 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 21 Jul 2022 16:54:09 -0600 Subject: [PATCH 328/416] Give priority to PA in case of ties between IA and IB --- tests/naive.py | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index dc9e01b8a..8dd1ef6f9 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1802,29 +1802,20 @@ def _total_diagonal_ndists(tile_lower_diag, tile_upper_diag, tile_height, tile_w return total_ndists -def merge_topk_PI(PA, PB, IA, IB, assume_unique=True): +def merge_topk_PI(PA, PB, IA, IB): k = PA.shape[1] + for i in range(PA.shape[0]): + _, _, overlap_idx_B = np.intersect1d(IA[i], IB[i]) + PB[i, overlap_idx_B] = np.inf + profile = np.column_stack((PA, PB)) indices = np.column_stack((IA, IB)) - IDX = np.argsort(profile, axis=1, kind="mergesort") - if assume_unique: - profile[:, :] = np.take_along_axis(profile, IDX, axis=1) - indices[:, :] = np.take_along_axis(indices, IDX, axis=1) + profile[:, :] = np.take_along_axis(profile, IDX, axis=1) + indices[:, :] = np.take_along_axis(indices, IDX, axis=1) - PA[:, :] = profile[:, :k] - IA[:, :] = indices[:, :k] - else: - # avoid duplicates while merging IA[i] and IB[i] - IDX_merged = np.full_like(PA, -1, dtype=np.int64) - for i, idx in enumerate(IDX): - _, arg_unique = np.unique(indices[i, idx], return_index=True) - arg_unique = np.sort(arg_unique)[:k] # preserving order of their appearence - idx = idx[arg_unique] - IDX_merged[i, : len(idx)] = idx - - PA[:, :] = np.take_along_axis(profile, IDX_merged, axis=1) - IA[:, :] = np.take_along_axis(indices, IDX_merged, axis=1) + PA[:, :] = profile[:, :k] + IA[:, :] = indices[:, :k] def merge_topk_ρI(ρA, ρB, IA, IB): From fc10e8acb94a4c34b1d7c4327bcc8bf8be9afb2d Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 21 Jul 2022 16:56:54 -0600 Subject: [PATCH 329/416] Remove trailing colon --- tests/naive.py | 14 +++++++------- 1 file 
changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 8dd1ef6f9..64bad3caa 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1445,12 +1445,12 @@ def prescrump(T_A, m, T_B, s, exclusion_zone=None, k=1): d = dist_matrix[i + g, j + g] if d < P[i + g, -1]: pos = np.searchsorted(P[i + g], d, side="right") - if (j + g) not in I[i + g, :]: + if (j + g) not in I[i + g]: P[i + g] = np.insert(P[i + g], pos, d)[:-1] I[i + g] = np.insert(I[i + g], pos, j + g)[:-1] if exclusion_zone is not None and d < P[j + g, -1]: pos = np.searchsorted(P[j + g], d, side="right") - if (i + g) not in I[j + g, :]: + if (i + g) not in I[j + g]: P[j + g] = np.insert(P[j + g], pos, d)[:-1] I[j + g] = np.insert(I[j + g], pos, i + g)[:-1] @@ -1458,12 +1458,12 @@ def prescrump(T_A, m, T_B, s, exclusion_zone=None, k=1): d = dist_matrix[i - g, j - g] if d < P[i - g, -1]: pos = np.searchsorted(P[i - g], d, side="right") - if (j - g) not in I[i - g, :]: + if (j - g) not in I[i - g]: P[i - g] = np.insert(P[i - g], pos, d)[:-1] I[i - g] = np.insert(I[i - g], pos, j - g)[:-1] if exclusion_zone is not None and d < P[j - g, -1]: pos = np.searchsorted(P[j - g], d, side="right") - if (i - g) not in I[j - g, :]: + if (i - g) not in I[j - g]: P[j - g] = np.insert(P[j - g], pos, d)[:-1] I[j - g] = np.insert(I[j - g], pos, i - g)[:-1] @@ -1472,7 +1472,7 @@ def prescrump(T_A, m, T_B, s, exclusion_zone=None, k=1): if exclusion_zone is not None: for idx in np.flatnonzero(distance_profile < P[:, -1]): pos = np.searchsorted(P[idx], distance_profile[idx], side="right") - if i not in I[idx, :]: + if i not in I[idx]: P[idx] = np.insert(P[idx], pos, distance_profile[idx])[:-1] I[idx] = np.insert(I[idx], pos, i)[:-1] @@ -1518,13 +1518,13 @@ def scrump(T_A, m, T_B, percentage, exclusion_zone, pre_scrump, s, k=1): d = dist_matrix[i, j] if d < P[i, -1]: # update TopK of P[i] idx = searchsorted_right(P[i], d) - if (i + g) not in I[i, :]: + if (i + g) not in I[i]: P[i] = np.insert(P[i], idx, d)[:-1] I[i] = np.insert(I[i], idx, i + g)[:-1] if exclusion_zone is not None and d < P[i + g, -1]: idx = searchsorted_right(P[i + g], d) - if i not in I[i + g, :]: + if i not in I[i + g]: P[i + g] = np.insert(P[i + g], idx, d)[:-1] I[i + g] = np.insert(I[i + g], idx, i)[:-1] From ef1309bf6c698d791d0119cea7f47253ff2f61de Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 21 Jul 2022 17:01:19 -0600 Subject: [PATCH 330/416] update test function --- tests/test_core.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 4b448ce37..0b795235e 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1083,8 +1083,8 @@ def test_merge_topk_PI(): comp_P = PA.copy() comp_I = IA.copy() - naive.merge_topk_PI(ref_P, PB, ref_I, IB) - core._merge_topk_PI(comp_P, PB, comp_I, IB) + naive.merge_topk_PI(ref_P, PB.copy(), ref_I, IB.copy()) + core._merge_topk_PI(comp_P, PB.copy(), comp_I, IB.copy()) npt.assert_array_equal(ref_P, comp_P) npt.assert_array_equal(ref_I, comp_I) @@ -1098,12 +1098,14 @@ def test_merge_topk_PI(): IA = np.arange(n * k).reshape(n, k) IB = IA + n * k - col_idx_A = np.random.randint(0, k, size=n) - col_idx_B = np.random.randint(0, k, size=n) - for i in range(n): # creating random duplicates between A and B - PB[i, col_idx_B[i]] = PA[i, col_idx_A[i]] + np.random.rand(1) * 1e-8 - IB[i, col_idx_B[i]] = IA[i, col_idx_A[i]] + cols_idx_A = np.random.randint(0, k, size=n) + for i in range(n): + # create overlaps + IDX = np.random.choice(np.arange(k), 
cols_idx_A[i], replace=False) + PB[i, IDX] = PA[i, IDX] + IB[i, IDX]] = IA[i, IDX] + # sort each row of PA/PB (and update IA/IB accordingly) IDX = np.argsort(PA, axis=1) PA[:, :] = np.take_along_axis(PA, IDX, axis=1) IA[:, :] = np.take_along_axis(IA, IDX, axis=1) @@ -1118,8 +1120,8 @@ def test_merge_topk_PI(): comp_P = PA.copy() comp_I = IA.copy() - naive.merge_topk_PI(ref_P, PB, ref_I, IB, assume_unique=False) - core._merge_topk_PI(comp_P, PB, comp_I, IB, assume_unique=False) + naive.merge_topk_PI(ref_P, PB.copy(), ref_I, IB.copy()) + core._merge_topk_PI(comp_P, PB.copy(), comp_I, IB.copy()) npt.assert_array_equal(ref_P, comp_P) npt.assert_array_equal(ref_I, comp_I) From c2fe4d2bfb469d550f6b3d2f1b98e076ddc73bbc Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 21 Jul 2022 17:04:47 -0600 Subject: [PATCH 331/416] revise function to avoid adding new parameter --- stumpy/core.py | 39 +++++++-------------------------------- 1 file changed, 7 insertions(+), 32 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index e1db51fd3..f3eff3db6 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2576,7 +2576,7 @@ def _select_P_ABBA_value(P_ABBA, k, custom_func=None): @njit -def _merge_topk_PI(PA, PB, IA, IB, assume_unique=True): +def _merge_topk_PI(PA, PB, IA, IB): """ Merge two top-k matrix profiles `PA` and `PB`, and update `PA` (in place) while always prioritizing the values of `PA` over the values of `PB` in case of ties. @@ -2602,10 +2602,6 @@ def _merge_topk_PI(PA, PB, IA, IB, assume_unique=True): IB : numpy.ndarray A (top-k) matrix profile indices corresponding to `PB` - assume_unique : bool, default True - If True (default), each row of IA and its corresponding row in IB have no - duplicates. False otherwise. - Returns ------- None @@ -2614,45 +2610,24 @@ def _merge_topk_PI(PA, PB, IA, IB, assume_unique=True): tmp_P = np.empty(k, dtype=np.float64) tmp_I = np.empty(k, dtype=np.int64) for i in range(PA.shape[0]): + overlap = np.intersect1d(IA[i], IB[i]) aj, bj = 0, 0 idx = 0 - prev_val = np.inf for _ in range(2 * k): # 2 * k to traverse both A and B if idx >= k: break - if aj >= k: # PA is already fully traversed. - tmp_P[idx:] = PB[i, bj : bj + k - idx] - tmp_I[idx:] = IB[i, bj : bj + k - idx] - break - if bj >= k: # PB is already fully traversed. 
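Pulling the pieces of the naive reference together, the merge can also be written in a vectorized form: mask B's entries whose index already appears in the corresponding row of A, stack the columns, stable-sort each row, and keep the k smallest. The sketch below restates the tests/naive.py logic from these diffs as one self-contained function; the name merge_topk_rows is illustrative.

import numpy as np

def merge_topk_rows(PA, IA, PB, IB):
    # Row-wise top-k merge: A wins exact ties because its columns come
    # first and the mergesort-based argsort is stable.
    k = PA.shape[1]
    PB, IB = PB.copy(), IB.copy()
    for i in range(PA.shape[0]):
        # positions in IB[i] whose index already appears in IA[i]
        _, _, dup_B = np.intersect1d(IA[i], IB[i], return_indices=True)
        PB[i, dup_B] = np.inf          # overlapping entries never get picked
    P = np.column_stack((PA, PB))
    I = np.column_stack((IA, IB))
    idx = np.argsort(P, axis=1, kind="mergesort")
    P = np.take_along_axis(P, idx, axis=1)
    I = np.take_along_axis(I, idx, axis=1)
    return P[:, :k], I[:, :k]

PA, IA = np.array([[0.1, 0.5]]), np.array([[3, 9]])
PB, IB = np.array([[0.1, 0.2]]), np.array([[3, 7]])
print(merge_topk_rows(PA, IA, PB, IB))   # ([[0.1, 0.2]], [[3, 7]])

The stable sort is what lets the reference implementation express "prefer A on ties" without any explicit tie-breaking code.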
- tmp_P[idx:] = PA[i, aj : aj + k - idx] - tmp_I[idx:] = IA[i, aj : aj + k - idx] - break - if PB[i, bj] < PA[i, aj]: - if ( - assume_unique - or abs(PB[i, bj] - prev_val) > 1e-3 - or IB[i, bj] not in tmp_I[:idx][::-1] # traverse in reverse to - # find duplicate in shorter time - ): + if bj < k and PB[i, bj] < PA[i, aj]: + if IB[i, bj] not in overlap: tmp_P[idx] = PB[i, bj] tmp_I[idx] = IB[i, bj] - prev_val = tmp_P[idx] idx += 1 bj += 1 else: - if ( - assume_unique - or abs(PA[i, aj] - prev_val) > 1e-3 - or IA[i, aj] not in tmp_I[:idx][::-1] # traverse in reverse to - # find duplicate in shorter time - ): - tmp_P[idx] = PA[i, aj] - tmp_I[idx] = IA[i, aj] - prev_val = tmp_P[idx] - idx += 1 + tmp_P[idx] = PA[i, aj] + tmp_I[idx] = IA[i, aj] + idx += 1 aj += 1 PA[i] = tmp_P From 99806a9223cd3ed23a68eaf0c17487c0e71b70d1 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 21 Jul 2022 17:12:32 -0600 Subject: [PATCH 332/416] Update module scrump and improvee its readability --- stumpy/scrump.py | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index d5fe898cc..206c4e304 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -158,9 +158,7 @@ def _compute_PI( # about to insert at idx is identical to an element of array located # at idx, idx + 1, .... Hence, we should traverse full array. # This is optimized in the if conditon. - if ((j + g) not in I[thread_idx, i + g, :idx][::-1]) and ( - (j + g) not in I[thread_idx, i + g, idx:] - ): + if (j + g) not in I[thread_idx, i + g]: core._shift_insert_at_index( P_squared[thread_idx, i + g], idx, D_squared ) @@ -170,9 +168,7 @@ def _compute_PI( idx = np.searchsorted( P_squared[thread_idx, j + g], D_squared, side="right" ) - if ((i + g) not in I[thread_idx, j + g, :idx][::-1]) and ( - (i + g) not in I[thread_idx, j + g, idx:] - ): + if (i + g) not in I[thread_idx, j + g]: core._shift_insert_at_index( P_squared[thread_idx, j + g], idx, D_squared ) @@ -196,9 +192,7 @@ def _compute_PI( idx = np.searchsorted( P_squared[thread_idx, i - g], D_squared, side="right" ) - if ((j - g) not in I[thread_idx, i - g, :idx][::-1]) and ( - (j - g) not in I[thread_idx, i - g, idx:] - ): + if (j - g) not in I[thread_idx, i - g]: core._shift_insert_at_index( P_squared[thread_idx, i - g], idx, D_squared ) @@ -208,9 +202,7 @@ def _compute_PI( idx = np.searchsorted( P_squared[thread_idx, j - g], D_squared, side="right" ) - if ((i - g) not in I[thread_idx, j - g, :idx][::-1]) and ( - (i - g) not in I[thread_idx, j - g, idx:] - ): + if (i - g) not in I[thread_idx, j - g]: core._shift_insert_at_index( P_squared[thread_idx, j - g], idx, D_squared ) @@ -231,9 +223,7 @@ def _compute_PI( idx = np.searchsorted( P_squared[thread_idx, j], squared_distance_profile[j], side="right" ) - if (i not in I[thread_idx, j, :idx][::-1]) and ( - i not in I[thread_idx, j, idx:] - ): + if i not in I[thread_idx, j]: core._shift_insert_at_index( P_squared[thread_idx, j], idx, squared_distance_profile[j] ) @@ -359,13 +349,7 @@ def _prescrump( ) for thread_idx in range(1, n_threads): - core._merge_topk_PI( - P_squared[0], - P_squared[thread_idx], - I[0], - I[thread_idx], - assume_unique=False, - ) + core._merge_topk_PI(P_squared[0], P_squared[thread_idx], I[0], I[thread_idx]) return np.sqrt(P_squared[0]), I[0] @@ -716,7 +700,7 @@ def __init__( else: P, I = prescrump(T_A, m, T_B=T_B, s=s, k=self._k) - core._merge_topk_PI(self._P, P, self._I, I, assume_unique=False) + core._merge_topk_PI(self._P, P, self._I, I) if 
self._ignore_trivial: self._diags = np.random.permutation( @@ -774,7 +758,7 @@ def update(self): ) # Update (top-k) matrix profile and indices - core._merge_topk_PI(self._P, P, self._I, I, assume_unique=False) + core._merge_topk_PI(self._P, P, self._I, I) # update left matrix profile and indices mask = PL < self._PL From b57c69174fe11b788d4f99b2a22a1ccdea845924 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 21 Jul 2022 17:13:32 -0600 Subject: [PATCH 333/416] Fix syntax --- tests/test_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_core.py b/tests/test_core.py index 0b795235e..fa4553d62 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1103,7 +1103,7 @@ def test_merge_topk_PI(): # create overlaps IDX = np.random.choice(np.arange(k), cols_idx_A[i], replace=False) PB[i, IDX] = PA[i, IDX] - IB[i, IDX]] = IA[i, IDX] + IB[i, IDX] = IA[i, IDX] # sort each row of PA/PB (and update IA/IB accordingly) IDX = np.argsort(PA, axis=1) From e499057441d403a531f58653e98fbb8a9d51820c Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 21 Jul 2022 17:17:58 -0600 Subject: [PATCH 334/416] update test functions --- tests/test_scrump.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_scrump.py b/tests/test_scrump.py index 8b19b40a9..0d5869776 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -373,7 +373,7 @@ def test_scrump_plus_plus_self_join(T_A, T_B, percentages): ref_P_aux, ref_I_aux, _, _ = naive.scrump( T_B, m, T_B, percentage, zone, True, s ) - naive.merge_topk_PI(ref_P, ref_P_aux, ref_I, ref_I_aux, assume_unique=False) + naive.merge_topk_PI(ref_P, ref_P_aux, ref_I, ref_I_aux) np.random.seed(seed) approx = scrump( @@ -406,7 +406,7 @@ def test_scrump_plus_plus_A_B_join(T_A, T_B, percentages): ref_P_aux, ref_I_aux, ref_left_I_aux, ref_right_I_aux = naive.scrump( T_A, m, T_B, percentage, None, False, None ) - naive.merge_topk_PI(ref_P, ref_P_aux, ref_I, ref_I_aux, assume_unique=False) + naive.merge_topk_PI(ref_P, ref_P_aux, ref_I, ref_I_aux) ref_left_I = ref_left_I_aux ref_right_I = ref_right_I_aux @@ -812,7 +812,7 @@ def test_scrump_plus_plus_self_join_KNN(T_A, T_B, percentages): T_B, m, T_B, percentage, zone, True, s, k=k ) naive.merge_topk_PI( - ref_P, ref_P_aux, ref_I, ref_I_aux, assume_unique=False + ref_P, ref_P_aux, ref_I, ref_I_aux ) np.random.seed(seed) From b81131996ae6aafd82b8f2c8e6db2389965730a4 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 21 Jul 2022 17:20:58 -0600 Subject: [PATCH 335/416] minor fix --- tests/naive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/naive.py b/tests/naive.py index 64bad3caa..063bf9bf7 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1805,7 +1805,7 @@ def _total_diagonal_ndists(tile_lower_diag, tile_upper_diag, tile_height, tile_w def merge_topk_PI(PA, PB, IA, IB): k = PA.shape[1] for i in range(PA.shape[0]): - _, _, overlap_idx_B = np.intersect1d(IA[i], IB[i]) + _, _, overlap_idx_B = np.intersect1d(IA[i], IB[i], return_indices=True) PB[i, overlap_idx_B] = np.inf profile = np.column_stack((PA, PB)) From 11ee8ded823cb34a452cb1685f7a93526bf71be1 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 21 Jul 2022 17:22:45 -0600 Subject: [PATCH 336/416] correct format --- tests/test_scrump.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_scrump.py b/tests/test_scrump.py index 0d5869776..b7d82854c 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -811,9 +811,7 @@ def 
test_scrump_plus_plus_self_join_KNN(T_A, T_B, percentages): ref_P_aux, ref_I_aux, _, _ = naive.scrump( T_B, m, T_B, percentage, zone, True, s, k=k ) - naive.merge_topk_PI( - ref_P, ref_P_aux, ref_I, ref_I_aux - ) + naive.merge_topk_PI(ref_P, ref_P_aux, ref_I, ref_I_aux) np.random.seed(seed) approx = scrump( From 7925119ed5adb07e2b25bf0afd521599ca06f36d Mon Sep 17 00:00:00 2001 From: ninimama Date: Sat, 23 Jul 2022 11:55:48 -0600 Subject: [PATCH 337/416] Improve docstring --- stumpy/core.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/stumpy/core.py b/stumpy/core.py index f3eff3db6..c30bf8b6a 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2581,7 +2581,9 @@ def _merge_topk_PI(PA, PB, IA, IB): Merge two top-k matrix profiles `PA` and `PB`, and update `PA` (in place) while always prioritizing the values of `PA` over the values of `PB` in case of ties. (i.e., values from `PB` are always inserted to the right of values from `PA`). - Also, update `IA` accordingly. + Also, update `IA` accordingly. In case of overlapping values between two arrays + IA[i] and IB[i], the ones in IB[i] (and their corresponding values in PB[i]) + are ignored throughout the updating process of IA[i] (and PA[i]). Unlike `_merge_topk_ρI`, where `top-k` largest values are kept, this function keeps `top-k` smallest values. From c3b82dd39270b406f4e09c480a29a12b8957974a Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 26 Jul 2022 12:09:03 -0600 Subject: [PATCH 338/416] Avoid overlap while merging matrix profiles --- stumpy/core.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index c30bf8b6a..48795b6f8 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2667,19 +2667,29 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): ------- None """ - tmp_ρ = np.empty(ρA.shape[1], dtype=np.float64) - tmp_I = np.empty(ρA.shape[1], dtype=np.int64) - last_idx = len(tmp_ρ) - 1 + k = ρA.shape[1] + + tmp_ρ = np.empty(k, dtype=np.float64) + tmp_I = np.empty(k, dtype=np.int64) + last_idx = k - 1 for i in range(len(ρA)): + overlap = np.intersect1d(IA[i], IB[i]) + aj, bj = last_idx, last_idx - for k in range(last_idx, -1, -1): - if ρB[i, bj] > ρA[i, aj]: - tmp_ρ[k] = ρB[i, bj] - tmp_I[k] = IB[i, bj] + idx = last_idx + for _ in range(2 * k): # 2 * k to traverse both A and B if needed + if idx < 0: + break + if bj >= 0 and ρB[i, bj] > ρA[i, aj]: + if IB[i, bj] not in overlap: + tmp_ρ[idx] = ρB[i, bj] + tmp_I[idx] = IB[i, bj] + idx -= 1 bj -= 1 else: - tmp_ρ[k] = ρA[i, aj] - tmp_I[k] = IA[i, aj] + tmp_ρ[idx] = ρA[i, aj] + tmp_I[idx] = IA[i, aj] + idx -= 1 aj -= 1 ρA[i] = tmp_ρ From f073d6c5440e469e5eceb5089e3ea7c77faed49b Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 26 Jul 2022 12:25:48 -0600 Subject: [PATCH 339/416] Add function to find overlapping values --- stumpy/core.py | 24 ++++++++++++++++++++++++ tests/test_core.py | 18 ++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/stumpy/core.py b/stumpy/core.py index 48795b6f8..410c0124b 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2766,3 +2766,27 @@ def _check_P(P, threshold=1e-6): if are_distances_too_small(P, threshold=threshold): # pragma: no cover logger.warning(f"A large number of values in `P` are smaller than {threshold}.") logger.warning("For a self-join, try setting `ignore_trivial=True`.") + + +@njit +def _intersect1d_int(arr1, arr2): + """ + Returns the overlapping values between two 1D arrays `arr1` and `arr2` that + consist of integer values. 
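The helper whose docstring begins above simply computes the overlap between two rows of integer indices. As a quick illustration of the two equivalent forms this takes in these patches, plain Python sets (which are Numba-friendly) versus np.intersect1d (which returns a sorted array); the row values here are made up:

import numpy as np

IA_row = np.array([9, 3, 5, 7], dtype=np.int64)
IB_row = np.array([7, 8, 3, 6], dtype=np.int64)

overlap = set(IA_row).intersection(set(IB_row))   # common indices {3, 7}
print(sorted(int(v) for v in overlap))            # [3, 7]
print(np.intersect1d(IA_row, IB_row))             # [3 7], always sorted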
+ + Parameters + ---------- + arr1 : numpy.ndarray + a 1D numpy array consisting of interget values + + arr2 : numpy.ndarray + a 1D numpy array consisting of interget values + + Returns + ------- + out : numpy.ndarray + a numpy array consits of the overlapping values between `arr1` and `arr2` + """ + return np.array( + list(set(arr1).intersection(set(arr2))), dtype=np.int64 # Basic set comparison + ) diff --git a/tests/test_core.py b/tests/test_core.py index fa4553d62..293bd9a28 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1192,3 +1192,21 @@ def test_shift_insert_at_index(): def test_check_P(): with pytest.raises(ValueError): core._check_P(np.random.rand(10).reshape(2, 5)) + + +def test_intersect1d_int(): + max_len_arr = 20 + for n in range(1, max_len_arr): + arr1 = np.random.randint(0, 100, size=n) + arr2 = np.random.randint(0, 100, size=n) + + # creating overlaps between `arr1` and `arr2` + s = np.random.randint(0, high=n) + IDX_1 = np.random.choice(np.arange(n), s, replace=False) + IDX_2 = np.random.choice(np.arange(n), s, replace=False) + arr2[IDX_2] = arr1[IDX_1] + + ref = np.intersect1d(arr1, arr2) + comp = core._intersect1d_int(arr1, arr2) + + npt.assert_array_equal(np.sort(ref), np.sort(comp)) From a5149438f056228ac93ecc786b111e96a2258908 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 26 Jul 2022 12:38:24 -0600 Subject: [PATCH 340/416] replace numpy function with our implementation --- stumpy/core.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 410c0124b..606dd00ee 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2673,8 +2673,7 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): tmp_I = np.empty(k, dtype=np.int64) last_idx = k - 1 for i in range(len(ρA)): - overlap = np.intersect1d(IA[i], IB[i]) - + overlap = _intersect1d_int(IA[i], IB[i]) aj, bj = last_idx, last_idx idx = last_idx for _ in range(2 * k): # 2 * k to traverse both A and B if needed From edb62a23c31ed32aea6a0ac1bf2b0c0282621a8a Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 26 Jul 2022 12:51:11 -0600 Subject: [PATCH 341/416] Avoid unnecessary call of a function --- stumpy/core.py | 28 ++-------------------------- tests/test_core.py | 18 ------------------ 2 files changed, 2 insertions(+), 44 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 606dd00ee..c873b4a90 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2612,7 +2612,7 @@ def _merge_topk_PI(PA, PB, IA, IB): tmp_P = np.empty(k, dtype=np.float64) tmp_I = np.empty(k, dtype=np.int64) for i in range(PA.shape[0]): - overlap = np.intersect1d(IA[i], IB[i]) + overlap = set(IB[i]).intersection(set(IA[i])) aj, bj = 0, 0 idx = 0 for _ in range(2 * k): # 2 * k to traverse both A and B @@ -2673,7 +2673,7 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): tmp_I = np.empty(k, dtype=np.int64) last_idx = k - 1 for i in range(len(ρA)): - overlap = _intersect1d_int(IA[i], IB[i]) + overlap = set(IB[i]).intersection(set(IA[i])) aj, bj = last_idx, last_idx idx = last_idx for _ in range(2 * k): # 2 * k to traverse both A and B if needed @@ -2765,27 +2765,3 @@ def _check_P(P, threshold=1e-6): if are_distances_too_small(P, threshold=threshold): # pragma: no cover logger.warning(f"A large number of values in `P` are smaller than {threshold}.") logger.warning("For a self-join, try setting `ignore_trivial=True`.") - - -@njit -def _intersect1d_int(arr1, arr2): - """ - Returns the overlapping values between two 1D arrays `arr1` and `arr2` that - consist of integer values. 
- - Parameters - ---------- - arr1 : numpy.ndarray - a 1D numpy array consisting of interget values - - arr2 : numpy.ndarray - a 1D numpy array consisting of interget values - - Returns - ------- - out : numpy.ndarray - a numpy array consits of the overlapping values between `arr1` and `arr2` - """ - return np.array( - list(set(arr1).intersection(set(arr2))), dtype=np.int64 # Basic set comparison - ) diff --git a/tests/test_core.py b/tests/test_core.py index 293bd9a28..fa4553d62 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1192,21 +1192,3 @@ def test_shift_insert_at_index(): def test_check_P(): with pytest.raises(ValueError): core._check_P(np.random.rand(10).reshape(2, 5)) - - -def test_intersect1d_int(): - max_len_arr = 20 - for n in range(1, max_len_arr): - arr1 = np.random.randint(0, 100, size=n) - arr2 = np.random.randint(0, 100, size=n) - - # creating overlaps between `arr1` and `arr2` - s = np.random.randint(0, high=n) - IDX_1 = np.random.choice(np.arange(n), s, replace=False) - IDX_2 = np.random.choice(np.arange(n), s, replace=False) - arr2[IDX_2] = arr1[IDX_1] - - ref = np.intersect1d(arr1, arr2) - comp = core._intersect1d_int(arr1, arr2) - - npt.assert_array_equal(np.sort(ref), np.sort(comp)) From ec020e0b83c66e40c6f9a181319aed6528b7b58b Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 26 Jul 2022 12:58:23 -0600 Subject: [PATCH 342/416] Revise docsting and comment --- stumpy/scrump.py | 7 +++++-- stumpy/stumpi.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 206c4e304..d9ee59a98 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -116,8 +116,11 @@ def _compute_PI( nn_idx = np.argmin(squared_distance_profile) if nn_idx not in I[thread_idx, i]: - # It is more than likely that the top-k values for the `i`-th subsequence - # will be already populated. So, we must shift-insert here + # Since the top-k values for the `i`-th subsequence may already + # be updated/populated in other previous iterations (i.e., not all + # values in `I[thread_idx]` are equal to `-1` or not all values in + # `P_squared[thread_idx, i]` are equal to `np.inf`), we must + # shift-insert here rather than assign values to the first element. 
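The comment being reworded here makes a subtle point: by the time the nearest neighbor of subsequence i is found, earlier diagonal updates may already have filled part of that top-k row, so the new 1-NN has to be shift-inserted at the front rather than written into position 0. A tiny sketch of that situation, mirroring the np.insert(...)[:-1] pattern used in the naive code (the row values are made up):

import numpy as np

# Top-3 row already partially populated by earlier diagonal updates.
P_row = np.array([2.0, 3.5, np.inf])
I_row = np.array([14, 20, -1])

d, nn = 1.2, 8                            # newly found nearest neighbor
if nn not in I_row:
    P_row = np.insert(P_row, 0, d)[:-1]   # shift-insert at the front
    I_row = np.insert(I_row, 0, nn)[:-1]  # existing entries are preserved
print(P_row, I_row)                       # [1.2 2.  3.5] [ 8 14 20]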
core._shift_insert_at_index( P_squared[thread_idx, i], 0, squared_distance_profile[nn_idx] ) diff --git a/stumpy/stumpi.py b/stumpy/stumpi.py index 22bc9b122..b5c4b5a93 100644 --- a/stumpy/stumpi.py +++ b/stumpy/stumpi.py @@ -371,7 +371,7 @@ def left_P_(self): @property def left_I_(self): """ - Get the (top-1) sleft matrix profile indices + Get the (top-1) left matrix profile indices """ return self._left_I.astype(np.int64) From 7ab480e5b0faa16ac66bf09ed784d83014344f80 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 26 Jul 2022 13:02:07 -0600 Subject: [PATCH 343/416] Improve test function --- tests/test_core.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_core.py b/tests/test_core.py index fa4553d62..be03ee3bd 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1102,7 +1102,8 @@ def test_merge_topk_PI(): for i in range(n): # create overlaps IDX = np.random.choice(np.arange(k), cols_idx_A[i], replace=False) - PB[i, IDX] = PA[i, IDX] + imprecision = np.random.uniform(low=-1e6, high=1e6, size=len(IDX)) + PB[i, IDX] = PA[i, IDX] + imprecision IB[i, IDX] = IA[i, IDX] # sort each row of PA/PB (and update IA/IB accordingly) From b2bc5005e7e36fe0be4cabb23fcec8b4c718bb6a Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 26 Jul 2022 13:03:40 -0600 Subject: [PATCH 344/416] Remove comment --- stumpy/scrump.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index d9ee59a98..039ca13e5 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -157,10 +157,6 @@ def _compute_PI( idx = np.searchsorted( P_squared[thread_idx, i + g], D_squared, side="right" ) - # Due to numerical error, it is possible that the element that is - # about to insert at idx is identical to an element of array located - # at idx, idx + 1, .... Hence, we should traverse full array. - # This is optimized in the if conditon. 
if (j + g) not in I[thread_idx, i + g]: core._shift_insert_at_index( P_squared[thread_idx, i + g], idx, D_squared From c156530740bf06ea0e4505ddd79370865ca0142d Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 26 Jul 2022 13:45:42 -0600 Subject: [PATCH 345/416] Add test function to ensure duplicates are avoided --- tests/test_scrump.py | 89 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/tests/test_scrump.py b/tests/test_scrump.py index b7d82854c..f9f6f1a53 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -871,3 +871,92 @@ def test_prescrump_A_B_join_larger_window_m_5_k_5(T_A, T_B): npt.assert_almost_equal(ref_P, comp_P) npt.assert_almost_equal(ref_I, comp_I) + + +def test_prescrump_self_join_KNN_no_overlap(): + # This test is designed to ensure that the performant version prescrump avoids + # overlap while computing the top-k matrix profiles + T = np.array( + [ + -916.64703784, + -327.42056679, + 379.19386284, + -281.80427628, + -189.85401773, + -38.69610569, + 187.89889345, + 578.65862523, + 528.09687811, + -667.42973795, + -285.27749324, + -211.28930925, + -703.93802657, + -820.53780562, + -955.91174663, + 383.65471851, + 932.08809422, + -563.57569746, + 784.0546579, + -343.14886064, + -612.72329848, + -270.09273091, + -448.39346549, + 578.03202014, + 867.15436674, + -783.55167049, + -494.78062922, + -311.18567747, + 522.70052256, + 933.45474094, + 192.34822368, + -162.11374908, + -612.95359279, + -449.62297051, + -351.79138459, + -77.70189101, + -439.46519487, + -660.48431174, + 548.69362177, + 485.36004744, + -535.3566627, + -568.0955257, + 755.26647273, + 736.1079588, + -597.65672557, + 379.3299783, + 731.38211912, + 247.34827447, + 545.41888454, + 644.94300763, + 20.99042666, + 788.19859515, + -898.24325898, + -929.47841134, + -738.45875181, + 66.01030291, + 512.945841, + -44.07720164, + 302.97141464, + -696.95271302, + 662.98385163, + -712.3807531, + -43.62688539, + 74.16927482, + ] + ) + + mk_seeds = { + (3, 2): [4279, 9133, 8190], + (3, 5): [1267, 4016, 4046], + (5, 2): [6327, 4926, 3712], + (5, 5): [3032, 3032, 8117], + } + for (m, k), seeds in mk_seeds.items(): + zone = int(np.ceil(m / 4)) + for seed in seeds: + np.random.seed(seed) + ref_P, ref_I = naive.prescrump(T, m, T, s=1, exclusion_zone=zone, k=k) + comp_P, comp_I = prescrump(T, m, s=1, k=k) + + npt.assert_almost_equal(ref_P, comp_P) + npt.assert_array_equal(ref_I, comp_I) From 5f1acaedfe490adb07634121d9b89df36c813031 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 26 Jul 2022 14:29:32 -0600 Subject: [PATCH 346/416] Improve comments --- tests/naive.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 063bf9bf7..bf7cb6197 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1433,18 +1433,26 @@ def prescrump(T_A, m, T_B, s, exclusion_zone=None, k=1): P[i, 1:] = P[i, :-1] P[i, 0] = distance_profile[I[i, 0]] - # else: the idx, i.e. 1NN of `i`, was already obtained (it maynot be stored - # at the first index of array I[i] though!) + # else: the idx, i.e. 1NN of `i`, was already obtained. it may not be + # at the first index of array I[i] though! e.g. P[i] = [1e-10, 1e-9]), + # I[i] = [a, b]. Here, `b` can be the actual nn of i. However, the + # distance between `seq i` and `seq b` might be calculated alrady in one + # of previous iteration and, due to slight numerical error, it might have + # been inserted to the rigth of a. 
if P[i, 0] == np.inf: I[i, 0] = -1 continue - j = nn_idx + j = nn_idx # to follow the original paper even in top-k version, we use + # the actual nn_idx rather than I[i, 0]. for g in range(1, min(s, l - i, w - j)): d = dist_matrix[i + g, j + g] if d < P[i + g, -1]: pos = np.searchsorted(P[i + g], d, side="right") + # Do NOT optimize the `condition` in the following if statement + # and similar ones in this naive function. This is to ensure + # we are avoiding duplicates in each row of I. if (j + g) not in I[i + g]: P[i + g] = np.insert(P[i + g], pos, d)[:-1] I[i + g] = np.insert(I[i + g], pos, j + g)[:-1] From 5f9c537464e2a6214cc3403da1fa835d67cf5877 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 26 Jul 2022 14:50:17 -0600 Subject: [PATCH 347/416] Enhance naive version to avoid duplicates while merging --- tests/naive.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/naive.py b/tests/naive.py index bf7cb6197..4cc99e791 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1815,6 +1815,7 @@ def merge_topk_PI(PA, PB, IA, IB): for i in range(PA.shape[0]): _, _, overlap_idx_B = np.intersect1d(IA[i], IB[i], return_indices=True) PB[i, overlap_idx_B] = np.inf + IB[i, overlap_idx_B] = -1 profile = np.column_stack((PA, PB)) indices = np.column_stack((IA, IB)) @@ -1846,6 +1847,12 @@ def merge_topk_ρI(ρA, ρB, IA, IB): # merging `ρB` and `ρA` ascendingly while choosing `ρB` over `ρA` in case of # ties: [0_B, 0_A, 0'_A, 1_B, 1'_B, 1_A], and we just need to keep the second # half of this array, and discard the first half. + k = ρA.shape[1] + for i in range(ρA.shape[0]): + _, _, overlap_idx_B = np.intersect1d(IA[i], IB[i], return_indices=True) + ρB[i, overlap_idx_B] = np.NINF + IB[i, overlap_idx_B] = -1 + profile = np.column_stack((ρB, ρA)) indices = np.column_stack((IB, IA)) From f37bc29882d91ac0e7f16298d1a6fb37e82b7ab5 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 26 Jul 2022 15:00:33 -0600 Subject: [PATCH 348/416] Add test function and revise naive version --- tests/naive.py | 4 +-- tests/test_core.py | 65 +++++++++++++++++++++++++++++++++++++++------- 2 files changed, 58 insertions(+), 11 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 4cc99e791..2bde6bd36 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1861,5 +1861,5 @@ def merge_topk_ρI(ρA, ρB, IA, IB): indices[:, :] = np.take_along_axis(indices, idx, axis=1) # keep the last k elements (top-k largest values) - ρA[:, :] = profile[:, ρA.shape[1] :] - IA[:, :] = indices[:, ρA.shape[1] :] + ρA[:, :] = profile[:, k:] + IA[:, :] = indices[:, k:] diff --git a/tests/test_core.py b/tests/test_core.py index be03ee3bd..7b4034f08 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1061,8 +1061,9 @@ def test_select_P_ABBA_val_inf(): npt.assert_almost_equal(ref, comp) -def test_merge_topk_PI(): - # `assume_unique = True` +def test_merge_topk_PI_without_overlap(): + # This is to test function `core._merge_topk_PI(PA, PB, IA, IB)` when there + # is no overlap between row IA[i] and row IB[i]. n = 50 for k in range(1, 6): PA = np.random.rand(n * k).reshape(n, k) @@ -1089,7 +1090,10 @@ def test_merge_topk_PI(): npt.assert_array_equal(ref_P, comp_P) npt.assert_array_equal(ref_I, comp_I) - # `assume_unique = False` + +def test_merge_topk_PI_with_overlap(): + # This is to test function `core._merge_topk_PI(PA, PB, IA, IB)` when there + # is overlap between row IA[i] and row IB[i]. 
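To make the overlap handling exercised by these tests easier to follow, here is a self-contained sketch of the naive merge described above: indices shared between a row of IA and the matching row of IB are disabled on the B side, then the concatenation is sorted row-wise and the k smallest entries are kept. This is an illustrative condensation with made-up names, not a copy of the library routine; the stable sort is what lets the A side win ties.

import numpy as np

def naive_merge_topk_PI_sketch(PA, PB, IA, IB):
    # PA/PB hold row-wise ascending top-k distances, IA/IB the matching indices.
    # Any index that appears in both IA[i] and IB[i] is neutralized on the B
    # side (distance -> inf, index -> -1) so it cannot enter the merge twice.
    for i in range(PA.shape[0]):
        _, _, dup_in_B = np.intersect1d(IA[i], IB[i], return_indices=True)
        PB[i, dup_in_B] = np.inf
        IB[i, dup_in_B] = -1
    P = np.column_stack((PA, PB))
    I = np.column_stack((IA, IB))
    order = np.argsort(P, axis=1, kind="mergesort")  # stable: ties keep A first
    PA[:, :] = np.take_along_axis(P, order, axis=1)[:, : PA.shape[1]]
    IA[:, :] = np.take_along_axis(I, order, axis=1)[:, : PA.shape[1]]

With k == 1 this collapses to the familiar element-wise minimum of two matrix profiles, with the index from A kept on ties.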
n = 50 for k in range(1, 6): PA = np.random.rand(n * k).reshape(n, k) @@ -1098,13 +1102,13 @@ def test_merge_topk_PI(): IA = np.arange(n * k).reshape(n, k) IB = IA + n * k - cols_idx_A = np.random.randint(0, k, size=n) + num_overlaps = np.random.randint(1, k + 1, size=n) for i in range(n): # create overlaps - IDX = np.random.choice(np.arange(k), cols_idx_A[i], replace=False) - imprecision = np.random.uniform(low=-1e6, high=1e6, size=len(IDX)) - PB[i, IDX] = PA[i, IDX] + imprecision - IB[i, IDX] = IA[i, IDX] + col_IDX = np.random.choice(np.arange(k), num_overlaps[i], replace=False) + imprecision = np.random.uniform(low=-1e6, high=1e6, size=len(col_IDX)) + PB[i, col_IDX] = PA[i, col_IDX] + imprecision + IB[i, col_IDX] = IA[i, col_IDX] # sort each row of PA/PB (and update IA/IB accordingly) IDX = np.argsort(PA, axis=1) @@ -1128,7 +1132,9 @@ def test_merge_topk_PI(): npt.assert_array_equal(ref_I, comp_I) -def test_merge_topk_ρI(): +def test_merge_topk_ρI_without_overlap(): + # This is to test function `core._merge_topk_ρI(ρA, ρB, IA, IB)` when there + # is no overlap between row IA[i] and row IB[i]. n = 50 for k in range(1, 6): ρA = np.random.rand(n * k).reshape(n, k) @@ -1156,6 +1162,47 @@ def test_merge_topk_ρI(): npt.assert_array_equal(ref_I, comp_I) +def test_merge_topk_ρI_with_overlap(): + # This is to test function `core._merge_topk_ρI(ρA, ρB, IA, IB)` when there + # is overlap between row IA[i] and row IB[i]. + n = 50 + for k in range(1, 6): + ρA = np.random.rand(n * k).reshape(n, k) + ρB = np.random.rand(n * k).reshape(n, k) + + IA = np.arange(n * k).reshape(n, k) + IB = IA + n * k + + num_overlaps = np.random.randint(1, k + 1, size=n) + for i in range(n): + # create overlaps + col_IDX = np.random.choice(np.arange(k), num_overlaps[i], replace=False) + imprecision = np.random.uniform(low=-1e6, high=1e6, size=len(col_IDX)) + ρB[i, col_IDX] = ρA[i, col_IDX] + imprecision + IB[i, col_IDX] = IA[i, col_IDX] + + # sort each row of PA/PB (and update IA/IB accordingly) + IDX = np.argsort(ρA, axis=1) + ρA[:, :] = np.take_along_axis(ρA, IDX, axis=1) + IA[:, :] = np.take_along_axis(IA, IDX, axis=1) + + IDX = np.argsort(ρB, axis=1) + ρB[:, :] = np.take_along_axis(ρB, IDX, axis=1) + IB[:, :] = np.take_along_axis(IB, IDX, axis=1) + + ref_ρ = ρA.copy() + ref_I = IA.copy() + + comp_ρ = ρA.copy() + comp_I = IA.copy() + + naive.merge_topk_ρI(ref_ρ, ρB.copy(), ref_I, IB.copy()) + core._merge_topk_ρI(comp_ρ, ρB.copy(), comp_I, IB.copy()) + + npt.assert_array_equal(ref_ρ, comp_ρ) + npt.assert_array_equal(ref_I, comp_I) + + def test_shift_insert_at_index(): for k in range(1, 6): a = np.random.rand(k) From dc97a12ce9ec68acb80d52268199670260a1caa9 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 26 Jul 2022 15:08:56 -0600 Subject: [PATCH 349/416] Improve code readability and comment --- tests/test_scrump.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/test_scrump.py b/tests/test_scrump.py index f9f6f1a53..332eca78a 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -875,7 +875,9 @@ def test_prescrump_A_B_join_larger_window_m_5_k_5(T_A, T_B): def test_prescrump_self_join_KNN_no_overlap(): # This test is designed to ensure that the performant version prescrump avoids - # overlap while computing the top-k matrix profiles + # overlap while computing the top-k matrix profiles and matrix profile indices. + # So, there would be no duplicates in each row of top-k matrix profile indices + # excluding the elements filled with `-1`. 
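The property this test is meant to pin down can be stated as a one-off check: every row of the returned top-k index array is duplicate-free once the `-1` padding is ignored. A small illustrative helper (not part of the test suite) makes that concrete.

import numpy as np

def rows_have_no_duplicate_neighbors(I):
    # I: (l, k) array of top-k nearest-neighbor indices, padded with -1.
    for row in I:
        valid = row[row >= 0]               # drop the -1 fill values
        if len(np.unique(valid)) != len(valid):
            return False                    # a neighbor index repeats in this row
    return True

I = np.array([[3, 7, -1],
              [5, 5, -1]])
print(rows_have_no_duplicate_neighbors(I))  # False: index 5 repeats in row 1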
T = np.array( [ -916.64703784, @@ -945,15 +947,17 @@ def test_prescrump_self_join_KNN_no_overlap(): ] ) - mk_seeds = { + # test_cases: dict() with `key: value` pair, where key is `(m, k)`, and value + # is a list of random `seeds` + test_cases = { (3, 2): [4279, 9133, 8190], (3, 5): [1267, 4016, 4046], (5, 2): [6327, 4926, 3712], (5, 5): [3032, 3032, 8117], } - for (m, k), seeds in mk_seeds.items(): + for (m, k), specified_seeds in test_cases.items(): zone = int(np.ceil(m / 4)) - for seed in seeds: + for seed in specified_seeds: np.random.seed(seed) ref_P, ref_I = naive.prescrump(T, m, T, s=1, exclusion_zone=zone, k=k) comp_P, comp_I = prescrump(T, m, s=1, k=k) From fa340ba27a63ec3f3ab81e5cc981eddd8e54df75 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Jul 2022 16:55:12 -0600 Subject: [PATCH 350/416] Update top-k profile by getting insertion index In NearestNeighbor case, the distance between sequence i and its NN is the smallest. However, due to `imprecision` in calculation, it is possible that its corresponding distance, i.e. distance between seq i and its NN, is not the smallest value in its top-k neighbors. So, instead of inserting it at index 0, we use numpy.searchsorted to find the correct insertion index. --- stumpy/scrump.py | 14 ++++++++++---- tests/naive.py | 8 ++++---- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 039ca13e5..7d9d86c09 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -121,10 +121,16 @@ def _compute_PI( # values in `I[thread_idx]` are equal to `-1` or not all values in # `P_squared[thread_idx, i]` are equal to `np.inf`), we must # shift-insert here rather than assign values to the first element. - core._shift_insert_at_index( - P_squared[thread_idx, i], 0, squared_distance_profile[nn_idx] - ) - core._shift_insert_at_index(I[thread_idx, i], 0, nn_idx) + if squared_distance_profile[nn_idx] < P_squared[thread_idx, i, -1]: + idx = np.searchsorted( + P_squared[thread_idx, i], + squared_distance_profile[nn_idx], + side="right", + ) + core._shift_insert_at_index( + P_squared[thread_idx, i], idx, squared_distance_profile[nn_idx] + ) + core._shift_insert_at_index(I[thread_idx, i], idx, nn_idx) if P_squared[thread_idx, i, 0] == np.inf: # pragma: no cover I[thread_idx, i, 0] = -1 diff --git a/tests/naive.py b/tests/naive.py index 2bde6bd36..f1f6fb8e3 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1428,10 +1428,10 @@ def prescrump(T_A, m, T_B, s, exclusion_zone=None, k=1): nn_idx = np.argmin(distance_profile) if nn_idx not in I[i]: - I[i, 1:] = I[i, :-1] - I[i, 0] = nn_idx - P[i, 1:] = P[i, :-1] - P[i, 0] = distance_profile[I[i, 0]] + if distance_profile[nn_idx] < P[i, -1]: + pos = np.searchsorted(P[i], distance_profile[nn_idx], side="right") + P[i] = np.insert(P[i], pos, distance_profile[nn_idx])[:-1] + I[i] = np.insert(I[i], pos, nn_idx)[:-1] # else: the idx, i.e. 1NN of `i`, was already obtained. it may not be # at the first index of array I[i] though! e.g. 
P[i] = [1e-10, 1e-9]), From d9a997d3dc7ff2e3a636b1eabbade255c4f14ce5 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Jul 2022 17:16:13 -0600 Subject: [PATCH 351/416] Merge nested if statements into one --- stumpy/scrump.py | 90 ++++++++++++++++++++++++++++-------------------- tests/naive.py | 47 +++++++++++++------------ 2 files changed, 77 insertions(+), 60 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 7d9d86c09..de514f6cf 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -115,22 +115,24 @@ def _compute_PI( core._apply_exclusion_zone(squared_distance_profile, i, excl_zone, np.inf) nn_idx = np.argmin(squared_distance_profile) - if nn_idx not in I[thread_idx, i]: + if ( + squared_distance_profile[nn_idx] < P_squared[thread_idx, i, -1] + and nn_idx not in I[thread_idx, i] + ): # Since the top-k values for the `i`-th subsequence may already # be updated/populated in other previous iterations (i.e., not all # values in `I[thread_idx]` are equal to `-1` or not all values in # `P_squared[thread_idx, i]` are equal to `np.inf`), we must # shift-insert here rather than assign values to the first element. - if squared_distance_profile[nn_idx] < P_squared[thread_idx, i, -1]: - idx = np.searchsorted( - P_squared[thread_idx, i], - squared_distance_profile[nn_idx], - side="right", - ) - core._shift_insert_at_index( - P_squared[thread_idx, i], idx, squared_distance_profile[nn_idx] - ) - core._shift_insert_at_index(I[thread_idx, i], idx, nn_idx) + idx = np.searchsorted( + P_squared[thread_idx, i], + squared_distance_profile[nn_idx], + side="right", + ) + core._shift_insert_at_index( + P_squared[thread_idx, i], idx, squared_distance_profile[nn_idx] + ) + core._shift_insert_at_index(I[thread_idx, i], idx, nn_idx) if P_squared[thread_idx, i, 0] == np.inf: # pragma: no cover I[thread_idx, i, 0] = -1 @@ -159,25 +161,30 @@ def _compute_PI( μ_Q[i + g], σ_Q[i + g], ) - if D_squared < P_squared[thread_idx, i + g, -1]: + if ( + D_squared < P_squared[thread_idx, i + g, -1] + and (j + g) not in I[thread_idx, i + g] + ): idx = np.searchsorted( P_squared[thread_idx, i + g], D_squared, side="right" ) - if (j + g) not in I[thread_idx, i + g]: - core._shift_insert_at_index( - P_squared[thread_idx, i + g], idx, D_squared - ) - core._shift_insert_at_index(I[thread_idx, i + g], idx, j + g) + core._shift_insert_at_index( + P_squared[thread_idx, i + g], idx, D_squared + ) + core._shift_insert_at_index(I[thread_idx, i + g], idx, j + g) - if excl_zone is not None and D_squared < P_squared[thread_idx, j + g, -1]: + if ( + excl_zone is not None + and D_squared < P_squared[thread_idx, j + g, -1] + and (i + g) not in I[thread_idx, j + g] + ): idx = np.searchsorted( P_squared[thread_idx, j + g], D_squared, side="right" ) - if (i + g) not in I[thread_idx, j + g]: - core._shift_insert_at_index( - P_squared[thread_idx, j + g], idx, D_squared - ) - core._shift_insert_at_index(I[thread_idx, j + g], idx, i + g) + core._shift_insert_at_index( + P_squared[thread_idx, j + g], idx, D_squared + ) + core._shift_insert_at_index(I[thread_idx, j + g], idx, i + g) QT_j = QT_j_prime # Update top-k for both subsequences `S[i-g] = T[i-g:i-g+m]` and @@ -193,25 +200,30 @@ def _compute_PI( μ_Q[i - g], σ_Q[i - g], ) - if D_squared < P_squared[thread_idx, i - g, -1]: + if ( + D_squared < P_squared[thread_idx, i - g, -1] + and (j - g) not in I[thread_idx, i - g] + ): idx = np.searchsorted( P_squared[thread_idx, i - g], D_squared, side="right" ) - if (j - g) not in I[thread_idx, i - g]: - core._shift_insert_at_index( - 
P_squared[thread_idx, i - g], idx, D_squared - ) - core._shift_insert_at_index(I[thread_idx, i - g], idx, j - g) + core._shift_insert_at_index( + P_squared[thread_idx, i - g], idx, D_squared + ) + core._shift_insert_at_index(I[thread_idx, i - g], idx, j - g) - if excl_zone is not None and D_squared < P_squared[thread_idx, j - g, -1]: + if ( + excl_zone is not None + and D_squared < P_squared[thread_idx, j - g, -1] + and (i - g) not in I[thread_idx, j - g] + ): idx = np.searchsorted( P_squared[thread_idx, j - g], D_squared, side="right" ) - if (i - g) not in I[thread_idx, j - g]: - core._shift_insert_at_index( - P_squared[thread_idx, j - g], idx, D_squared - ) - core._shift_insert_at_index(I[thread_idx, j - g], idx, i - g) + core._shift_insert_at_index( + P_squared[thread_idx, j - g], idx, D_squared + ) + core._shift_insert_at_index(I[thread_idx, j - g], idx, i - g) # In the case of a self-join, the calculated distance profile can also be # used to refine the top-k for all non-trivial subsequences @@ -225,10 +237,12 @@ def _compute_PI( squared_distance_profile < P_squared[thread_idx, :, -1] ) for j in indices: - idx = np.searchsorted( - P_squared[thread_idx, j], squared_distance_profile[j], side="right" - ) if i not in I[thread_idx, j]: + idx = np.searchsorted( + P_squared[thread_idx, j], + squared_distance_profile[j], + side="right", + ) core._shift_insert_at_index( P_squared[thread_idx, j], idx, squared_distance_profile[j] ) diff --git a/tests/naive.py b/tests/naive.py index f1f6fb8e3..deee097ca 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1427,11 +1427,10 @@ def prescrump(T_A, m, T_B, s, exclusion_zone=None, k=1): apply_exclusion_zone(distance_profile, i, exclusion_zone, np.inf) nn_idx = np.argmin(distance_profile) - if nn_idx not in I[i]: - if distance_profile[nn_idx] < P[i, -1]: - pos = np.searchsorted(P[i], distance_profile[nn_idx], side="right") - P[i] = np.insert(P[i], pos, distance_profile[nn_idx])[:-1] - I[i] = np.insert(I[i], pos, nn_idx)[:-1] + if distance_profile[nn_idx] < P[i, -1] and nn_idx not in I[i]: + pos = np.searchsorted(P[i], distance_profile[nn_idx], side="right") + P[i] = np.insert(P[i], pos, distance_profile[nn_idx])[:-1] + I[i] = np.insert(I[i], pos, nn_idx)[:-1] # else: the idx, i.e. 1NN of `i`, was already obtained. it may not be # at the first index of array I[i] though! e.g. P[i] = [1e-10, 1e-9]), @@ -1448,39 +1447,43 @@ def prescrump(T_A, m, T_B, s, exclusion_zone=None, k=1): # the actual nn_idx rather than I[i, 0]. for g in range(1, min(s, l - i, w - j)): d = dist_matrix[i + g, j + g] - if d < P[i + g, -1]: + if d < P[i + g, -1] and (j + g) not in I[i + g]: pos = np.searchsorted(P[i + g], d, side="right") # Do NOT optimize the `condition` in the following if statement # and similar ones in this naive function. This is to ensure # we are avoiding duplicates in each row of I. 
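The guarded update that this hunk (and its counterpart in the performant code) converges on can be summarized in a few lines. The helper below is an illustrative condensation, not the library function; note that the membership test deliberately scans the whole row, for the reason spelled out in the comment above.

import numpy as np

def update_topk_row(P_row, I_row, d, j):
    # P_row: ascending best-so-far distances of one subsequence (length k)
    # I_row: the matching neighbor indices (length k, padded with -1)
    # A candidate neighbor j with distance d enters the row only when d beats
    # the current k-th best AND j does not already appear anywhere in I_row.
    if d < P_row[-1] and j not in I_row:
        pos = np.searchsorted(P_row, d, side="right")
        P_row[:] = np.insert(P_row, pos, d)[:-1]
        I_row[:] = np.insert(I_row, pos, j)[:-1]

P = np.array([0.5, 1.2, np.inf])
I = np.array([10, 42, -1])
update_topk_row(P, I, 0.9, 7)
print(P, I)  # [0.5 0.9 1.2] [10  7 42]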
- if (j + g) not in I[i + g]: - P[i + g] = np.insert(P[i + g], pos, d)[:-1] - I[i + g] = np.insert(I[i + g], pos, j + g)[:-1] - if exclusion_zone is not None and d < P[j + g, -1]: + P[i + g] = np.insert(P[i + g], pos, d)[:-1] + I[i + g] = np.insert(I[i + g], pos, j + g)[:-1] + if ( + exclusion_zone is not None + and d < P[j + g, -1] + and (i + g) not in I[j + g] + ): pos = np.searchsorted(P[j + g], d, side="right") - if (i + g) not in I[j + g]: - P[j + g] = np.insert(P[j + g], pos, d)[:-1] - I[j + g] = np.insert(I[j + g], pos, i + g)[:-1] + P[j + g] = np.insert(P[j + g], pos, d)[:-1] + I[j + g] = np.insert(I[j + g], pos, i + g)[:-1] for g in range(1, min(s, i + 1, j + 1)): d = dist_matrix[i - g, j - g] - if d < P[i - g, -1]: + if d < P[i - g, -1] and (j - g) not in I[i - g]: pos = np.searchsorted(P[i - g], d, side="right") - if (j - g) not in I[i - g]: - P[i - g] = np.insert(P[i - g], pos, d)[:-1] - I[i - g] = np.insert(I[i - g], pos, j - g)[:-1] - if exclusion_zone is not None and d < P[j - g, -1]: + P[i - g] = np.insert(P[i - g], pos, d)[:-1] + I[i - g] = np.insert(I[i - g], pos, j - g)[:-1] + if ( + exclusion_zone is not None + and d < P[j - g, -1] + and (i - g) not in I[j - g] + ): pos = np.searchsorted(P[j - g], d, side="right") - if (i - g) not in I[j - g]: - P[j - g] = np.insert(P[j - g], pos, d)[:-1] - I[j - g] = np.insert(I[j - g], pos, i - g)[:-1] + P[j - g] = np.insert(P[j - g], pos, d)[:-1] + I[j - g] = np.insert(I[j - g], pos, i - g)[:-1] # In the case of a self-join, the calculated distance profile can also be # used to refine the top-k for all non-trivial subsequences if exclusion_zone is not None: for idx in np.flatnonzero(distance_profile < P[:, -1]): - pos = np.searchsorted(P[idx], distance_profile[idx], side="right") if i not in I[idx]: + pos = np.searchsorted(P[idx], distance_profile[idx], side="right") P[idx] = np.insert(P[idx], pos, distance_profile[idx])[:-1] I[idx] = np.insert(I[idx], pos, i)[:-1] From a52564fd3c0cd3fe41a09fda80283f9255e69781 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Jul 2022 17:18:09 -0600 Subject: [PATCH 352/416] Remove blank lines --- stumpy/core.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index c873b4a90..0bb59bcbe 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2618,14 +2618,12 @@ def _merge_topk_PI(PA, PB, IA, IB): for _ in range(2 * k): # 2 * k to traverse both A and B if idx >= k: break - if bj < k and PB[i, bj] < PA[i, aj]: if IB[i, bj] not in overlap: tmp_P[idx] = PB[i, bj] tmp_I[idx] = IB[i, bj] idx += 1 bj += 1 - else: tmp_P[idx] = PA[i, aj] tmp_I[idx] = IA[i, aj] From 526618cff40a77abdf948edee0db23fde209481c Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Jul 2022 17:20:48 -0600 Subject: [PATCH 353/416] Fix typo --- tests/test_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_core.py b/tests/test_core.py index 7b4034f08..33a61c427 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1181,7 +1181,7 @@ def test_merge_topk_ρI_with_overlap(): ρB[i, col_IDX] = ρA[i, col_IDX] + imprecision IB[i, col_IDX] = IA[i, col_IDX] - # sort each row of PA/PB (and update IA/IB accordingly) + # sort each row of ρA/ρB (and update IA/IB accordingly) IDX = np.argsort(ρA, axis=1) ρA[:, :] = np.take_along_axis(ρA, IDX, axis=1) IA[:, :] = np.take_along_axis(IA, IDX, axis=1) From 36077119655b924d52d25df5961ca1da5d645d96 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Jul 2022 17:33:19 -0600 Subject: [PATCH 354/416] Improve comment --- 
tests/test_scrump.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/test_scrump.py b/tests/test_scrump.py index 332eca78a..64a7e39f8 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -878,6 +878,19 @@ def test_prescrump_self_join_KNN_no_overlap(): # overlap while computing the top-k matrix profiles and matrix profile indices. # So, there would be no duplicates in each row of top-k matrix profile indices # excluding the elements filled with `-1`. + # Let's denote `I[i]` as the array with length `k` that contains the start index + # of best-so-far top-k neighbors of `subseq i`. Also, we denote `P[i]` as their + # corresponding distances to `subseq i`. After calculating the distance + # between `subseq i` to its neighbor `j` (let's call it `d`), we can insert `j` + # into I[i] only if j is not already in I[i], and for that we need to check + # the whole array I[i]. Although one might think to perform + # `idx = np.searchosrted(P[i], d)` first followed by the check `if j not in I[i, :idx]` + # HOWEVER, the latter approach may result in duplicates(!) due to the imprecision + # in calculation of ditances. In other words, it is possible that the distance + # between subseq i and subseq j was calculated in one of previous iterations of + # updating process and its value might be slightly higher than `d`. So, althought + # j might be already in I[i], it might not be in `I[i, :idx]`. + T = np.array( [ -916.64703784, From 1b19a4574cdd11fc0507e514c6619cfd0e089a5a Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Jul 2022 17:35:50 -0600 Subject: [PATCH 355/416] Improve comments --- stumpy/core.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 0bb59bcbe..e825ddd89 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2615,7 +2615,8 @@ def _merge_topk_PI(PA, PB, IA, IB): overlap = set(IB[i]).intersection(set(IA[i])) aj, bj = 0, 0 idx = 0 - for _ in range(2 * k): # 2 * k to traverse both A and B + # 2 * k iterations are required to traverse both A and B if needed. + for _ in range(2 * k): if idx >= k: break if bj < k and PB[i, bj] < PA[i, aj]: @@ -2674,7 +2675,8 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): overlap = set(IB[i]).intersection(set(IA[i])) aj, bj = last_idx, last_idx idx = last_idx - for _ in range(2 * k): # 2 * k to traverse both A and B if needed + # 2 * k iterations are required to traverse both A and B if needed. + for _ in range(2 * k): if idx < 0: break if bj >= 0 and ρB[i, bj] > ρA[i, aj]: From bba35e11493f68ef3b49db9dfd53531c4a0eedd9 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Jul 2022 17:37:16 -0600 Subject: [PATCH 356/416] Improve docstring --- stumpy/core.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/stumpy/core.py b/stumpy/core.py index e825ddd89..70fa9dd7c 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2641,7 +2641,9 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): Merge two top-k pearson profiles `ρA` and `ρB`, and update `ρA` (in place) while always prioritizing the values of `ρA` over the values of `ρB` in case of ties. (i.e., values from `ρB` are always inserted to the left of values from `ρA`). - Also, update `IA` accordingly. + Also, update `IA` accordingly. In case of overlapping values between two arrays + IA[i] and IB[i], the ones in IB[i] (and their corresponding values in ρB[i]) + are ignored throughout the updating process of IA[i] (and ρA[i]). 
Unlike `_merge_topk_PI`, where `top-k` smallest values are kept, this function keeps `top-k` largest values. From 6d4d1272231f2af9bc85c1dec6f74b53adaf2556 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Jul 2022 17:39:17 -0600 Subject: [PATCH 357/416] Remove unnecessary comments --- tests/naive.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index deee097ca..6c5404161 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1432,19 +1432,11 @@ def prescrump(T_A, m, T_B, s, exclusion_zone=None, k=1): P[i] = np.insert(P[i], pos, distance_profile[nn_idx])[:-1] I[i] = np.insert(I[i], pos, nn_idx)[:-1] - # else: the idx, i.e. 1NN of `i`, was already obtained. it may not be - # at the first index of array I[i] though! e.g. P[i] = [1e-10, 1e-9]), - # I[i] = [a, b]. Here, `b` can be the actual nn of i. However, the - # distance between `seq i` and `seq b` might be calculated alrady in one - # of previous iteration and, due to slight numerical error, it might have - # been inserted to the rigth of a. - if P[i, 0] == np.inf: I[i, 0] = -1 continue - j = nn_idx # to follow the original paper even in top-k version, we use - # the actual nn_idx rather than I[i, 0]. + j = nn_idx for g in range(1, min(s, l - i, w - j)): d = dist_matrix[i + g, j + g] if d < P[i + g, -1] and (j + g) not in I[i + g]: From e53138506dbd00982670cedb8cd2ba6fb1d1019b Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Jul 2022 17:43:01 -0600 Subject: [PATCH 358/416] passing copy of variable as input --- tests/test_core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 33a61c427..b2025a712 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1155,8 +1155,8 @@ def test_merge_topk_ρI_without_overlap(): comp_ρ = ρA.copy() comp_I = IA.copy() - naive.merge_topk_ρI(ref_ρ, ρB, ref_I, IB) - core._merge_topk_ρI(comp_ρ, ρB, comp_I, IB) + naive.merge_topk_ρI(ref_ρ, ρB.copy(), ref_I, IB.copy()) + core._merge_topk_ρI(comp_ρ, ρB.copy(), comp_I, IB.copy()) npt.assert_array_equal(ref_ρ, comp_ρ) npt.assert_array_equal(ref_I, comp_I) From 8e28aeb946bae2415807e956ff5f56ecb2a1a6a1 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Jul 2022 17:45:46 -0600 Subject: [PATCH 359/416] minor change in test functions --- tests/test_core.py | 20 ++++++++++---------- tests/test_scrump.py | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index b2025a712..eaa4dab28 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1087,8 +1087,8 @@ def test_merge_topk_PI_without_overlap(): naive.merge_topk_PI(ref_P, PB.copy(), ref_I, IB.copy()) core._merge_topk_PI(comp_P, PB.copy(), comp_I, IB.copy()) - npt.assert_array_equal(ref_P, comp_P) - npt.assert_array_equal(ref_I, comp_I) + npt.assert_almost_equal(ref_P, comp_P) + npt.assert_almost_equal(ref_I, comp_I) def test_merge_topk_PI_with_overlap(): @@ -1128,8 +1128,8 @@ def test_merge_topk_PI_with_overlap(): naive.merge_topk_PI(ref_P, PB.copy(), ref_I, IB.copy()) core._merge_topk_PI(comp_P, PB.copy(), comp_I, IB.copy()) - npt.assert_array_equal(ref_P, comp_P) - npt.assert_array_equal(ref_I, comp_I) + npt.assert_almost_equal(ref_P, comp_P) + npt.assert_almost_equal(ref_I, comp_I) def test_merge_topk_ρI_without_overlap(): @@ -1158,8 +1158,8 @@ def test_merge_topk_ρI_without_overlap(): naive.merge_topk_ρI(ref_ρ, ρB.copy(), ref_I, IB.copy()) core._merge_topk_ρI(comp_ρ, ρB.copy(), comp_I, IB.copy()) - 
npt.assert_array_equal(ref_ρ, comp_ρ) - npt.assert_array_equal(ref_I, comp_I) + npt.assert_almost_equal(ref_ρ, comp_ρ) + npt.assert_almost_equal(ref_I, comp_I) def test_merge_topk_ρI_with_overlap(): @@ -1199,8 +1199,8 @@ def test_merge_topk_ρI_with_overlap(): naive.merge_topk_ρI(ref_ρ, ρB.copy(), ref_I, IB.copy()) core._merge_topk_ρI(comp_ρ, ρB.copy(), comp_I, IB.copy()) - npt.assert_array_equal(ref_ρ, comp_ρ) - npt.assert_array_equal(ref_I, comp_I) + npt.assert_almost_equal(ref_ρ, comp_ρ) + npt.assert_almost_equal(ref_I, comp_I) def test_shift_insert_at_index(): @@ -1222,7 +1222,7 @@ def test_shift_insert_at_index(): comp, idx, v, shift="right" ) # update comp in place - npt.assert_array_equal(ref, comp) + npt.assert_almost_equal(ref, comp) # test shift = "left" for (idx, v) in zip(indices, values): @@ -1234,7 +1234,7 @@ def test_shift_insert_at_index(): comp, idx, v, shift="left" ) # update comp in place - npt.assert_array_equal(ref, comp) + npt.assert_almost_equal(ref, comp) def test_check_P(): diff --git a/tests/test_scrump.py b/tests/test_scrump.py index 64a7e39f8..5dfeddf52 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -976,4 +976,4 @@ def test_prescrump_self_join_KNN_no_overlap(): comp_P, comp_I = prescrump(T, m, s=1, k=k) npt.assert_almost_equal(ref_P, comp_P) - npt.assert_array_equal(ref_I, comp_I) + npt.assert_almost_equal(ref_I, comp_I) From be1d1e77ea04c6f89b031ac10121e1b7a7897795 Mon Sep 17 00:00:00 2001 From: ninimama Date: Thu, 28 Jul 2022 18:21:05 -0600 Subject: [PATCH 360/416] Correct style --- tests/test_scrump.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/test_scrump.py b/tests/test_scrump.py index 5dfeddf52..1153f2790 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -884,12 +884,13 @@ def test_prescrump_self_join_KNN_no_overlap(): # between `subseq i` to its neighbor `j` (let's call it `d`), we can insert `j` # into I[i] only if j is not already in I[i], and for that we need to check # the whole array I[i]. Although one might think to perform - # `idx = np.searchosrted(P[i], d)` first followed by the check `if j not in I[i, :idx]` - # HOWEVER, the latter approach may result in duplicates(!) due to the imprecision - # in calculation of ditances. In other words, it is possible that the distance - # between subseq i and subseq j was calculated in one of previous iterations of - # updating process and its value might be slightly higher than `d`. So, althought - # j might be already in I[i], it might not be in `I[i, :idx]`. + # `idx = np.searchosrted(P[i], d)` first followed by the check `if j not in + # `I[i, :idx]`. HOWEVER, the latter approach may result in duplicates(!) due + # to the imprecision in calculation of ditances. In other words, it is possible + # that the distance between subseq i and subseq j was calculated in one of + # previous iterations of updating process and its value might be slightly higher + # than `d`. So, althought j might be already in I[i], it might not be in + # `I[i, :idx]`. 
T = np.array( [ From 956fc31679fec83f673d7f6a363409fd1c6f0bd6 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Jul 2022 11:04:27 -0600 Subject: [PATCH 361/416] Revise comment --- tests/test_scrump.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/tests/test_scrump.py b/tests/test_scrump.py index 1153f2790..ead48130f 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -874,23 +874,21 @@ def test_prescrump_A_B_join_larger_window_m_5_k_5(T_A, T_B): def test_prescrump_self_join_KNN_no_overlap(): - # This test is designed to ensure that the performant version prescrump avoids - # overlap while computing the top-k matrix profiles and matrix profile indices. - # So, there would be no duplicates in each row of top-k matrix profile indices - # excluding the elements filled with `-1`. + # This test is particularly designed to raise error in some rare cases. # Let's denote `I[i]` as the array with length `k` that contains the start index # of best-so-far top-k neighbors of `subseq i`. Also, we denote `P[i]` as their - # corresponding distances to `subseq i`. After calculating the distance - # between `subseq i` to its neighbor `j` (let's call it `d`), we can insert `j` - # into I[i] only if j is not already in I[i], and for that we need to check - # the whole array I[i]. Although one might think to perform - # `idx = np.searchosrted(P[i], d)` first followed by the check `if j not in - # `I[i, :idx]`. HOWEVER, the latter approach may result in duplicates(!) due - # to the imprecision in calculation of ditances. In other words, it is possible + # corresponding ascendingly-sorted distances to `subseq i`. After calculating + # the distance between `subseq i` to its neighbor `j` (let's call it `d`), `j` + # is eligible to be inserted into I[i] only if `j` is not already in I[i]. + # Otherwise, we will have duplicates in I[i]. One might think to first perform + # `idx = np.searchosrted(P[i], d, side="right")` and then check if `j` is in + # `I[i, :idx]` or not. HOWEVER, the latter approach may result in duplicates(!) + # due to the imprecision in calculation of ditances. In other words, it is possible # that the distance between subseq i and subseq j was calculated in one of - # previous iterations of updating process and its value might be slightly higher - # than `d`. So, althought j might be already in I[i], it might not be in - # `I[i, :idx]`. + # previous iterations that value might be slightly higher than `d` (In theory, + # they should be exactly the same). So, althought j might be already in I[i], + # it might not be in `I[i, :idx]`. Hence, we need to perform a full traversal + # of I[i] and check all of its elemenets. 
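A concrete, hypothetical instance of the failure mode described in this comment, with made-up numbers:

import numpy as np

# Suppose neighbor j = 42 was inserted earlier with a slightly inflated
# distance (floating-point noise), and the same pair is now re-evaluated
# with d = 5.0:
P_i = np.array([2.0, 5.0000001, 9.0])
I_i = np.array([17, 42, 64])
d, j = 5.0, 42

idx = np.searchsorted(P_i, d, side="right")  # idx == 1
print(j in I_i[:idx])   # False -> a partial check would wrongly re-insert 42
print(j in I_i)         # True  -> the full-row check correctly rejects it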
T = np.array( [ From 9b3daefe5a3c8e4af0d985cfb731a5facd333400 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Jul 2022 11:07:52 -0600 Subject: [PATCH 362/416] Remove comment --- stumpy/scrump.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index de514f6cf..2867d4872 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -119,11 +119,6 @@ def _compute_PI( squared_distance_profile[nn_idx] < P_squared[thread_idx, i, -1] and nn_idx not in I[thread_idx, i] ): - # Since the top-k values for the `i`-th subsequence may already - # be updated/populated in other previous iterations (i.e., not all - # values in `I[thread_idx]` are equal to `-1` or not all values in - # `P_squared[thread_idx, i]` are equal to `np.inf`), we must - # shift-insert here rather than assign values to the first element. idx = np.searchsorted( P_squared[thread_idx, i], squared_distance_profile[nn_idx], From 6abd601804fd8a53722e7aa2faee8fa0974841b2 Mon Sep 17 00:00:00 2001 From: ninimama Date: Fri, 29 Jul 2022 11:24:37 -0600 Subject: [PATCH 363/416] Revise comment --- tests/test_scrump.py | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/tests/test_scrump.py b/tests/test_scrump.py index ead48130f..b927ebf92 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -874,21 +874,25 @@ def test_prescrump_A_B_join_larger_window_m_5_k_5(T_A, T_B): def test_prescrump_self_join_KNN_no_overlap(): - # This test is particularly designed to raise error in some rare cases. - # Let's denote `I[i]` as the array with length `k` that contains the start index - # of best-so-far top-k neighbors of `subseq i`. Also, we denote `P[i]` as their - # corresponding ascendingly-sorted distances to `subseq i`. After calculating - # the distance between `subseq i` to its neighbor `j` (let's call it `d`), `j` - # is eligible to be inserted into I[i] only if `j` is not already in I[i]. - # Otherwise, we will have duplicates in I[i]. One might think to first perform - # `idx = np.searchosrted(P[i], d, side="right")` and then check if `j` is in - # `I[i, :idx]` or not. HOWEVER, the latter approach may result in duplicates(!) - # due to the imprecision in calculation of ditances. In other words, it is possible - # that the distance between subseq i and subseq j was calculated in one of - # previous iterations that value might be slightly higher than `d` (In theory, - # they should be exactly the same). So, althought j might be already in I[i], - # it might not be in `I[i, :idx]`. Hence, we need to perform a full traversal - # of I[i] and check all of its elemenets. + # This test is particularly designed to raise error in a rare case described + # as follows: Let's denote `I[i]` as the array with length `k` that contains + # the start index of the best-so-far top-k nearest neighbors of `subseq i`, + # (`S_i`). Also, we denote `P[i]` as their corresponding ascendingly-sorted + # distances to `subseq i`. After calculating the distance between `subseq i` + # to its neighbor `subseq j` (`S_j`). Let's denote `d` as the distance between + # these two subseqs. `j` is eligible to be inserted into I[i] if `d` is less + # than the `P[i, -1]` and if `j` is not in I[i]. One might think to first perform + # `idx = np.searchosrted(P[i], d, side="right")` and then check if `j` + # is in `I[i, :idx]` or not. HOWEVER, this does not suffice! The latter approach + # may result in duplicates(!) due to the imprecision in calculation of ditances. 
+ # It is possible that the distance between `S_i` and `S_j` was + # calculated in one of previous iterations and that value might be slightly + # higher than `d` (In theory, they should be exactly the same!!). Thus, althought + # `j` might be already in I[i], it might not appear in `I[i, :idx]`. In other + # words, we might have `j` as the element `I[i, w]`, where `w >= idx` and hence + # P[i, w] > d). In theory, P[i, w] and d should be equal as they both show the + # same distance, i.e. the distance between `S_i` and `S_j`. + # To sum up, we need to search whole I[i] for `j`. T = np.array( [ From 355c8e51af172e972ba0200f209c53751320ddbe Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 8 Aug 2022 18:01:29 -0600 Subject: [PATCH 364/416] Fix format --- tests/naive.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/naive.py b/tests/naive.py index a5f8574d1..af14a9016 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1880,4 +1880,3 @@ def find_matches(D, excl_zone, max_distance, max_matches=None): matches = [x for x in matches if x < idx - excl_zone or x > idx + excl_zone] return np.array(result[:max_matches], dtype=object) - From 04685c7e17125fdf01e5debe1ef116358afccfba Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 8 Aug 2022 18:06:54 -0600 Subject: [PATCH 365/416] Remove unnecessary newline --- stumpy/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/stumpy/core.py b/stumpy/core.py index 7573b19f8..5cd511a9f 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2669,7 +2669,6 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): None """ k = ρA.shape[1] - tmp_ρ = np.empty(k, dtype=np.float64) tmp_I = np.empty(k, dtype=np.int64) last_idx = k - 1 From ba7b6ca8ba57a0b0fa9b759fc1967a063efd123b Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 8 Aug 2022 18:26:45 -0600 Subject: [PATCH 366/416] Return 1D array for matrix profile when `k` is 1 --- stumpy/scrump.py | 40 ++++++++++++++++++++++++---------------- tests/naive.py | 4 ++++ tests/test_scrump.py | 32 ++++++++++++++++---------------- 3 files changed, 44 insertions(+), 32 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 2867d4872..2328bf09f 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -517,15 +517,16 @@ class scrump: Attributes ---------- P_ : numpy.ndarray - The updated (top-k) matrix profile. When k=1 (default), the first (and only) - column in this 2D array consists of the matrix profile. When k > 1, the output - has exactly k columns consisting of the top-k matrix profile. + The updated (top-k) matrix profile. When `k=1` (default), this output is + a 1D array consisting of the matrix profile values. When `k > 1`, the output + is a 2D array that has exactly `k` columns consisting of the top-k matrix + profile values. I_ : numpy.ndarray - The updated (top-k) matrix profile indices. When k=1 (default), the first - (and only) column in this 2D array consists of the matrix profile indices. - When k > 1, the output has exactly k columns consisting of the top-k matrix - profile indices. + The updated (top-k) matrix profile indices. When `k=1` (default), this output is + a 1D array consisting of the matrix profile indices. When `k > 1`, the output + is a 2D array that has exactly `k` columns consisting of the top-k matrix + profile indiecs. left_I_ : numpy.ndarray The updated left (top-1) matrix profile indices @@ -789,21 +790,28 @@ def update(self): @property def P_(self): """ - Get the updated (top-k) matrix profile. 
When k=1 (default), the first (and only) - column in this 2D array consists of the matrix profile. When k > 1, the output - has exactly `k` columns consisting of the top-k matrix profile. + Get the updated (top-k) matrix profile. When `k=1` (default), this output is + a 1D array consisting of the updated matrix profile values. When `k > 1`, + the output is a 2D array that has exactly `k` columns consisting of the + updated top-k matrix profile values. """ - return self._P.astype(np.float64) + if self._k == 1: + return self._P.flatten().astype(np.float64) + else: + return self._P.astype(np.float64) @property def I_(self): """ - Get the updated (top-k) matrix profile indices. When k=1 (default), the - first (and only) column in this 2D array consists of the matrix profile - indices. When k > 1, the output has exactly `k` columns consisting of the top-k - matrix profile indices. + Get the updated (top-k) matrix profile indices. When `k=1` (default), this + output is a 1D array consisting of the updated matrix profile indices. When + `k > 1`, the output is a 2D array that has exactly `k` columns consisting + of the updated top-k matrix profile indices. """ - return self._I.astype(np.int64) + if self._k == 1: + return self._I.flatten().astype(np.int64) + else: + return self._I.astype(np.int64) @property def left_I_(self): diff --git a/tests/naive.py b/tests/naive.py index af14a9016..da88a6176 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1541,6 +1541,10 @@ def scrump(T_A, m, T_B, percentage, exclusion_zone, pre_scrump, s, k=1): PR[i] = d IR[i] = i + g + if k == 1: + P = P.flatten() + I = I.flatten() + return P, I, IL, IR diff --git a/tests/test_scrump.py b/tests/test_scrump.py index b927ebf92..41843df33 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -254,8 +254,8 @@ def test_scrump_self_join_full(T_A, T_B): npt.assert_almost_equal(ref_right_I, comp_right_I) ref_mp = stump(T_B, m, ignore_trivial=True) - ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 - ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 + ref_P = ref_mp[:, 0] + ref_I = ref_mp[:, 1] ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] @@ -271,8 +271,8 @@ def test_scrump_A_B_join_full(T_A, T_B): m = 3 ref_mp = naive.stump(T_A, m, T_B=T_B, row_wise=True) - ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 - ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 + ref_P = ref_mp[:, 0] + ref_I = ref_mp[:, 1] ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] @@ -292,8 +292,8 @@ def test_scrump_A_B_join_full(T_A, T_B): npt.assert_almost_equal(ref_right_I, comp_right_I) ref_mp = stump(T_A, m, T_B=T_B, ignore_trivial=False) - ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 - ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 + ref_P = ref_mp[:, 0] + ref_I = ref_mp[:, 1] ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] @@ -309,8 +309,8 @@ def test_scrump_A_B_join_full_swap(T_A, T_B): m = 3 ref_mp = naive.stump(T_B, m, T_B=T_A, row_wise=True) - ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 - ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 + ref_P = ref_mp[:, 0] + ref_I = ref_mp[:, 1] ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] @@ -337,8 +337,8 @@ def test_scrump_self_join_full_larger_window(T_A, T_B, m): zone = int(np.ceil(m / 4)) ref_mp = naive.stump(T_B, m, exclusion_zone=zone, row_wise=True) - ref_P = ref_mp[:, 
0].reshape(-1, 1) # to match shape of comp_P when k=1 - ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 + ref_P = ref_mp[:, 0] + ref_I = ref_mp[:, 1] ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] @@ -440,8 +440,8 @@ def test_scrump_plus_plus_self_join_full(T_A, T_B): zone = int(np.ceil(m / 4)) ref_mp = naive.stump(T_B, m, exclusion_zone=zone, row_wise=True) - ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 - ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 + ref_P = ref_mp[:, 0] + ref_I = ref_mp[:, 1] ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] @@ -469,8 +469,8 @@ def test_scrump_plus_plus_A_B_join_full(T_A, T_B): zone = int(np.ceil(m / 4)) ref_mp = naive.stump(T_A, m, T_B=T_B, row_wise=True) - ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 - ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 + ref_P = ref_mp[:, 0] + ref_I = ref_mp[:, 1] ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] @@ -498,8 +498,8 @@ def test_scrump_plus_plus_A_B_join_full_swap(T_A, T_B): zone = int(np.ceil(m / 4)) ref_mp = naive.stump(T_B, m, T_B=T_A, row_wise=True) - ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 - ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 + ref_P = ref_mp[:, 0] + ref_I = ref_mp[:, 1] ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] From 36a7fcbd20d1c7b4637c7db86a0b7b929779495c Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 8 Aug 2022 18:30:43 -0600 Subject: [PATCH 367/416] Remove unnecessary flattening operatiton on array --- stumpy/stimp.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stumpy/stimp.py b/stumpy/stimp.py index c67c005a6..c377a69aa 100644 --- a/stumpy/stimp.py +++ b/stumpy/stimp.py @@ -214,11 +214,12 @@ def update(self): ignore_trivial=True, percentage=self._percentage, pre_scrump=self._pre_scrump, + k=1, ) approx.update() self._PAN[ self._bfs_indices[self._n_processed], : approx.P_.shape[0] - ] = approx.P_.flatten() + ] = approx.P_ else: out = self._mp_func( self._T, From 4f1b2dcf9d651526cbc0ff95f840cf77f0af8f00 Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 8 Aug 2022 18:35:38 -0600 Subject: [PATCH 368/416] Fix comments --- stumpy/stump.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stumpy/stump.py b/stumpy/stump.py index 3b8c74bed..282cea0f9 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -474,12 +474,12 @@ def _stump( # update top-k arrays core._merge_topk_ρI(ρ[0], ρ[thread_idx], I[0], I[thread_idx]) - # update left matrix profile and matrix profile indices + # update left matrix profile and matrix profile indices mask = ρL[0] < ρL[thread_idx] ρL[0][mask] = ρL[thread_idx][mask] IL[0][mask] = IL[thread_idx][mask] - # update right matrix profile and matrix profile indices + # update right matrix profile and matrix profile indices mask = ρR[0] < ρR[thread_idx] ρR[0][mask] = ρR[thread_idx][mask] IR[0][mask] = IR[thread_idx][mask] From aa52529c60b4955ba85619ba61196251418bde8c Mon Sep 17 00:00:00 2001 From: ninimama Date: Mon, 8 Aug 2022 19:09:49 -0600 Subject: [PATCH 369/416] Make matrix profile and mp index 1D when k=1 --- stumpy/stumpi.py | 25 ++++++++++++++++--------- tests/naive.py | 18 +++++++++++++++--- tests/test_stumpi.py | 16 ++++++++-------- 3 files changed, 39 insertions(+), 20 deletions(-) diff --git a/stumpy/stumpi.py b/stumpy/stumpi.py index b5c4b5a93..82fac2da8 100644 --- a/stumpy/stumpi.py +++ b/stumpy/stumpi.py @@ 
-345,21 +345,28 @@ def _update(self, t): @property def P_(self): """ - Get the (top-k) matrix profile. When `k=1` (default), the first (and only) - column in this 2D array consists of the matrix profile. When `k > 1`, the - output has exactly `k` columns consisting of the top-k matrix profile. + Get the (top-k) matrix profile. When `k=1` (default), the output is + a 1D array consisting of the matrix profile. When `k > 1`, the + output is a 2D array that has exactly `k` columns and it consists of the + top-k matrix profile. """ - return self._P.astype(np.float64) + if self._k == 1: + return self._P.flatten().astype(np.float64) + else: + return self._P.astype(np.float64) @property def I_(self): """ - Get the (top-k) matrix profile indices. When `k=1` (default), the first - (and only) column in this 2D array consists of the matrix profile indices. - When `k > 1`, the output has exactly `k` columns consisting of the top-k - matrix profile indices. + Get the (top-k) matrix profile indices. When `k=1` (default), the output is + a 1D array consisting of the matrix profile indices. When `k > 1`, the + output is a 2D array that has exactly `k` columns and it consists of the + top-k matrix profile indices. """ - return self._I.astype(np.int64) + if self._k == 1: + return self._I.flatten().astype(np.int64) + else: + return self._I.astype(np.int64) @property def left_P_(self): diff --git a/tests/naive.py b/tests/naive.py index da88a6176..198915e97 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -800,7 +800,7 @@ def __init__(self, T, m, excl_zone=None, k=1): self._excl_zone = int(np.ceil(self._m / config.STUMPY_EXCL_ZONE_DENOM)) self._l = self._T.shape[0] - m + 1 - mp = stump(T, m, exclusion_zone=self._excl_zone, k=k) + mp = stump(T, m, exclusion_zone=self._excl_zone, k=self._k) self.P_ = mp[:, :k].astype(np.float64) self.I_ = mp[:, k : 2 * k].astype(np.int64) @@ -814,7 +814,15 @@ def __init__(self, T, m, excl_zone=None, k=1): self._n_appended = 0 + if self._k == 1: + self.P_ = self.P_.flatten() + self.I_ = self.I_.flatten() + def update(self, t): + if self._k == 1: + self.P_ = self.P_.reshape(-1, 1) + self.I_ = self.I_.reshape(-1, 1) + self._T[:] = np.roll(self._T, -1) self._T_isfinite[:] = np.roll(self._T_isfinite, -1) if np.isfinite(t): @@ -825,8 +833,8 @@ def update(self, t): self._T[-1] = 0 self._n_appended += 1 - self.P_[:, :] = np.roll(self.P_, -1, axis=0) - self.I_[:, :] = np.roll(self.I_, -1, axis=0) + self.P_ = np.roll(self.P_, -1, axis=0) + self.I_ = np.roll(self.I_, -1, axis=0) self.left_P_[:] = np.roll(self.left_P_, -1) self.left_I_[:] = np.roll(self.left_I_, -1) @@ -859,6 +867,10 @@ def update(self, t): self.left_P_[-1] = self.P_[-1, 0] self.left_I_[-1] = self.I_[-1, 0] + if self._k == 1: + self.P_ = self.P_.flatten() + self.I_ = self.I_.flatten() + def across_series_nearest_neighbors(Ts, Ts_idx, subseq_idx, m): """ diff --git a/tests/test_stumpi.py b/tests/test_stumpi.py index 13c4a6ee4..c794d9e6b 100644 --- a/tests/test_stumpi.py +++ b/tests/test_stumpi.py @@ -33,8 +33,8 @@ def test_stumpi_self_join(): comp_left_I = stream.left_I_ ref_mp = naive.stump(stream.T_, m, exclusion_zone=zone, row_wise=True) - ref_P = ref_mp[:, 0].reshape(-1, 1) - ref_I = ref_mp[:, 1].reshape(-1, 1) + ref_P = ref_mp[:, 0] + ref_I = ref_mp[:, 1] ref_left_I = ref_mp[:, 2].astype(np.int64) ref_left_P = np.full_like(ref_left_I, np.inf, dtype=np.float64) for i, j in enumerate(ref_left_I): @@ -210,8 +210,8 @@ def test_stumpi_init_nan_inf_self_join(substitute, substitution_locations): 
stream.T_[substitution_location] = substitute ref_mp = naive.stump(stream.T_, m, exclusion_zone=zone, row_wise=True) - ref_P = ref_mp[:, 0].reshape(-1, 1) - ref_I = ref_mp[:, 1].reshape(-1, 1) + ref_P = ref_mp[:, 0] + ref_I = ref_mp[:, 1] naive.replace_inf(ref_P) naive.replace_inf(comp_P) @@ -385,8 +385,8 @@ def test_stumpi_stream_nan_inf_self_join(substitute, substitution_locations): stream.T_[30:][substitution_location] = substitute ref_mp = naive.stump(stream.T_, m, exclusion_zone=zone, row_wise=True) - ref_P = ref_mp[:, 0].reshape(-1, 1) - ref_I = ref_mp[:, 1].reshape(-1, 1) + ref_P = ref_mp[:, 0] + ref_I = ref_mp[:, 1] naive.replace_inf(ref_P) naive.replace_inf(comp_P) @@ -546,7 +546,7 @@ def test_stumpi_constant_subsequence_self_join(): # comp_I = stream.I_ ref_mp = naive.stump(stream.T_, m, exclusion_zone=zone, row_wise=True) - ref_P = ref_mp[:, 0].reshape(-1, 1) + ref_P = ref_mp[:, 0] # ref_I = ref_mp[:, 1] naive.replace_inf(ref_P) @@ -701,7 +701,7 @@ def test_stumpi_identical_subsequence_self_join(): # comp_I = stream.I_ ref_mp = naive.stump(stream.T_, m, exclusion_zone=zone, row_wise=True) - ref_P = ref_mp[:, 0].reshape(-1, 1) + ref_P = ref_mp[:, 0] # ref_I = ref_mp[:, 1] naive.replace_inf(ref_P) From ab22972dbb541da1d06a6125b9129d03ef6d4233 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 9 Aug 2022 11:19:00 -0600 Subject: [PATCH 370/416] Revise tests functions --- tests/test_scrump.py | 56 ++++++++++++++++++++++++++------------------ tests/test_stumpi.py | 25 +++++++++----------- 2 files changed, 44 insertions(+), 37 deletions(-) diff --git a/tests/test_scrump.py b/tests/test_scrump.py index 41843df33..87d93ac03 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -233,8 +233,8 @@ def test_scrump_self_join_full(T_A, T_B): zone = int(np.ceil(m / 4)) ref_mp = naive.stump(T_B, m, exclusion_zone=zone, row_wise=True) - ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 - ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 + ref_P = ref_mp[:, 0] + ref_I = ref_mp[:, 1] ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] @@ -369,10 +369,15 @@ def test_scrump_plus_plus_self_join(T_A, T_B, percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_P, ref_I = naive.prescrump(T_B, m, T_B, s=s, exclusion_zone=zone) + ref_P, ref_I = naive.prescrump(T_B, m, T_B, s=s, exclusion_zone=zone, k=1) ref_P_aux, ref_I_aux, _, _ = naive.scrump( - T_B, m, T_B, percentage, zone, True, s + T_B, m, T_B, percentage, zone, True, s, k=1 ) + + # ref_P and ref_I are always 2D arrays. naive.scrump, howeve, gives + # 1D array when k=1. + ref_P_aux = ref_P_aux.reshape(-1, 1) + ref_I_aux = ref_I_aux.reshape(-1, 1) naive.merge_topk_PI(ref_P, ref_P_aux, ref_I, ref_I_aux) np.random.seed(seed) @@ -386,6 +391,8 @@ def test_scrump_plus_plus_self_join(T_A, T_B, percentages): naive.replace_inf(ref_P) naive.replace_inf(comp_P) + ref_P = ref_P.flatten() + ref_I = ref_I.flatten() npt.assert_almost_equal(ref_P, comp_P) npt.assert_almost_equal(ref_I, comp_I) @@ -401,11 +408,16 @@ def test_scrump_plus_plus_A_B_join(T_A, T_B, percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_P, ref_I = naive.prescrump(T_A, m, T_B, s=s) + ref_P, ref_I = naive.prescrump(T_A, m, T_B, s=s, k=1) ref_P_aux, ref_I_aux, ref_left_I_aux, ref_right_I_aux = naive.scrump( - T_A, m, T_B, percentage, None, False, None + T_A, m, T_B, percentage, None, False, None, k=1 ) + + # ref_P and ref_I are always 2D arrays. 
naive.scrump, howeve, gives + # 1D array when k=1 + ref_P_aux = ref_P_aux.reshape(-1, 1) + ref_I_aux = ref_I_aux.reshape(-1, 1) naive.merge_topk_PI(ref_P, ref_P_aux, ref_I, ref_I_aux) ref_left_I = ref_left_I_aux ref_right_I = ref_right_I_aux @@ -428,6 +440,8 @@ def test_scrump_plus_plus_A_B_join(T_A, T_B, percentages): naive.replace_inf(ref_P) naive.replace_inf(comp_P) + ref_P = ref_P.flatten() + ref_I = ref_I.flatten() npt.assert_almost_equal(ref_P, comp_P) npt.assert_almost_equal(ref_I, comp_I) npt.assert_almost_equal(ref_left_I, comp_left_I) @@ -876,24 +890,20 @@ def test_prescrump_A_B_join_larger_window_m_5_k_5(T_A, T_B): def test_prescrump_self_join_KNN_no_overlap(): # This test is particularly designed to raise error in a rare case described # as follows: Let's denote `I[i]` as the array with length `k` that contains - # the start index of the best-so-far top-k nearest neighbors of `subseq i`, + # the start indices of the best-so-far top-k nearest neighbors of `subseq i`, # (`S_i`). Also, we denote `P[i]` as their corresponding ascendingly-sorted - # distances to `subseq i`. After calculating the distance between `subseq i` - # to its neighbor `subseq j` (`S_j`). Let's denote `d` as the distance between - # these two subseqs. `j` is eligible to be inserted into I[i] if `d` is less - # than the `P[i, -1]` and if `j` is not in I[i]. One might think to first perform - # `idx = np.searchosrted(P[i], d, side="right")` and then check if `j` - # is in `I[i, :idx]` or not. HOWEVER, this does not suffice! The latter approach - # may result in duplicates(!) due to the imprecision in calculation of ditances. - # It is possible that the distance between `S_i` and `S_j` was - # calculated in one of previous iterations and that value might be slightly - # higher than `d` (In theory, they should be exactly the same!!). Thus, althought - # `j` might be already in I[i], it might not appear in `I[i, :idx]`. In other - # words, we might have `j` as the element `I[i, w]`, where `w >= idx` and hence - # P[i, w] > d). In theory, P[i, w] and d should be equal as they both show the - # same distance, i.e. the distance between `S_i` and `S_j`. - # To sum up, we need to search whole I[i] for `j`. - + # distances. Let's denote `d` as the distane betweeen `S_i` and `S_j`. P[i] and + # I[i] must be updated if (1) `j` is not in I[i] and (2) `d` < P[i,-1]. Regarding + # the former condition, one needs to check the whole array I[i]. Checking the + # array I[i, :idx], where `idx = np.searchsorted(P[i], 'd', side='right')` is + # not completly correct and that is due to imprecision in numerical calculation. + # It may happen that `j` is not in `I[i, :idx]`, but it is in fact at `I[i, idx]` + # (or any other position in array I[i]). And, its corresponding distance, i.e + # P[i, idx], is d + 1e-5, for instance. In theory, this should be exactly `d`. + # However, due to imprecision, we may calculated a slightly different value + # for such distance in one of previous iterations in function prescrump. This + # test results in error if someone tries to change the performant code of prescrump + # function and check `I[i, :idx]` rather than the full array `I[i]`. 
T = np.array( [ -916.64703784, diff --git a/tests/test_stumpi.py b/tests/test_stumpi.py index c794d9e6b..5ab2023a7 100644 --- a/tests/test_stumpi.py +++ b/tests/test_stumpi.py @@ -868,22 +868,19 @@ def test_stumpi_profile_index_match(): t = T_full[i] stream.update(t) - P[:, :] = np.inf - mask = stream.I_ >= 0 - - for j in range(P.shape[1]): # `j` as j-th nearest neighbor - IDX = np.flatnonzero(mask[:, j]) - P[IDX, j] = naive.distance( - naive.z_norm(T_full_subseq[IDX + n + 1], axis=1), - naive.z_norm(T_full_subseq[stream.I_[IDX, j]], axis=1), - axis=1, - ) + P[:] = np.inf + indices = np.argwhere(stream.I_ >= 0).flatten() + P[indices] = naive.distance( + naive.z_norm(T_full_subseq[indices + n + 1], axis=1), + naive.z_norm(T_full_subseq[stream.I_[indices]], axis=1), + axis=1, + ) left_P[:] = np.inf - idx = np.argwhere(stream.left_I_ >= 0).flatten() - left_P[idx] = naive.distance( - naive.z_norm(T_full_subseq[idx + n + 1], axis=1), - naive.z_norm(T_full_subseq[stream.left_I_[idx]], axis=1), + indices = np.argwhere(stream.left_I_ >= 0).flatten() + left_P[indices] = naive.distance( + naive.z_norm(T_full_subseq[indices + n + 1], axis=1), + naive.z_norm(T_full_subseq[stream.left_I_[indices]], axis=1), axis=1, ) From 249d928ea98ff7bf0c78f5c9746897d526192a54 Mon Sep 17 00:00:00 2001 From: ninimama Date: Tue, 9 Aug 2022 13:27:27 -0600 Subject: [PATCH 371/416] Improve Docstrings --- stumpy/scrump.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 2328bf09f..d259d4307 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -518,9 +518,9 @@ class scrump: ---------- P_ : numpy.ndarray The updated (top-k) matrix profile. When `k=1` (default), this output is - a 1D array consisting of the matrix profile values. When `k > 1`, the output + a 1D array consisting of the matrix profile. When `k > 1`, the output is a 2D array that has exactly `k` columns consisting of the top-k matrix - profile values. + profile. I_ : numpy.ndarray The updated (top-k) matrix profile indices. When `k=1` (default), this output is @@ -790,10 +790,10 @@ def update(self): @property def P_(self): """ - Get the updated (top-k) matrix profile. When `k=1` (default), this output is - a 1D array consisting of the updated matrix profile values. When `k > 1`, - the output is a 2D array that has exactly `k` columns consisting of the - updated top-k matrix profile values. + Get the updated (top-k) matrix profile. When `k=1` (default), this output + is a 1D array consisting of the updated matrix profile. When `k > 1`, the + output is a 2D array that has exactly `k` columns consisting of the updated + top-k matrix profile. 
""" if self._k == 1: return self._P.flatten().astype(np.float64) From 5e515c4d1b83d6c8a1b41ae44366b0bf54fb8efa Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Sun, 28 Aug 2022 12:34:06 -0600 Subject: [PATCH 372/416] Make prescrump output 1D when k is one --- stumpy/scrump.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 2328bf09f..ea5ebaf5d 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -455,7 +455,10 @@ def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0, k=1): k, ) - return P, I + if k == 1: + return P.flatten().astype(np.float64), I.flatten().astype(np.int64) + else: + return P, I @core.non_normalized( @@ -715,7 +718,11 @@ def __init__( else: P, I = prescrump(T_A, m, T_B=T_B, s=s, k=self._k) - core._merge_topk_PI(self._P, P, self._I, I) + # P and I are 1D when `self._k` is 1. So, we should reshape them + # before passing them to `_merge_topk_PI` + core._merge_topk_PI( + self._P, P.reshape(-1, self._k), self._I, I.reshape(-1, self._k) + ) if self._ignore_trivial: self._diags = np.random.permutation( From 752a22c89ab8fe6fa2230027ac3f98347b558690 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Sun, 28 Aug 2022 12:40:19 -0600 Subject: [PATCH 373/416] minor change --- tests/naive.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 198915e97..1d03627e2 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -819,9 +819,9 @@ def __init__(self, T, m, excl_zone=None, k=1): self.I_ = self.I_.flatten() def update(self, t): - if self._k == 1: - self.P_ = self.P_.reshape(-1, 1) - self.I_ = self.I_.reshape(-1, 1) + # ensure than self.P_ and self.I_ are 2D + self.P_ = self.P_.reshape(-1, self._k) + self.I_ = self.I_.reshape(-1, self._k) self._T[:] = np.roll(self._T, -1) self._T_isfinite[:] = np.roll(self._T_isfinite, -1) @@ -1491,6 +1491,10 @@ def prescrump(T_A, m, T_B, s, exclusion_zone=None, k=1): P[idx] = np.insert(P[idx], pos, distance_profile[idx])[:-1] I[idx] = np.insert(I[idx], pos, i)[:-1] + if k == 1: + P = P.flatten() + I = I.flatten() + return P, I From e1f49afb497afbdeff890d922decf61af7563750 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Sun, 28 Aug 2022 12:54:53 -0600 Subject: [PATCH 374/416] update test functions --- tests/naive.py | 7 +++++++ tests/test_scrump.py | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 1d03627e2..d6ce20adb 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -867,6 +867,7 @@ def update(self, t): self.left_P_[-1] = self.P_[-1, 0] self.left_I_[-1] = self.I_[-1, 0] + # post-processing: ensure that self.P_ and self.I_ is 1D. 
if self._k == 1: self.P_ = self.P_.flatten() self.I_ = self.I_.flatten() @@ -1827,6 +1828,12 @@ def _total_diagonal_ndists(tile_lower_diag, tile_upper_diag, tile_height, tile_w def merge_topk_PI(PA, PB, IA, IB): k = PA.shape[1] + if k == 1: + mask = PB < PA + PA[mask] = PB[mask] + IA[mask] = IB[mask] + return + for i in range(PA.shape[0]): _, _, overlap_idx_B = np.intersect1d(IA[i], IB[i], return_indices=True) PB[i, overlap_idx_B] = np.inf diff --git a/tests/test_scrump.py b/tests/test_scrump.py index 41843df33..53652a483 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -233,8 +233,8 @@ def test_scrump_self_join_full(T_A, T_B): zone = int(np.ceil(m / 4)) ref_mp = naive.stump(T_B, m, exclusion_zone=zone, row_wise=True) - ref_P = ref_mp[:, 0].reshape(-1, 1) # to match shape of comp_P when k=1 - ref_I = ref_mp[:, 1].reshape(-1, 1) # to match shape of comp_I when k=1 + ref_P = ref_mp[:, 0] # .reshape(-1, 1) # to match shape of comp_P when k=1 + ref_I = ref_mp[:, 1] # .reshape(-1, 1) # to match shape of comp_I when k=1 ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] From 6e541ea3dcffb364a88ce80db326661eef309128 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Sun, 28 Aug 2022 12:55:14 -0600 Subject: [PATCH 375/416] Modify merge_topk to support 1D input --- stumpy/core.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/stumpy/core.py b/stumpy/core.py index 5cd511a9f..900bb2696 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2609,6 +2609,12 @@ def _merge_topk_PI(PA, PB, IA, IB): None """ k = PA.shape[1] + if k == 1: + mask = PB < PA + PA[mask] = PB[mask] + IA[mask] = IB[mask] + return + tmp_P = np.empty(k, dtype=np.float64) tmp_I = np.empty(k, dtype=np.int64) for i in range(PA.shape[0]): @@ -2669,6 +2675,12 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): None """ k = ρA.shape[1] + if k == 1: + mask = ρB > ρA + ρA[mask] = ρB[mask] + IA[mask] = IB[mask] + return + tmp_ρ = np.empty(k, dtype=np.float64) tmp_I = np.empty(k, dtype=np.int64) last_idx = k - 1 From 0bff1aee1e5d84967c0429b97d8f4264c4867d99 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Sun, 28 Aug 2022 13:22:31 -0600 Subject: [PATCH 376/416] Fix merge_topk --- stumpy/core.py | 8 ++++---- tests/naive.py | 10 ++++++++-- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 900bb2696..d6e7545ea 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2608,13 +2608,13 @@ def _merge_topk_PI(PA, PB, IA, IB): ------- None """ - k = PA.shape[1] - if k == 1: + if PA.ndim == 1: mask = PB < PA PA[mask] = PB[mask] IA[mask] = IB[mask] return + k = PA.shape[1] tmp_P = np.empty(k, dtype=np.float64) tmp_I = np.empty(k, dtype=np.int64) for i in range(PA.shape[0]): @@ -2674,13 +2674,13 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): ------- None """ - k = ρA.shape[1] - if k == 1: + if ρA.ndim == 1: mask = ρB > ρA ρA[mask] = ρB[mask] IA[mask] = IB[mask] return + k = ρA.shape[1] tmp_ρ = np.empty(k, dtype=np.float64) tmp_I = np.empty(k, dtype=np.int64) last_idx = k - 1 diff --git a/tests/naive.py b/tests/naive.py index d6ce20adb..52e85bf42 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1827,13 +1827,13 @@ def _total_diagonal_ndists(tile_lower_diag, tile_upper_diag, tile_height, tile_w def merge_topk_PI(PA, PB, IA, IB): - k = PA.shape[1] - if k == 1: + if PA.ndim == 1: mask = PB < PA PA[mask] = PB[mask] IA[mask] = IB[mask] return + k = PA.shape[1] for i in range(PA.shape[0]): _, _, overlap_idx_B = np.intersect1d(IA[i], IB[i], return_indices=True) PB[i, overlap_idx_B] = np.inf @@ -1869,6 
+1869,12 @@ def merge_topk_ρI(ρA, ρB, IA, IB): # merging `ρB` and `ρA` ascendingly while choosing `ρB` over `ρA` in case of # ties: [0_B, 0_A, 0'_A, 1_B, 1'_B, 1_A], and we just need to keep the second # half of this array, and discard the first half. + if ρA.ndim == 1: + mask = ρB > ρA + ρA[mask] = ρB[mask] + IA[mask] = IB[mask] + return + k = ρA.shape[1] for i in range(ρA.shape[0]): _, _, overlap_idx_B = np.intersect1d(IA[i], IB[i], return_indices=True) From 39e5ea30f89d174c653add18f5027d201edc1d06 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Sun, 28 Aug 2022 14:15:28 -0600 Subject: [PATCH 377/416] Fix shape of variables in test functions --- tests/test_scrump.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/tests/test_scrump.py b/tests/test_scrump.py index 8141e8e39..1341dc1a1 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -233,8 +233,8 @@ def test_scrump_self_join_full(T_A, T_B): zone = int(np.ceil(m / 4)) ref_mp = naive.stump(T_B, m, exclusion_zone=zone, row_wise=True) - ref_P = ref_mp[:, 0] # .reshape(-1, 1) # to match shape of comp_P when k=1 - ref_I = ref_mp[:, 1] # .reshape(-1, 1) # to match shape of comp_I when k=1 + ref_P = ref_mp[:, 0] + ref_I = ref_mp[:, 1] ref_left_I = ref_mp[:, 2] ref_right_I = ref_mp[:, 3] @@ -374,10 +374,6 @@ def test_scrump_plus_plus_self_join(T_A, T_B, percentages): T_B, m, T_B, percentage, zone, True, s, k=1 ) - # ref_P and ref_I are always 2D arrays. naive.scrump, howeve, gives - # 1D array when k=1. - ref_P_aux = ref_P_aux.reshape(-1, 1) - ref_I_aux = ref_I_aux.reshape(-1, 1) naive.merge_topk_PI(ref_P, ref_P_aux, ref_I, ref_I_aux) np.random.seed(seed) @@ -414,10 +410,6 @@ def test_scrump_plus_plus_A_B_join(T_A, T_B, percentages): T_A, m, T_B, percentage, None, False, None, k=1 ) - # ref_P and ref_I are always 2D arrays. 
naive.scrump, howeve, gives - # 1D array when k=1 - ref_P_aux = ref_P_aux.reshape(-1, 1) - ref_I_aux = ref_I_aux.reshape(-1, 1) naive.merge_topk_PI(ref_P, ref_P_aux, ref_I, ref_I_aux) ref_left_I = ref_left_I_aux ref_right_I = ref_right_I_aux From e9fd14c2ecd81db9f047e02b7e4be9c6d2589d1a Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Sun, 28 Aug 2022 14:24:22 -0600 Subject: [PATCH 378/416] Remove unnecessary flatten operation --- tests/test_stimp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_stimp.py b/tests/test_stimp.py index f30514193..a56bec695 100644 --- a/tests/test_stimp.py +++ b/tests/test_stimp.py @@ -52,7 +52,7 @@ def test_stimp_1_percent(T): tmp_P, tmp_I = naive.prescrump(T, m, T, s=s, exclusion_zone=zone) ref_P, ref_I, _, _ = naive.scrump(T, m, T, percentage, zone, True, s) naive.merge_topk_PI(ref_P, tmp_P, ref_I, tmp_I) - ref_PAN[pan._bfs_indices[idx], : ref_P.shape[0]] = ref_P.flatten() + ref_PAN[pan._bfs_indices[idx], : ref_P.shape[0]] = ref_P # Compare raw pan cmp_PAN = pan._PAN @@ -107,7 +107,7 @@ def test_stimp_max_m(T): tmp_P, tmp_I = naive.prescrump(T, m, T, s=s, exclusion_zone=zone) ref_P, ref_I, _, _ = naive.scrump(T, m, T, percentage, zone, True, s) naive.merge_topk_PI(ref_P, tmp_P, ref_I, tmp_I) - ref_PAN[pan._bfs_indices[idx], : ref_P.shape[0]] = ref_P.flatten() + ref_PAN[pan._bfs_indices[idx], : ref_P.shape[0]] = ref_P # Compare raw pan cmp_PAN = pan._PAN From d3858298a7f52f8e0c73b86f44d71fa42f3a59d0 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Sun, 28 Aug 2022 14:54:02 -0600 Subject: [PATCH 379/416] Update test function for case k=1 --- tests/test_core.py | 64 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/tests/test_core.py b/tests/test_core.py index 18b9f3d8e..814e3f488 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1078,6 +1078,21 @@ def test_merge_topk_PI_without_overlap(): IA = np.arange(n * k).reshape(n, k) IB = IA + n * k + # if k=1, make them 1D + if k == 1: + PA = PA.reshape( + -1, + ) + IA = IA.reshape( + -1, + ) + PB = PB.reshape( + -1, + ) + IB = IB.reshape( + -1, + ) + ref_P = PA.copy() ref_I = IA.copy() @@ -1096,6 +1111,8 @@ def test_merge_topk_PI_with_overlap(): # is overlap between row IA[i] and row IB[i]. n = 50 for k in range(1, 6): + # note: we do not have overlap issue when k is 1. The `k=1` is considered + # for the sake of consistency with the `without-overlap` test function. PA = np.random.rand(n * k).reshape(n, k) PB = np.random.rand(n * k).reshape(n, k) @@ -1119,6 +1136,21 @@ def test_merge_topk_PI_with_overlap(): PB[:, :] = np.take_along_axis(PB, IDX, axis=1) IB[:, :] = np.take_along_axis(IB, IDX, axis=1) + # if k=1, make them 1D + if k == 1: + PA = PA.reshape( + -1, + ) + IA = IA.reshape( + -1, + ) + PB = PB.reshape( + -1, + ) + IB = IB.reshape( + -1, + ) + ref_P = PA.copy() ref_I = IA.copy() @@ -1149,6 +1181,21 @@ def test_merge_topk_ρI_without_overlap(): IA = np.arange(n * k).reshape(n, k) IB = IA + n * k + # if k=1, make them 1D + if k == 1: + ρA = ρA.reshape( + -1, + ) + IA = IA.reshape( + -1, + ) + ρB = ρB.reshape( + -1, + ) + IB = IB.reshape( + -1, + ) + ref_ρ = ρA.copy() ref_I = IA.copy() @@ -1167,6 +1214,8 @@ def test_merge_topk_ρI_with_overlap(): # is overlap between row IA[i] and row IB[i]. n = 50 for k in range(1, 6): + # note: we do not have overlap issue when k is 1. The `k=1` is considered + # for the sake of consistency with the `without-overlap` test function. 
ρA = np.random.rand(n * k).reshape(n, k) ρB = np.random.rand(n * k).reshape(n, k) @@ -1190,6 +1239,21 @@ def test_merge_topk_ρI_with_overlap(): ρB[:, :] = np.take_along_axis(ρB, IDX, axis=1) IB[:, :] = np.take_along_axis(IB, IDX, axis=1) + # if k=1, make them 1D + if k == 1: + ρA = ρA.reshape( + -1, + ) + IA = IA.reshape( + -1, + ) + ρB = ρB.reshape( + -1, + ) + IB = IB.reshape( + -1, + ) + ref_ρ = ρA.copy() ref_I = IA.copy() From 90ab9e36ed5ff5ebfdc22334597189c2c91ce01d Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Mon, 29 Aug 2022 13:07:20 -0600 Subject: [PATCH 380/416] revise comment --- tests/naive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/naive.py b/tests/naive.py index 52e85bf42..2e7b9d160 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -867,7 +867,7 @@ def update(self, t): self.left_P_[-1] = self.P_[-1, 0] self.left_I_[-1] = self.I_[-1, 0] - # post-processing: ensure that self.P_ and self.I_ is 1D. + # post-processing: ensure that self.P_ and self.I_ are 1D. if self._k == 1: self.P_ = self.P_.flatten() self.I_ = self.I_.flatten() From 0b163eb7f86d04602f5714e6b2e0199d40ce5ff5 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Mon, 29 Aug 2022 13:09:21 -0600 Subject: [PATCH 381/416] Avoid using return in the middle of code --- stumpy/core.py | 100 ++++++++++++++++++++++++------------------------- 1 file changed, 49 insertions(+), 51 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index d6e7545ea..e535d4163 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2612,33 +2612,32 @@ def _merge_topk_PI(PA, PB, IA, IB): mask = PB < PA PA[mask] = PB[mask] IA[mask] = IB[mask] - return - - k = PA.shape[1] - tmp_P = np.empty(k, dtype=np.float64) - tmp_I = np.empty(k, dtype=np.int64) - for i in range(PA.shape[0]): - overlap = set(IB[i]).intersection(set(IA[i])) - aj, bj = 0, 0 - idx = 0 - # 2 * k iterations are required to traverse both A and B if needed. - for _ in range(2 * k): - if idx >= k: - break - if bj < k and PB[i, bj] < PA[i, aj]: - if IB[i, bj] not in overlap: - tmp_P[idx] = PB[i, bj] - tmp_I[idx] = IB[i, bj] + else: + k = PA.shape[1] + tmp_P = np.empty(k, dtype=np.float64) + tmp_I = np.empty(k, dtype=np.int64) + for i in range(PA.shape[0]): + overlap = set(IB[i]).intersection(set(IA[i])) + aj, bj = 0, 0 + idx = 0 + # 2 * k iterations are required to traverse both A and B if needed. + for _ in range(2 * k): + if idx >= k: + break + if bj < k and PB[i, bj] < PA[i, aj]: + if IB[i, bj] not in overlap: + tmp_P[idx] = PB[i, bj] + tmp_I[idx] = IB[i, bj] + idx += 1 + bj += 1 + else: + tmp_P[idx] = PA[i, aj] + tmp_I[idx] = IA[i, aj] idx += 1 - bj += 1 - else: - tmp_P[idx] = PA[i, aj] - tmp_I[idx] = IA[i, aj] - idx += 1 - aj += 1 + aj += 1 - PA[i] = tmp_P - IA[i] = tmp_I + PA[i] = tmp_P + IA[i] = tmp_I @njit @@ -2678,34 +2677,33 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): mask = ρB > ρA ρA[mask] = ρB[mask] IA[mask] = IB[mask] - return - - k = ρA.shape[1] - tmp_ρ = np.empty(k, dtype=np.float64) - tmp_I = np.empty(k, dtype=np.int64) - last_idx = k - 1 - for i in range(len(ρA)): - overlap = set(IB[i]).intersection(set(IA[i])) - aj, bj = last_idx, last_idx - idx = last_idx - # 2 * k iterations are required to traverse both A and B if needed. 
- for _ in range(2 * k): - if idx < 0: - break - if bj >= 0 and ρB[i, bj] > ρA[i, aj]: - if IB[i, bj] not in overlap: - tmp_ρ[idx] = ρB[i, bj] - tmp_I[idx] = IB[i, bj] + else: + k = ρA.shape[1] + tmp_ρ = np.empty(k, dtype=np.float64) + tmp_I = np.empty(k, dtype=np.int64) + last_idx = k - 1 + for i in range(len(ρA)): + overlap = set(IB[i]).intersection(set(IA[i])) + aj, bj = last_idx, last_idx + idx = last_idx + # 2 * k iterations are required to traverse both A and B if needed. + for _ in range(2 * k): + if idx < 0: + break + if bj >= 0 and ρB[i, bj] > ρA[i, aj]: + if IB[i, bj] not in overlap: + tmp_ρ[idx] = ρB[i, bj] + tmp_I[idx] = IB[i, bj] + idx -= 1 + bj -= 1 + else: + tmp_ρ[idx] = ρA[i, aj] + tmp_I[idx] = IA[i, aj] idx -= 1 - bj -= 1 - else: - tmp_ρ[idx] = ρA[i, aj] - tmp_I[idx] = IA[i, aj] - idx -= 1 - aj -= 1 + aj -= 1 - ρA[i] = tmp_ρ - IA[i] = tmp_I + ρA[i] = tmp_ρ + IA[i] = tmp_I @njit From bf6df9b3d727d00c194edf766fa4c382ebe17ab5 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Tue, 30 Aug 2022 12:55:44 -0600 Subject: [PATCH 382/416] Add new private function to get 2D ouput when k=1 --- stumpy/scrump.py | 96 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 75 insertions(+), 21 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index ea5ebaf5d..d4fc9b239 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -250,7 +250,7 @@ def _compute_PI( parallel=True, fastmath=True, ) -def _prescrump( +def _compute_approx_PI( T_A, T_B, m, @@ -368,12 +368,11 @@ def _prescrump( return np.sqrt(P_squared[0]), I[0] -@core.non_normalized(scraamp.prescraamp) -def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0, k=1): +def _prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0, k=1): """ - A convenience wrapper around the Numba JIT-compiled parallelized `_prescrump` - function which computes the approximate (top-k) matrix profile according to - the preSCRIMP algorithm + A convenience wrapper around the Numba JIT-compiled parallelized + `_compute_approx_PI` function which computes the approximate (top-k) matrix + profile according to the preSCRIMP algorithm. Parameters ---------- @@ -408,15 +407,12 @@ def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0, k=1): Returns ------- P : numpy.ndarray - The (top-k) matrix profile. When k = 1 (default), the first (and only) column - in this 2D array consists of the matrix profile. When k > 1, the output has - exactly `k` columns consisting of the top-k matrix profile. + The (top-k) matrix profile. This 2D array has exactly `k` columns consisting + of the top-k matrix profile. I : numpy.ndarray - The (top-k) matrix profile indices. When k = 1 (default), the first (and only) - column in this 2D array consists of the matrix profile indices. When k > 1, - the output has exactly `k` columns consisting of the top-k matrix profile - indices. + The (top-k) matrix profile indices. This 2D array has exactly `k` columns + consisting of the top-k matrix profile indices. 
Notes ----- @@ -441,7 +437,7 @@ def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0, k=1): s = excl_zone indices = np.random.permutation(range(0, l, s)).astype(np.int64) - P, I = _prescrump( + P, I = _compute_approx_PI( T_A, T_B, m, @@ -455,6 +451,68 @@ def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0, k=1): k, ) + return P, I + + +@core.non_normalized(scraamp.prescraamp) +def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0, k=1): + """ + A convenience wrapper around `prescrump` function which computes the approximate + (top-k) matrix profile according to the preSCRIMP algorithm. The output is 1D + when `k=1`. + + Parameters + ---------- + T_A : numpy.ndarray + The time series or sequence for which to compute the matrix profile + + m : int + Window size + + T_B : numpy.ndarray, default None + The time series or sequence that will be used to annotate T_A. For every + subsequence in T_A, its nearest neighbor in T_B will be recorded. + + s : int, default None + The sampling interval that defaults to + `int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM))` + + normalize : bool, default True + When set to `True`, this z-normalizes subsequences prior to computing distances. + Otherwise, this function gets re-routed to its complementary non-normalized + equivalent set in the `@core.non_normalized` function decorator. + + p : float, default 2.0 + The p-norm to apply for computing the Minkowski distance. This parameter is + ignored when `normalize == True`. + + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. + + Returns + ------- + P : numpy.ndarray + The (top-k) matrix profile. When `k = 1` (default), this is a 1D array + consisting of the matrix profile. When `k > 1`, the output is a 2D array + that has exactly `k` columns consisting of the top-k matrix profile. + + I : numpy.ndarray + The (top-k) matrix profile indices. When `k = 1` (default), this is a 1D array + consisting of the matrix profile indices. When `k > 1`, the output is a + 2D array that has exactly `k` columns consisting of the top-k matrix profile + indices. + + Notes + ----- + `DOI: 10.1109/ICDM.2018.00099 \ + `__ + + See Algorithm 2 + """ + P, I = _prescrump(T_A, m, T_B, s, normalize, p, k) + if k == 1: return P.flatten().astype(np.float64), I.flatten().astype(np.int64) else: @@ -714,15 +772,11 @@ def __init__( if pre_scrump: if self._ignore_trivial: - P, I = prescrump(T_A, m, s=s, k=self._k) + P, I = _prescrump(T_A, m, s=s, k=self._k) else: - P, I = prescrump(T_A, m, T_B=T_B, s=s, k=self._k) + P, I = _prescrump(T_A, m, T_B=T_B, s=s, k=self._k) - # P and I are 1D when `self._k` is 1. 
So, we should reshape them - # before passing them to `_merge_topk_PI` - core._merge_topk_PI( - self._P, P.reshape(-1, self._k), self._I, I.reshape(-1, self._k) - ) + core._merge_topk_PI(self._P, P, self._I, I) if self._ignore_trivial: self._diags = np.random.permutation( From e6a05d625a176500a5e58bb187dc6771ff9883c4 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Tue, 30 Aug 2022 13:03:22 -0600 Subject: [PATCH 383/416] Remove check for 1D in merge_topk --- tests/test_core.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 814e3f488..c1e219744 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1078,21 +1078,6 @@ def test_merge_topk_PI_without_overlap(): IA = np.arange(n * k).reshape(n, k) IB = IA + n * k - # if k=1, make them 1D - if k == 1: - PA = PA.reshape( - -1, - ) - IA = IA.reshape( - -1, - ) - PB = PB.reshape( - -1, - ) - IB = IB.reshape( - -1, - ) - ref_P = PA.copy() ref_I = IA.copy() @@ -1136,21 +1121,6 @@ def test_merge_topk_PI_with_overlap(): PB[:, :] = np.take_along_axis(PB, IDX, axis=1) IB[:, :] = np.take_along_axis(IB, IDX, axis=1) - # if k=1, make them 1D - if k == 1: - PA = PA.reshape( - -1, - ) - IA = IA.reshape( - -1, - ) - PB = PB.reshape( - -1, - ) - IB = IB.reshape( - -1, - ) - ref_P = PA.copy() ref_I = IA.copy() From fe905d23d744c1b34c4d9384721265e18920502e Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Tue, 30 Aug 2022 13:27:18 -0600 Subject: [PATCH 384/416] Revise test functions --- tests/test_core.py | 72 +++++++++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 30 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index c1e219744..1765c048d 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1134,6 +1134,27 @@ def test_merge_topk_PI_with_overlap(): npt.assert_almost_equal(ref_I, comp_I) +def test_merge_topk_PI_with_1D_input(): + n = 50 + PA = np.random.rand(n) + PB = np.random.rand(n) + + IA = np.arange(n) + IB = IA + n + + ref_P = PA.copy() + ref_I = IA.copy() + + comp_P = PA.copy() + comp_I = IA.copy() + + naive.merge_topk_PI(ref_P, PB.copy(), ref_I, IB.copy()) + core._merge_topk_PI(comp_P, PB.copy(), comp_I, IB.copy()) + + npt.assert_almost_equal(ref_P, comp_P) + npt.assert_almost_equal(ref_I, comp_I) + + def test_merge_topk_ρI_without_overlap(): # This is to test function `core._merge_topk_ρI(ρA, ρB, IA, IB)` when there # is no overlap between row IA[i] and row IB[i]. 
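(For readers following these merge tests, the row-wise semantics they exercise can be summarized with a small reference sketch; the function below is illustrative only and is not the implementation under test. It shows the matrix-profile case, which keeps the k smallest distances and lets A win ties; the pearson (ρ) variant keeps the k largest instead.)

    import numpy as np

    def merge_topk_row(PA_row, IA_row, PB_row, IB_row):
        # Drop B entries whose neighbor index already appears in A, then keep
        # the k smallest distances from the union; a stable sort keeps A's
        # entries ahead of B's on ties.
        k = PA_row.shape[0]
        keep = ~np.isin(IB_row, IA_row)
        P = np.concatenate([PA_row, PB_row[keep]])
        I = np.concatenate([IA_row, IB_row[keep]])
        order = np.argsort(P, kind="stable")[:k]
        return P[order], I[order]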
@@ -1151,21 +1172,6 @@ def test_merge_topk_ρI_without_overlap(): IA = np.arange(n * k).reshape(n, k) IB = IA + n * k - # if k=1, make them 1D - if k == 1: - ρA = ρA.reshape( - -1, - ) - IA = IA.reshape( - -1, - ) - ρB = ρB.reshape( - -1, - ) - IB = IB.reshape( - -1, - ) - ref_ρ = ρA.copy() ref_I = IA.copy() @@ -1209,21 +1215,6 @@ def test_merge_topk_ρI_with_overlap(): ρB[:, :] = np.take_along_axis(ρB, IDX, axis=1) IB[:, :] = np.take_along_axis(IB, IDX, axis=1) - # if k=1, make them 1D - if k == 1: - ρA = ρA.reshape( - -1, - ) - IA = IA.reshape( - -1, - ) - ρB = ρB.reshape( - -1, - ) - IB = IB.reshape( - -1, - ) - ref_ρ = ρA.copy() ref_I = IA.copy() @@ -1237,6 +1228,27 @@ def test_merge_topk_ρI_with_overlap(): npt.assert_almost_equal(ref_I, comp_I) +def test_merge_topk_ρI_with_1D_input(): + n = 50 + ρA = np.random.rand(n) + ρB = np.random.rand(n) + + IA = np.arange(n) + IB = IA + n + + ref_ρ = ρA.copy() + ref_I = IA.copy() + + comp_ρ = ρA.copy() + comp_I = IA.copy() + + naive.merge_topk_PI(ref_ρ, ρB.copy(), ref_I, IB.copy()) + core._merge_topk_PI(comp_ρ, ρB.copy(), comp_I, IB.copy()) + + npt.assert_almost_equal(ref_ρ, comp_ρ) + npt.assert_almost_equal(ref_I, comp_I) + + def test_shift_insert_at_index(): for k in range(1, 6): a = np.random.rand(k) From 8e8d48b447cf10abc505eb5d190243a7611fe527 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Tue, 30 Aug 2022 13:39:05 -0600 Subject: [PATCH 385/416] Revise docstring to provide description for 1D case --- stumpy/core.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index e535d4163..fd9a3e511 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2578,12 +2578,15 @@ def _select_P_ABBA_value(P_ABBA, k, custom_func=None): @njit def _merge_topk_PI(PA, PB, IA, IB): """ - Merge two top-k matrix profiles `PA` and `PB`, and update `PA` (in place) while - always prioritizing the values of `PA` over the values of `PB` in case of ties. - (i.e., values from `PB` are always inserted to the right of values from `PA`). - Also, update `IA` accordingly. In case of overlapping values between two arrays - IA[i] and IB[i], the ones in IB[i] (and their corresponding values in PB[i]) - are ignored throughout the updating process of IA[i] (and PA[i]). + Merge two top-k matrix profiles `PA` and `PB`, and update `PA` (in place). + When the inputs are 1D arrays, PA[i] is updated if it is less than PB[i]. In + such case, PA[i] and IA[i] are replaced with PB[i] and IB[i], respectively. + When the inputs are 2D arrays, always prioritizing the values of `PA` over the + values of `PB` in case of ties. (i.e., values from `PB` are always inserted to + the right of values from `PA`). Also, update `IA` accordingly. In case of + overlapping values between two arrays IA[i] and IB[i], the ones in IB[i] (and + their corresponding values in PB[i]) are ignored throughout the updating process o + f IA[i] (and PA[i]). Unlike `_merge_topk_ρI`, where `top-k` largest values are kept, this function keeps `top-k` smallest values. @@ -2643,12 +2646,15 @@ def _merge_topk_PI(PA, PB, IA, IB): @njit def _merge_topk_ρI(ρA, ρB, IA, IB): """ - Merge two top-k pearson profiles `ρA` and `ρB`, and update `ρA` (in place) while - always prioritizing the values of `ρA` over the values of `ρB` in case of ties. - (i.e., values from `ρB` are always inserted to the left of values from `ρA`). - Also, update `IA` accordingly. 
In case of overlapping values between two arrays - IA[i] and IB[i], the ones in IB[i] (and their corresponding values in ρB[i]) - are ignored throughout the updating process of IA[i] (and ρA[i]). + Merge two top-k pearson profiles `ρA` and `ρB`, and update `ρA` (in place). + When the inputs are 1D arrays, ρA[i] is updated if it is more than ρB[i]. In + such case, ρA[i] and IA[i] are replaced with ρB[i] and IB[i], respectively. + When the inputs are 2D arrays, always prioritizing the values of `ρA` over + the values of `ρB` in case of ties. (i.e., values from `ρB` are always inserted + to the left of values from `ρA`). Also, update `IA` accordingly. In case of + overlapping values between two arrays IA[i] and IB[i], the ones in IB[i] (and + their corresponding values in ρB[i]) are ignored throughout the updating process + of IA[i] (and ρA[i]). Unlike `_merge_topk_PI`, where `top-k` smallest values are kept, this function keeps `top-k` largest values. From 3bebc47457cbc1572cf138802960da57b0b42341 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Tue, 30 Aug 2022 14:29:53 -0600 Subject: [PATCH 386/416] Add overlap check in merge_topk with 1D input --- stumpy/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index fd9a3e511..6e3046f9f 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2612,7 +2612,7 @@ def _merge_topk_PI(PA, PB, IA, IB): None """ if PA.ndim == 1: - mask = PB < PA + mask = (PB < PA) & (IB != IA) PA[mask] = PB[mask] IA[mask] = IB[mask] else: @@ -2680,7 +2680,7 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): None """ if ρA.ndim == 1: - mask = ρB > ρA + mask = (ρB > ρA) & (IB != IA) ρA[mask] = ρB[mask] IA[mask] = IB[mask] else: From 4fcf797a1f4235ffa28912eb8b762dc42f7be6a6 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Wed, 31 Aug 2022 12:36:25 -0600 Subject: [PATCH 387/416] Add overlap check in 1D and revise docstring --- stumpy/core.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index e535d4163..6e3046f9f 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2578,12 +2578,15 @@ def _select_P_ABBA_value(P_ABBA, k, custom_func=None): @njit def _merge_topk_PI(PA, PB, IA, IB): """ - Merge two top-k matrix profiles `PA` and `PB`, and update `PA` (in place) while - always prioritizing the values of `PA` over the values of `PB` in case of ties. - (i.e., values from `PB` are always inserted to the right of values from `PA`). - Also, update `IA` accordingly. In case of overlapping values between two arrays - IA[i] and IB[i], the ones in IB[i] (and their corresponding values in PB[i]) - are ignored throughout the updating process of IA[i] (and PA[i]). + Merge two top-k matrix profiles `PA` and `PB`, and update `PA` (in place). + When the inputs are 1D arrays, PA[i] is updated if it is less than PB[i]. In + such case, PA[i] and IA[i] are replaced with PB[i] and IB[i], respectively. + When the inputs are 2D arrays, always prioritizing the values of `PA` over the + values of `PB` in case of ties. (i.e., values from `PB` are always inserted to + the right of values from `PA`). Also, update `IA` accordingly. In case of + overlapping values between two arrays IA[i] and IB[i], the ones in IB[i] (and + their corresponding values in PB[i]) are ignored throughout the updating process o + f IA[i] (and PA[i]). Unlike `_merge_topk_ρI`, where `top-k` largest values are kept, this function keeps `top-k` smallest values. 
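(A concrete illustration of the 1D branch described above, with made-up values; note how the index guard skips position 1, where the same neighbor is reported with a slightly different correlation due to numerical imprecision.)

    import numpy as np

    rho_A = np.array([0.90, 0.40, 0.75])
    I_A = np.array([10, 22, 35])
    rho_B = np.array([0.95, 0.41, 0.60])
    I_B = np.array([17, 22, 41])

    mask = (rho_B > rho_A) & (I_B != I_A)  # -> [True, False, False]
    rho_A[mask] = rho_B[mask]
    I_A[mask] = I_B[mask]
    # rho_A is now [0.95, 0.40, 0.75] and I_A is [17, 22, 35]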
@@ -2609,7 +2612,7 @@ def _merge_topk_PI(PA, PB, IA, IB): None """ if PA.ndim == 1: - mask = PB < PA + mask = (PB < PA) & (IB != IA) PA[mask] = PB[mask] IA[mask] = IB[mask] else: @@ -2643,12 +2646,15 @@ def _merge_topk_PI(PA, PB, IA, IB): @njit def _merge_topk_ρI(ρA, ρB, IA, IB): """ - Merge two top-k pearson profiles `ρA` and `ρB`, and update `ρA` (in place) while - always prioritizing the values of `ρA` over the values of `ρB` in case of ties. - (i.e., values from `ρB` are always inserted to the left of values from `ρA`). - Also, update `IA` accordingly. In case of overlapping values between two arrays - IA[i] and IB[i], the ones in IB[i] (and their corresponding values in ρB[i]) - are ignored throughout the updating process of IA[i] (and ρA[i]). + Merge two top-k pearson profiles `ρA` and `ρB`, and update `ρA` (in place). + When the inputs are 1D arrays, ρA[i] is updated if it is more than ρB[i]. In + such case, ρA[i] and IA[i] are replaced with ρB[i] and IB[i], respectively. + When the inputs are 2D arrays, always prioritizing the values of `ρA` over + the values of `ρB` in case of ties. (i.e., values from `ρB` are always inserted + to the left of values from `ρA`). Also, update `IA` accordingly. In case of + overlapping values between two arrays IA[i] and IB[i], the ones in IB[i] (and + their corresponding values in ρB[i]) are ignored throughout the updating process + of IA[i] (and ρA[i]). Unlike `_merge_topk_PI`, where `top-k` smallest values are kept, this function keeps `top-k` largest values. @@ -2674,7 +2680,7 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): None """ if ρA.ndim == 1: - mask = ρB > ρA + mask = (ρB > ρA) & (IB != IA) ρA[mask] = ρB[mask] IA[mask] = IB[mask] else: From 41097a7a2462fdfd6b49cf52f1030cca6cda2a6d Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Wed, 31 Aug 2022 12:36:46 -0600 Subject: [PATCH 388/416] Add separate test function for _merge_topk 1D case --- tests/test_core.py | 102 +++++++++++++++++++-------------------------- 1 file changed, 42 insertions(+), 60 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 814e3f488..1765c048d 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1078,21 +1078,6 @@ def test_merge_topk_PI_without_overlap(): IA = np.arange(n * k).reshape(n, k) IB = IA + n * k - # if k=1, make them 1D - if k == 1: - PA = PA.reshape( - -1, - ) - IA = IA.reshape( - -1, - ) - PB = PB.reshape( - -1, - ) - IB = IB.reshape( - -1, - ) - ref_P = PA.copy() ref_I = IA.copy() @@ -1136,21 +1121,6 @@ def test_merge_topk_PI_with_overlap(): PB[:, :] = np.take_along_axis(PB, IDX, axis=1) IB[:, :] = np.take_along_axis(IB, IDX, axis=1) - # if k=1, make them 1D - if k == 1: - PA = PA.reshape( - -1, - ) - IA = IA.reshape( - -1, - ) - PB = PB.reshape( - -1, - ) - IB = IB.reshape( - -1, - ) - ref_P = PA.copy() ref_I = IA.copy() @@ -1164,6 +1134,27 @@ def test_merge_topk_PI_with_overlap(): npt.assert_almost_equal(ref_I, comp_I) +def test_merge_topk_PI_with_1D_input(): + n = 50 + PA = np.random.rand(n) + PB = np.random.rand(n) + + IA = np.arange(n) + IB = IA + n + + ref_P = PA.copy() + ref_I = IA.copy() + + comp_P = PA.copy() + comp_I = IA.copy() + + naive.merge_topk_PI(ref_P, PB.copy(), ref_I, IB.copy()) + core._merge_topk_PI(comp_P, PB.copy(), comp_I, IB.copy()) + + npt.assert_almost_equal(ref_P, comp_P) + npt.assert_almost_equal(ref_I, comp_I) + + def test_merge_topk_ρI_without_overlap(): # This is to test function `core._merge_topk_ρI(ρA, ρB, IA, IB)` when there # is no overlap between row IA[i] and row IB[i]. 
@@ -1181,21 +1172,6 @@ def test_merge_topk_ρI_without_overlap(): IA = np.arange(n * k).reshape(n, k) IB = IA + n * k - # if k=1, make them 1D - if k == 1: - ρA = ρA.reshape( - -1, - ) - IA = IA.reshape( - -1, - ) - ρB = ρB.reshape( - -1, - ) - IB = IB.reshape( - -1, - ) - ref_ρ = ρA.copy() ref_I = IA.copy() @@ -1239,21 +1215,6 @@ def test_merge_topk_ρI_with_overlap(): ρB[:, :] = np.take_along_axis(ρB, IDX, axis=1) IB[:, :] = np.take_along_axis(IB, IDX, axis=1) - # if k=1, make them 1D - if k == 1: - ρA = ρA.reshape( - -1, - ) - IA = IA.reshape( - -1, - ) - ρB = ρB.reshape( - -1, - ) - IB = IB.reshape( - -1, - ) - ref_ρ = ρA.copy() ref_I = IA.copy() @@ -1267,6 +1228,27 @@ def test_merge_topk_ρI_with_overlap(): npt.assert_almost_equal(ref_I, comp_I) +def test_merge_topk_ρI_with_1D_input(): + n = 50 + ρA = np.random.rand(n) + ρB = np.random.rand(n) + + IA = np.arange(n) + IB = IA + n + + ref_ρ = ρA.copy() + ref_I = IA.copy() + + comp_ρ = ρA.copy() + comp_I = IA.copy() + + naive.merge_topk_PI(ref_ρ, ρB.copy(), ref_I, IB.copy()) + core._merge_topk_PI(comp_ρ, ρB.copy(), comp_I, IB.copy()) + + npt.assert_almost_equal(ref_ρ, comp_ρ) + npt.assert_almost_equal(ref_I, comp_I) + + def test_shift_insert_at_index(): for k in range(1, 6): a = np.random.rand(k) From 948d674d4b0fc5df3dbb2c834f79f30f3eb30d86 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Wed, 31 Aug 2022 13:03:51 -0600 Subject: [PATCH 389/416] Add preprocessing function for prescrump --- stumpy/scrump.py | 125 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 102 insertions(+), 23 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index ea5ebaf5d..1ab30c89d 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -14,6 +14,80 @@ logger = logging.getLogger(__name__) +def _preprocess_prescrump(T_A, m, T_B=None, s=None): + """ + Performs several preprocessings and returns outputs that are needed for the + prescrump algorithm. + + Parameters + ---------- + T_A : numpy.ndarray + The time series or sequence for which to compute the matrix profile + + m : int + Window size + + T_B : numpy.ndarray, default None + The time series or sequence that will be used to annotate T_A. For every + subsequence in T_A, its nearest neighbor in T_B will be recorded. + + s : int, default None + The sampling interval that defaults to + `int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM))` + + Returns + ------- + T_A : numpy.ndarray + A copy of the time series input `T_A`, where all NaN and inf values + are replaced with zero. + + T_B : numpy.ndarray + A copy of the time series input `T_B`, where all NaN and inf values + are replaced with zero. If the input `T_B` is not provided (default), + this array is just a copy of `T_A`. 
+ + μ_Q : numpy.ndarray + Sliding window mean for `T_A` + + σ_Q : numpy.ndarray + Sliding window standard deviation for `T_A` + + M_T : numpy.ndarray + Sliding window mean for `T_B` + + Σ_T : numpy.ndarray + Sliding window standard deviation for `T_B` + + indices : numpy.ndarray + The subsequence indices to compute `prescrump` for + + s : int + The sampling interval that defaults to + `int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM))` + + excl_zone : int + The half width for the exclusion zone + """ + if T_B is None: + T_B = T_A + excl_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) + else: + excl_zone = None + + T_A, μ_Q, σ_Q = core.preprocess(T_A, m) + T_B, M_T, Σ_T = core.preprocess(T_B, m) + + n_A = T_A.shape[0] + l = n_A - m + 1 + + if s is None: # pragma: no cover + s = excl_zone + + indices = np.random.permutation(range(0, l, s)).astype(np.int64) + + return (T_A, T_B, μ_Q, σ_Q, M_T, Σ_T, indices, s, excl_zone) + + @njit(fastmath=True) def _compute_PI( T_A, @@ -425,22 +499,10 @@ def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0, k=1): See Algorithm 2 """ - if T_B is None: - T_B = T_A - excl_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) - else: - excl_zone = None - - T_A, μ_Q, σ_Q = core.preprocess(T_A, m) - T_B, M_T, Σ_T = core.preprocess(T_B, m) - - n_A = T_A.shape[0] - l = n_A - m + 1 - - if s is None: # pragma: no cover - s = excl_zone + T_A, T_B, μ_Q, σ_Q, M_T, Σ_T, indices, s, excl_zone = _preprocess_prescrump( + T_A, m, T_B=T_B, s=s + ) - indices = np.random.permutation(range(0, l, s)).astype(np.int64) P, I = _prescrump( T_A, T_B, @@ -714,15 +776,32 @@ def __init__( if pre_scrump: if self._ignore_trivial: - P, I = prescrump(T_A, m, s=s, k=self._k) + ( + T_A, + T_B, + μ_Q, + σ_Q, + M_T, + Σ_T, + indices, + s, + excl_zone, + ) = _preprocess_prescrump(T_A, m, s=s) else: - P, I = prescrump(T_A, m, T_B=T_B, s=s, k=self._k) - - # P and I are 1D when `self._k` is 1. 
So, we should reshape them - # before passing them to `_merge_topk_PI` - core._merge_topk_PI( - self._P, P.reshape(-1, self._k), self._I, I.reshape(-1, self._k) - ) + ( + T_A, + T_B, + μ_Q, + σ_Q, + M_T, + Σ_T, + indices, + s, + excl_zone, + ) = _preprocess_prescrump(T_A, m, T_B=T_B, s=s) + + P, I = _prescrump(T_A, T_B, μ_Q, σ_Q, M_T, Σ_T, indices, s, excl_zone, k) + core._merge_topk_PI(self._P, P, self._I, I) if self._ignore_trivial: self._diags = np.random.permutation( From 391c97dd7c47ad3e53c6120ecaff737899833737 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Wed, 31 Aug 2022 13:42:21 -0600 Subject: [PATCH 390/416] Update test function --- tests/test_scrump.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_scrump.py b/tests/test_scrump.py index 1341dc1a1..de978c856 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -387,8 +387,8 @@ def test_scrump_plus_plus_self_join(T_A, T_B, percentages): naive.replace_inf(ref_P) naive.replace_inf(comp_P) - ref_P = ref_P.flatten() - ref_I = ref_I.flatten() + ref_P = ref_P + ref_I = ref_I npt.assert_almost_equal(ref_P, comp_P) npt.assert_almost_equal(ref_I, comp_I) @@ -432,8 +432,8 @@ def test_scrump_plus_plus_A_B_join(T_A, T_B, percentages): naive.replace_inf(ref_P) naive.replace_inf(comp_P) - ref_P = ref_P.flatten() - ref_I = ref_I.flatten() + ref_P = ref_P + ref_I = ref_I npt.assert_almost_equal(ref_P, comp_P) npt.assert_almost_equal(ref_I, comp_I) npt.assert_almost_equal(ref_left_I, comp_left_I) From 4d7cccfb1a96c70dc5e25033b644d7b7324d887b Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Wed, 31 Aug 2022 14:33:04 -0600 Subject: [PATCH 391/416] fix missing argument --- stumpy/scrump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 1ab30c89d..49825546b 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -800,7 +800,7 @@ def __init__( excl_zone, ) = _preprocess_prescrump(T_A, m, T_B=T_B, s=s) - P, I = _prescrump(T_A, T_B, μ_Q, σ_Q, M_T, Σ_T, indices, s, excl_zone, k) + P, I = _prescrump(T_A, T_B, m, μ_Q, σ_Q, M_T, Σ_T, indices, s, excl_zone, k) core._merge_topk_PI(self._P, P, self._I, I) if self._ignore_trivial: From e8814cf2181b9d5333d2b79bc0fba03a5f4e870d Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Wed, 31 Aug 2022 14:48:02 -0600 Subject: [PATCH 392/416] Fix Docstring --- stumpy/scrump.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 49825546b..fa47c44fd 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -482,14 +482,14 @@ def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0, k=1): Returns ------- P : numpy.ndarray - The (top-k) matrix profile. When k = 1 (default), the first (and only) column - in this 2D array consists of the matrix profile. When k > 1, the output has - exactly `k` columns consisting of the top-k matrix profile. + The (top-k) matrix profile. When k = 1 (default), this is a 1D array + consisting of the matrix profile. When k > 1, the output is a 2D array that + has exactly `k` columns consisting of the top-k matrix profile. I : numpy.ndarray - The (top-k) matrix profile indices. When k = 1 (default), the first (and only) - column in this 2D array consists of the matrix profile indices. When k > 1, - the output has exactly `k` columns consisting of the top-k matrix profile + The (top-k) matrix profile indices. When k = 1 (default), this is a 1D array + consisting of the matrix profile indices. 
When k > 1, the output is a 2D + array that has exactly `k` columns consisting of the top-k matrix profile indices. Notes From 39469153a503c38be5d184b50e044549bbecfe87 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Wed, 31 Aug 2022 16:06:32 -0600 Subject: [PATCH 393/416] Put back the missing decorator --- stumpy/scrump.py | 1 + 1 file changed, 1 insertion(+) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index e82c454e5..921aae7a6 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -442,6 +442,7 @@ def _prescrump( return np.sqrt(P_squared[0]), I[0] +@core.non_normalized(scraamp.prescraamp) def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0, k=1): """ A convenience wrapper around the Numba JIT-compiled parallelized From eff9ca4aaf175455c8f3ba90f82c323ee9b4a4e3 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Wed, 31 Aug 2022 18:46:29 -0600 Subject: [PATCH 394/416] Add preprocessing function in prescraamp --- stumpy/scraamp.py | 127 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 110 insertions(+), 17 deletions(-) diff --git a/stumpy/scraamp.py b/stumpy/scraamp.py index 143dc233e..b9fdc72e7 100644 --- a/stumpy/scraamp.py +++ b/stumpy/scraamp.py @@ -14,6 +14,76 @@ logger = logging.getLogger(__name__) +def _preprocess_prescraamp(T_A, m, T_B=None, s=None): + """ + Performs several preprocessings and returns outputs that are needed for the + non-normalized preSCRIMP algorithm. + + Parameters + ---------- + T_A : numpy.ndarray + The time series or sequence for which to compute the matrix profile + + m : int + Window size + + T_B : numpy.ndarray, default None + The time series or sequence that will be used to annotate T_A. For every + subsequence in T_A, its nearest neighbor in T_B will be recorded. + + s : int, default None + The sampling interval that defaults to + `int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM))` + + Returns + ------- + T_A : numpy.ndarray + A copy of the time series input `T_A`, where all NaN and inf values + are replaced with zero. + + T_B : numpy.ndarray + A copy of the time series input `T_B`, where all NaN and inf values + are replaced with zero. If the input `T_B` is not provided (default), + this array is just a copy of `T_A`. 
+ + T_A_subseq_isfinite : numpy.ndarray + A boolean array that indicates whether a subsequence in `T_A` contains a + `np.nan`/`np.inf` value (False) + + T_B_subseq_isfinite : numpy.ndarray + A boolean array that indicates whether a subsequence in `T_B` contains a + `np.nan`/`np.inf` value (False) + + indices : numpy.ndarray + The subsequence indices to compute `prescrump` for + + s : int + The sampling interval that defaults to + `int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM))` + + excl_zone : int + The half width for the exclusion zone + """ + if T_B is None: + T_B = T_A + excl_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) + else: + excl_zone = None + + T_A, T_A_subseq_isfinite = core.preprocess_non_normalized(T_A, m) + T_B, T_B_subseq_isfinite = core.preprocess_non_normalized(T_B, m) + + n_A = T_A.shape[0] + l = n_A - m + 1 + + if s is None: # pragma: no cover + s = excl_zone + + indices = np.random.permutation(range(0, l, s)).astype(np.int64) + + return (T_A, T_B, T_A_subseq_isfinite, T_B_subseq_isfinite, indices, s, excl_zone) + + @njit(fastmath=True) def _compute_PI( T_A, @@ -318,22 +388,16 @@ def prescraamp(T_A, m, T_B=None, s=None, p=2.0, k=1): See Algorithm 2 """ - if T_B is None: - T_B = T_A - excl_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) - else: - excl_zone = None - - T_A, T_A_subseq_isfinite = core.preprocess_non_normalized(T_A, m) - T_B, T_B_subseq_isfinite = core.preprocess_non_normalized(T_B, m) - - n_A = T_A.shape[0] - l = n_A - m + 1 - - if s is None: # pragma: no cover - s = excl_zone + ( + T_A, + T_B, + T_A_subseq_isfinite, + T_B_subseq_isfinite, + indices, + s, + excl_zone, + ) = _preprocess_prescraamp(T_A, m, T_B=T_B, s=s) - indices = np.random.permutation(range(0, l, s)).astype(np.int64) P, I = _prescraamp( T_A, T_B, @@ -532,9 +596,38 @@ def __init__( if pre_scraamp: if self._ignore_trivial: - P, I = prescraamp(T_A, m, s=s, p=p) + ( + T_A, + T_B, + T_A_subseq_isfinite, + T_B_subseq_isfinite, + indices, + s, + excl_zone, + ) = _preprocess_prescraamp(T_A, m, s=s) else: - P, I = prescraamp(T_A, m, T_B=T_B, s=s, p=p) + ( + T_A, + T_B, + T_A_subseq_isfinite, + T_B_subseq_isfinite, + indices, + s, + excl_zone, + ) = _preprocess_prescraamp(T_A, m, T_B=T_B, s=s) + + P, I = _prescraamp( + T_A, + T_B, + m, + T_A_subseq_isfinite, + T_B_subseq_isfinite, + p, + indices, + s, + excl_zone, + ) + for i in range(P.shape[0]): if self._P[i, 0] > P[i]: self._P[i, 0] = P[i] From 666b93e0638a247c878f271ab347e839ee2e0cfb Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Wed, 31 Aug 2022 18:56:16 -0600 Subject: [PATCH 395/416] Revise naive function --- tests/naive.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 2e7b9d160..b2be9f86d 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1828,7 +1828,7 @@ def _total_diagonal_ndists(tile_lower_diag, tile_upper_diag, tile_height, tile_w def merge_topk_PI(PA, PB, IA, IB): if PA.ndim == 1: - mask = PB < PA + mask = (PB < PA) & (IB != IA) PA[mask] = PB[mask] IA[mask] = IB[mask] return @@ -1870,7 +1870,7 @@ def merge_topk_ρI(ρA, ρB, IA, IB): # ties: [0_B, 0_A, 0'_A, 1_B, 1'_B, 1_A], and we just need to keep the second # half of this array, and discard the first half. 
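(To make the merging strategy in the comment above concrete, here is a small reference formulation of the "sort the 2k candidates and keep the upper half" idea for one row; it is a sketch only, not the code used in this file, and the -inf masking of overlapping B entries is an assumption of the sketch.)

    import numpy as np

    def merge_topk_rho_row(rho_A_row, I_A_row, rho_B_row, I_B_row):
        # Overlapping B entries are pushed to -inf so they land in the
        # discarded lower half. B is placed first so that, on ties, B sorts
        # to the left and A is the entry retained in the upper half.
        k = rho_A_row.shape[0]
        rho_B_row = rho_B_row.copy()
        rho_B_row[np.isin(I_B_row, I_A_row)] = -np.inf
        rho = np.concatenate([rho_B_row, rho_A_row])
        I = np.concatenate([I_B_row, I_A_row])
        order = np.argsort(rho, kind="stable")[k:]
        return rho[order], I[order]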
if ρA.ndim == 1: - mask = ρB > ρA + mask = (ρB > ρA) & (IB != IA) ρA[mask] = ρB[mask] IA[mask] = IB[mask] return From eee6d75db6bbba55b90acc3803ab529983da7f60 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Wed, 31 Aug 2022 18:59:17 -0600 Subject: [PATCH 396/416] Fix value of imprecision in test functions --- tests/test_core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 1765c048d..52d00f706 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1108,7 +1108,7 @@ def test_merge_topk_PI_with_overlap(): for i in range(n): # create overlaps col_IDX = np.random.choice(np.arange(k), num_overlaps[i], replace=False) - imprecision = np.random.uniform(low=-1e6, high=1e6, size=len(col_IDX)) + imprecision = np.random.uniform(low=-1e-06, high=1e-06, size=len(col_IDX)) PB[i, col_IDX] = PA[i, col_IDX] + imprecision IB[i, col_IDX] = IA[i, col_IDX] @@ -1202,7 +1202,7 @@ def test_merge_topk_ρI_with_overlap(): for i in range(n): # create overlaps col_IDX = np.random.choice(np.arange(k), num_overlaps[i], replace=False) - imprecision = np.random.uniform(low=-1e6, high=1e6, size=len(col_IDX)) + imprecision = np.random.uniform(low=-1e-06, high=1e-06, size=len(col_IDX)) ρB[i, col_IDX] = ρA[i, col_IDX] + imprecision IB[i, col_IDX] = IA[i, col_IDX] From 27d229b9f325cb661d618b93e0146dc53044d8ed Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Wed, 31 Aug 2022 19:13:05 -0600 Subject: [PATCH 397/416] create overlaps randomly for test merge_topk in 1D case --- tests/test_core.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/test_core.py b/tests/test_core.py index 52d00f706..693920973 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1135,6 +1135,7 @@ def test_merge_topk_PI_with_overlap(): def test_merge_topk_PI_with_1D_input(): + # including some overlaps randomly n = 50 PA = np.random.rand(n) PB = np.random.rand(n) @@ -1142,6 +1143,12 @@ def test_merge_topk_PI_with_1D_input(): IA = np.arange(n) IB = IA + n + n_overlaps = np.random.randint(1, n + 1) + IDX_rows_with_overlaps = np.random.choice(np.arange(n), n_overlaps, replace=False) + imprecision = np.random.uniform(low=-1e-06, high=1e-06, size=n_overlaps) + PB[IDX_rows_with_overlaps] = PA[IDX_rows_with_overlaps] + imprecision + IB[IDX_rows_with_overlaps] = IA[IDX_rows_with_overlaps] + ref_P = PA.copy() ref_I = IA.copy() @@ -1229,6 +1236,7 @@ def test_merge_topk_ρI_with_overlap(): def test_merge_topk_ρI_with_1D_input(): + # including some overlaps randomly n = 50 ρA = np.random.rand(n) ρB = np.random.rand(n) @@ -1242,6 +1250,12 @@ def test_merge_topk_ρI_with_1D_input(): comp_ρ = ρA.copy() comp_I = IA.copy() + n_overlaps = np.random.randint(1, n + 1) + IDX_rows_with_overlaps = np.random.choice(np.arange(n), n_overlaps, replace=False) + imprecision = np.random.uniform(low=-1e-06, high=1e-06, size=n_overlaps) + ρB[IDX_rows_with_overlaps] = ρA[IDX_rows_with_overlaps] + imprecision + IB[IDX_rows_with_overlaps] = IA[IDX_rows_with_overlaps] + naive.merge_topk_PI(ref_ρ, ρB.copy(), ref_I, IB.copy()) core._merge_topk_PI(comp_ρ, ρB.copy(), comp_I, IB.copy()) From 03f19d8ef2bf4cceee7c770df036a47d27818cb1 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Sat, 3 Sep 2022 13:51:47 -0600 Subject: [PATCH 398/416] Revise docstrings --- stumpy/core.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 6e3046f9f..6cc90ed1e 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2579,8 +2579,13 @@ def 
_select_P_ABBA_value(P_ABBA, k, custom_func=None): def _merge_topk_PI(PA, PB, IA, IB): """ Merge two top-k matrix profiles `PA` and `PB`, and update `PA` (in place). - When the inputs are 1D arrays, PA[i] is updated if it is less than PB[i]. In - such case, PA[i] and IA[i] are replaced with PB[i] and IB[i], respectively. + When the inputs are 1D arrays, PA[i] is updated if it is less than PB[i] and + IA[i] != IB[i]. In such case, PA[i] and IA[i] are replaced with PB[i] and IB[i], + respectively. (Note that it might happen that IA[i]=IB[i] but PA[i] != PB[i]. + This situation can occur if there is slight imprecision in numerical calculations. + In that case, we do not update PA[i] and IA[i]. While updating PA[i] and IA[i] + is harmless in this case, we avoid doing that so to be consistent with the merging + process when the inputs are 2D arrays) When the inputs are 2D arrays, always prioritizing the values of `PA` over the values of `PB` in case of ties. (i.e., values from `PB` are always inserted to the right of values from `PA`). Also, update `IA` accordingly. In case of @@ -2595,7 +2600,7 @@ def _merge_topk_PI(PA, PB, IA, IB): ---------- PA : numpy.ndarray A (top-k) matrix profile where values in each row are sorted in ascending - order. `PA` must be 2-dimensional. + order. `PA` must be 1- or 2-dimensional. PB : numpy.ndarray A (top-k) matrix profile where values in each row are sorted in ascending @@ -2647,8 +2652,13 @@ def _merge_topk_PI(PA, PB, IA, IB): def _merge_topk_ρI(ρA, ρB, IA, IB): """ Merge two top-k pearson profiles `ρA` and `ρB`, and update `ρA` (in place). - When the inputs are 1D arrays, ρA[i] is updated if it is more than ρB[i]. In - such case, ρA[i] and IA[i] are replaced with ρB[i] and IB[i], respectively. + When the inputs are 1D arrays, ρA[i] is updated if it is more than ρB[i] and + IA[i] != IB[i]. In such case, ρA[i] and IA[i] are replaced with ρB[i] and IB[i], + respectively. (Note that it might happen that IA[i]=IB[i] but ρA[i] != ρB[i]. + This situation can occur if there is slight imprecision in numerical calculations. + In that case, we do not update ρA[i] and IA[i]. While updating ρA[i] and IA[i] + is harmless in this case, we avoid doing that so to be consistent with the merging + process when the inputs are 2D arrays) When the inputs are 2D arrays, always prioritizing the values of `ρA` over the values of `ρB` in case of ties. (i.e., values from `ρB` are always inserted to the left of values from `ρA`). Also, update `IA` accordingly. In case of @@ -2663,7 +2673,7 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): ---------- ρA : numpy.ndarray A (top-k) pearson profile where values in each row are sorted in ascending - order. `ρA` must be 2-dimensional. + order. `ρA` must be 1- or 2-dimensional. ρB : numpy.ndarray A (top-k) pearson profile, where values in each row are sorted in ascending From d35de3e439c972a4746d2df4895bb4e5c7e3d78c Mon Sep 17 00:00:00 2001 From: ninimama Date: Wed, 14 Sep 2022 10:05:36 -0600 Subject: [PATCH 399/416] Fix docstrings --- stumpy/core.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 6cc90ed1e..3c8a9895e 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2579,19 +2579,19 @@ def _select_P_ABBA_value(P_ABBA, k, custom_func=None): def _merge_topk_PI(PA, PB, IA, IB): """ Merge two top-k matrix profiles `PA` and `PB`, and update `PA` (in place). 
- When the inputs are 1D arrays, PA[i] is updated if it is less than PB[i] and + When the inputs are 1D arrays, PA[i] is updated if it is greater than PB[i] and IA[i] != IB[i]. In such case, PA[i] and IA[i] are replaced with PB[i] and IB[i], respectively. (Note that it might happen that IA[i]=IB[i] but PA[i] != PB[i]. This situation can occur if there is slight imprecision in numerical calculations. In that case, we do not update PA[i] and IA[i]. While updating PA[i] and IA[i] is harmless in this case, we avoid doing that so to be consistent with the merging process when the inputs are 2D arrays) - When the inputs are 2D arrays, always prioritizing the values of `PA` over the + When the inputs are 2D arrays, we always prioritize the values of `PA` over the values of `PB` in case of ties. (i.e., values from `PB` are always inserted to the right of values from `PA`). Also, update `IA` accordingly. In case of overlapping values between two arrays IA[i] and IB[i], the ones in IB[i] (and - their corresponding values in PB[i]) are ignored throughout the updating process o - f IA[i] (and PA[i]). + their corresponding values in PB[i]) are ignored throughout the updating process + of IA[i] (and PA[i]). Unlike `_merge_topk_ρI`, where `top-k` largest values are kept, this function keeps `top-k` smallest values. @@ -2652,14 +2652,14 @@ def _merge_topk_PI(PA, PB, IA, IB): def _merge_topk_ρI(ρA, ρB, IA, IB): """ Merge two top-k pearson profiles `ρA` and `ρB`, and update `ρA` (in place). - When the inputs are 1D arrays, ρA[i] is updated if it is more than ρB[i] and + When the inputs are 1D arrays, ρA[i] is updated if it is less than ρB[i] and IA[i] != IB[i]. In such case, ρA[i] and IA[i] are replaced with ρB[i] and IB[i], respectively. (Note that it might happen that IA[i]=IB[i] but ρA[i] != ρB[i]. This situation can occur if there is slight imprecision in numerical calculations. In that case, we do not update ρA[i] and IA[i]. While updating ρA[i] and IA[i] is harmless in this case, we avoid doing that so to be consistent with the merging process when the inputs are 2D arrays) - When the inputs are 2D arrays, always prioritizing the values of `ρA` over + When the inputs are 2D arrays, we always prioritize the values of `ρA` over the values of `ρB` in case of ties. (i.e., values from `ρB` are always inserted to the left of values from `ρA`). Also, update `IA` accordingly. In case of overlapping values between two arrays IA[i] and IB[i], the ones in IB[i] (and From 0c80852759d05965a25989810ae8b9e032955af1 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Wed, 12 Oct 2022 23:52:44 -0600 Subject: [PATCH 400/416] minor changes --- tests/naive.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index b2be9f86d..9e21d7f1a 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1828,9 +1828,10 @@ def _total_diagonal_ndists(tile_lower_diag, tile_upper_diag, tile_height, tile_w def merge_topk_PI(PA, PB, IA, IB): if PA.ndim == 1: - mask = (PB < PA) & (IB != IA) - PA[mask] = PB[mask] - IA[mask] = IB[mask] + for i in range(PA.shape[0]): + if PB[i] < PA[i] and IB[i] != IA[i]: + PA[i] = PB[i] + IA[i] = IB[i] return k = PA.shape[1] @@ -1870,9 +1871,10 @@ def merge_topk_ρI(ρA, ρB, IA, IB): # ties: [0_B, 0_A, 0'_A, 1_B, 1'_B, 1_A], and we just need to keep the second # half of this array, and discard the first half. 
if ρA.ndim == 1: - mask = (ρB > ρA) & (IB != IA) - ρA[mask] = ρB[mask] - IA[mask] = IB[mask] + for i in range(ρA.shape[0]): + if ρB[i] > ρA[i] and IB[i] != IA[i]: + ρA[i] = ρB[i] + IA[i] = IB[i] return k = ρA.shape[1] From 2e3af6a51135c5c11e7087d3307c79d8428eeed3 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Thu, 13 Oct 2022 00:08:02 -0600 Subject: [PATCH 401/416] minor fix --- tests/naive.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 9e21d7f1a..05f101222 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -720,12 +720,12 @@ def __init__(self, T, m, excl_zone=None, p=2.0): self._m = m self._p = p - if excl_zone is None: # apply similar changes in naive `class stumpi_egress` - excl_zone = int(np.ceil(self._m / config.STUMPY_EXCL_ZONE_DENOM)) self._excl_zone = excl_zone + if self._excl_zone is None: + self._excl_zone = int(np.ceil(self._m / config.STUMPY_EXCL_ZONE_DENOM)) self._l = self._T.shape[0] - m + 1 - mp = aamp(T, m, p=p) + mp = aamp(T, m, exclusion_zone=self._excl_zone, p=p) self.P_ = mp[:, 0] self.I_ = mp[:, 1].astype(np.int64) self.left_P_ = np.full(self.P_.shape, np.inf) From a6460340749dc3cc990ddec19b1b4e9b9bb28d5e Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Fri, 14 Oct 2022 23:09:06 -0600 Subject: [PATCH 402/416] change variable name --- tests/naive.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 05f101222..96dacba82 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -807,10 +807,12 @@ def __init__(self, T, m, excl_zone=None, k=1): self.left_I_ = mp[:, 2 * k].astype(np.int64) self.left_P_ = np.full_like(self.left_I_, np.inf, dtype=np.float64) - for i, nn_i in enumerate(self.left_I_): - if nn_i >= 0: - D = core.mass(self._T[i : i + self._m], self._T[nn_i : nn_i + self._m]) - self.left_P_[i] = D[0] + for idx, nn_idx in enumerate(self.left_I_): + if nn_idx >= 0: + D = core.mass( + self._T[idx : idx + self._m], self._T[nn_idx : nn_idx + self._m] + ) + self.left_P_[idx] = D[0] self._n_appended = 0 From d6a0a3d00470767cd63dbb033360ef6975f3e432 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Fri, 14 Oct 2022 23:36:41 -0600 Subject: [PATCH 403/416] change variables names --- tests/test_gpu_stump.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index 3f86ab03f..a6d4e6953 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -51,14 +51,14 @@ def test_gpu_stump_int_input(): @cuda.jit("(f8[:, :], f8[:], i8[:], i8, b1, i8[:])") -def _gpu_searchsorted_kernel(A, V, bfs, nlevel, is_left, IDX): +def _gpu_searchsorted_kernel(a, v, bfs, nlevel, is_left, idx): # A wrapper kernel for calling device function _gpu_searchsorted_left/right. 
i = cuda.grid(1) - if i < A.shape[0]: + if i < a.shape[0]: if is_left: - IDX[i] = _gpu_searchsorted_left(A[i], V[i], bfs, nlevel) + idx[i] = _gpu_searchsorted_left(a[i], v[i], bfs, nlevel) else: - IDX[i] = _gpu_searchsorted_right(A[i], V[i], bfs, nlevel) + idx[i] = _gpu_searchsorted_right(a[i], v[i], bfs, nlevel) @pytest.mark.filterwarnings("ignore", category=NumbaPerformanceWarning) From 6ae95ec5d64955640d7c15f62afef023b66c80a6 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Fri, 14 Oct 2022 23:53:54 -0600 Subject: [PATCH 404/416] convert attr to property attr to get 1D when k is 1 --- tests/naive.py | 71 ++++++++++++++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 31 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 96dacba82..8774f4b46 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -801,30 +801,22 @@ def __init__(self, T, m, excl_zone=None, k=1): self._l = self._T.shape[0] - m + 1 mp = stump(T, m, exclusion_zone=self._excl_zone, k=self._k) - self.P_ = mp[:, :k].astype(np.float64) - self.I_ = mp[:, k : 2 * k].astype(np.int64) + self._P = mp[:, :k].astype(np.float64) + self._I = mp[:, k : 2 * k].astype(np.int64) - self.left_I_ = mp[:, 2 * k].astype(np.int64) - self.left_P_ = np.full_like(self.left_I_, np.inf, dtype=np.float64) + self._left_I = mp[:, 2 * k].astype(np.int64) + self._left_P = np.full_like(self._left_I, np.inf, dtype=np.float64) - for idx, nn_idx in enumerate(self.left_I_): + for idx, nn_idx in enumerate(self._left_I): if nn_idx >= 0: D = core.mass( self._T[idx : idx + self._m], self._T[nn_idx : nn_idx + self._m] ) - self.left_P_[idx] = D[0] + self._left_P[idx] = D[0] self._n_appended = 0 - if self._k == 1: - self.P_ = self.P_.flatten() - self.I_ = self.I_.flatten() - def update(self, t): - # ensure than self.P_ and self.I_ are 2D - self.P_ = self.P_.reshape(-1, self._k) - self.I_ = self.I_.reshape(-1, self._k) - self._T[:] = np.roll(self._T, -1) self._T_isfinite[:] = np.roll(self._T_isfinite, -1) if np.isfinite(t): @@ -835,10 +827,10 @@ def update(self, t): self._T[-1] = 0 self._n_appended += 1 - self.P_ = np.roll(self.P_, -1, axis=0) - self.I_ = np.roll(self.I_, -1, axis=0) - self.left_P_[:] = np.roll(self.left_P_, -1) - self.left_I_[:] = np.roll(self.left_I_, -1) + self._P = np.roll(self._P, -1, axis=0) + self._I = np.roll(self._I, -1, axis=0) + self._left_P[:] = np.roll(self._left_P, -1) + self._left_I[:] = np.roll(self._left_I, -1) D = core.mass(self._T[-self._m :], self._T) T_subseq_isfinite = np.all( @@ -851,28 +843,45 @@ def update(self, t): apply_exclusion_zone(D, D.shape[0] - 1, self._excl_zone, np.inf) # update top-k matrix profile using newly calculated distance profile `D` for j in range(D.shape[0]): - if D[j] < self.P_[j, -1]: - pos = np.searchsorted(self.P_[j], D[j], side="right") - self.P_[j] = np.insert(self.P_[j], pos, D[j])[:-1] - self.I_[j] = np.insert( - self.I_[j], pos, D.shape[0] - 1 + self._n_appended + if D[j] < self._P[j, -1]: + pos = np.searchsorted(self._P[j], D[j], side="right") + self._P[j] = np.insert(self._P[j], pos, D[j])[:-1] + self._I[j] = np.insert( + self._I[j], pos, D.shape[0] - 1 + self._n_appended )[:-1] # update top-k for the last, newly-updated index I_last_topk = np.argsort(D, kind="mergesort")[: self._k] - self.P_[-1] = D[I_last_topk] - self.I_[-1] = I_last_topk + self._n_appended - self.I_[-1][self.P_[-1] == np.inf] = -1 + self._P[-1] = D[I_last_topk] + self._I[-1] = I_last_topk + self._n_appended + self._I[-1][self._P[-1] == np.inf] = -1 # for the last index, the left matrix profile value 
is self.P_[-1, 0] # and the same goes for the left matrix profile index - self.left_P_[-1] = self.P_[-1, 0] - self.left_I_[-1] = self.I_[-1, 0] + self._left_P[-1] = self._P[-1, 0] + self._left_I[-1] = self._I[-1, 0] - # post-processing: ensure that self.P_ and self.I_ are 1D. + @property + def P_(self): if self._k == 1: - self.P_ = self.P_.flatten() - self.I_ = self.I_.flatten() + return self._P.flatten().astype(np.float64) + else: + return self._P.astype(np.float64) + + @property + def I_(self): + if self._k == 1: + return self._I.flatten().astype(np.int64) + else: + return self._I.astype(np.int64) + + @property + def left_P_(self): + return self._left_P.astype(np.float64) + + @property + def left_I_(self): + return self._left_I.astype(np.int64) def across_series_nearest_neighbors(Ts, Ts_idx, subseq_idx, m): From 73ebe404def40c4566edf29f03edf58e31c61590 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Sat, 15 Oct 2022 00:03:51 -0600 Subject: [PATCH 405/416] avoid calling performant function in a naive function --- tests/naive.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index 8774f4b46..aa7d27315 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -809,8 +809,8 @@ def __init__(self, T, m, excl_zone=None, k=1): for idx, nn_idx in enumerate(self._left_I): if nn_idx >= 0: - D = core.mass( - self._T[idx : idx + self._m], self._T[nn_idx : nn_idx + self._m] + D = distance_profile( + self._T[idx : idx + self._m], self._T[nn_idx : nn_idx + self._m], m ) self._left_P[idx] = D[0] From 4719e2f9c366ba62e5033ac8198cd0abdc7df2aa Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Sat, 15 Oct 2022 00:14:37 -0600 Subject: [PATCH 406/416] minor modification on z_norm functions --- stumpy/core.py | 8 ++++++-- tests/naive.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index fa8fc4737..1a02316fc 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -254,7 +254,7 @@ def rolling_window(a, window): return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) -def z_norm(a, axis=0): +def z_norm(a, axis=0, threshold=config.STUMPY_STDDEV_THRESHOLD): """ Calculate the z-normalized input array `a` by subtracting the mean and dividing by the standard deviation along a given axis. @@ -267,13 +267,17 @@ def z_norm(a, axis=0): axis : int, default 0 NumPy array axis + threshold : float, default to config.STUMPY_STDDEV_THRESHOLD + A non-nan std value being less than `threshold` will be replaced with 1.0 + Returns ------- output : numpy.ndarray An array with z-normalized values computed along a specified axis. 
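As an aside, the thresholded z-normalization documented above can be sketched in a few standalone lines; the helper name `z_norm_sketch` and the 1e-7 stand-in for `config.STUMPY_STDDEV_THRESHOLD` are illustrative assumptions, not part of any patch:

    import numpy as np

    STDDEV_THRESHOLD = 1e-7  # stand-in for config.STUMPY_STDDEV_THRESHOLD

    def z_norm_sketch(a, axis=0, threshold=STDDEV_THRESHOLD):
        # A (non-NaN) standard deviation below `threshold` is replaced with 1.0 so
        # that a nearly constant input z-normalizes to ~0 instead of dividing by a
        # vanishingly small number.
        std = np.std(a, axis=axis, keepdims=True)
        std[np.less(std, threshold, where=~np.isnan(std))] = 1.0
        return (a - np.mean(a, axis=axis, keepdims=True)) / std

    print(z_norm_sketch(np.array([5.0, 5.0, 5.0])))  # [0. 0. 0.]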
""" std = np.std(a, axis, keepdims=True) - std[std == 0] = 1 + mask = ~np.isnan(std) & std < config.STUMPY_STDDEV_THRESHOLD + std[mask] = 1.0 return (a - np.mean(a, axis, keepdims=True)) / std diff --git a/tests/naive.py b/tests/naive.py index aa7d27315..c02f79e25 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -6,7 +6,7 @@ from stumpy import core, config -def z_norm(a, axis=0, threshold=1e-7): +def z_norm(a, axis=0, threshold=config.STUMPY_STDDEV_THRESHOLD): std = np.std(a, axis, keepdims=True) std[np.less(std, threshold, where=~np.isnan(std))] = 1.0 From 63b28289d964ef9496ba4783d5dee79bf918bb04 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Sat, 15 Oct 2022 00:27:05 -0600 Subject: [PATCH 407/416] fix function --- stumpy/core.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index 0416cb510..f29d45500 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -276,8 +276,7 @@ def z_norm(a, axis=0, threshold=config.STUMPY_STDDEV_THRESHOLD): An array with z-normalized values computed along a specified axis. """ std = np.std(a, axis, keepdims=True) - mask = ~np.isnan(std) & std < config.STUMPY_STDDEV_THRESHOLD - std[mask] = 1.0 + std[np.less(std, threshold, where=~np.isnan(std))] = 1.0 return (a - np.mean(a, axis, keepdims=True)) / std From d1f3119ce5e5cf1012c0fdb6a66e57106bb14b4e Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Mon, 17 Oct 2022 21:33:11 -0600 Subject: [PATCH 408/416] revise docstrings --- stumpy/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stumpy/core.py b/stumpy/core.py index f29d45500..ef99a0746 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -2582,7 +2582,7 @@ def _merge_topk_PI(PA, PB, IA, IB): Merge two top-k matrix profiles `PA` and `PB`, and update `PA` (in place). When the inputs are 1D arrays, PA[i] is updated if it is greater than PB[i] and IA[i] != IB[i]. In such case, PA[i] and IA[i] are replaced with PB[i] and IB[i], - respectively. (Note that it might happen that IA[i]=IB[i] but PA[i] != PB[i]. + respectively. (Note that it might happen that IA[i]==IB[i] but PA[i] != PB[i]. This situation can occur if there is slight imprecision in numerical calculations. In that case, we do not update PA[i] and IA[i]. While updating PA[i] and IA[i] is harmless in this case, we avoid doing that so to be consistent with the merging @@ -2655,7 +2655,7 @@ def _merge_topk_ρI(ρA, ρB, IA, IB): Merge two top-k pearson profiles `ρA` and `ρB`, and update `ρA` (in place). When the inputs are 1D arrays, ρA[i] is updated if it is less than ρB[i] and IA[i] != IB[i]. In such case, ρA[i] and IA[i] are replaced with ρB[i] and IB[i], - respectively. (Note that it might happen that IA[i]=IB[i] but ρA[i] != ρB[i]. + respectively. (Note that it might happen that IA[i]==IB[i] but ρA[i] != ρB[i]. This situation can occur if there is slight imprecision in numerical calculations. In that case, we do not update ρA[i] and IA[i]. 
While updating ρA[i] and IA[i] is harmless in this case, we avoid doing that so to be consistent with the merging From 4a94c0ed0ea05a0daac7877d41ab1b3a51fc7e02 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Mon, 17 Oct 2022 21:46:01 -0600 Subject: [PATCH 409/416] change variable name --- stumpy/scrump.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/stumpy/scrump.py b/stumpy/scrump.py index 52b9a99ab..0482ecbec 100644 --- a/stumpy/scrump.py +++ b/stumpy/scrump.py @@ -191,26 +191,26 @@ def _compute_PI( if excl_zone is not None: core._apply_exclusion_zone(squared_distance_profile, i, excl_zone, np.inf) - nn_idx = np.argmin(squared_distance_profile) + nn_i = np.argmin(squared_distance_profile) if ( - squared_distance_profile[nn_idx] < P_squared[thread_idx, i, -1] - and nn_idx not in I[thread_idx, i] + squared_distance_profile[nn_i] < P_squared[thread_idx, i, -1] + and nn_i not in I[thread_idx, i] ): idx = np.searchsorted( P_squared[thread_idx, i], - squared_distance_profile[nn_idx], + squared_distance_profile[nn_i], side="right", ) core._shift_insert_at_index( - P_squared[thread_idx, i], idx, squared_distance_profile[nn_idx] + P_squared[thread_idx, i], idx, squared_distance_profile[nn_i] ) - core._shift_insert_at_index(I[thread_idx, i], idx, nn_idx) + core._shift_insert_at_index(I[thread_idx, i], idx, nn_i) if P_squared[thread_idx, i, 0] == np.inf: # pragma: no cover I[thread_idx, i, 0] = -1 continue - j = nn_idx + j = nn_i # Given the squared distance, work backwards and compute QT QT_j = (m - P_squared[thread_idx, i, 0] / 2.0) * (Σ_T[j] * σ_Q[i]) + ( m * M_T[j] * μ_Q[i] From 34361f7935023cad5ba7c4c9cd71d5841405f3a8 Mon Sep 17 00:00:00 2001 From: SolidAhmad Date: Mon, 17 Oct 2022 21:51:01 -0600 Subject: [PATCH 410/416] Relocate comment --- tests/naive.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/naive.py b/tests/naive.py index c02f79e25..e123f37ff 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -1463,11 +1463,11 @@ def prescrump(T_A, m, T_B, s, exclusion_zone=None, k=1): j = nn_idx for g in range(1, min(s, l - i, w - j)): d = dist_matrix[i + g, j + g] + # Do NOT optimize the `condition` in the following if statement + # and similar ones in this naive function. This is to ensure + # we are avoiding duplicates in each row of I. if d < P[i + g, -1] and (j + g) not in I[i + g]: pos = np.searchsorted(P[i + g], d, side="right") - # Do NOT optimize the `condition` in the following if statement - # and similar ones in this naive function. This is to ensure - # we are avoiding duplicates in each row of I. 
                P[i + g] = np.insert(P[i + g], pos, d)[:-1]
                I[i + g] = np.insert(I[i + g], pos, j + g)[:-1]
            if (

From 8d0258a71d56dcd1689109da11db010326b6db78 Mon Sep 17 00:00:00 2001
From: SolidAhmad
Date: Mon, 17 Oct 2022 22:08:31 -0600
Subject: [PATCH 411/416] minor changes

---
 tests/naive.py | 66 ++++++++++++++++++++++++++++----------------------
 1 file changed, 37 insertions(+), 29 deletions(-)

diff --git a/tests/naive.py b/tests/naive.py
index e123f37ff..a58069bba 100644
--- a/tests/naive.py
+++ b/tests/naive.py
@@ -1845,25 +1845,30 @@ def merge_topk_PI(PA, PB, IA, IB):
                 IA[i] = IB[i]
         return

-    k = PA.shape[1]
-    for i in range(PA.shape[0]):
-        _, _, overlap_idx_B = np.intersect1d(IA[i], IB[i], return_indices=True)
-        PB[i, overlap_idx_B] = np.inf
-        IB[i, overlap_idx_B] = -1
+    else:
+        k = PA.shape[1]
+        for i in range(PA.shape[0]):
+            _, _, overlap_idx_B = np.intersect1d(IA[i], IB[i], return_indices=True)
+            PB[i, overlap_idx_B] = np.inf
+            IB[i, overlap_idx_B] = -1

-    profile = np.column_stack((PA, PB))
-    indices = np.column_stack((IA, IB))
-    IDX = np.argsort(profile, axis=1, kind="mergesort")
-    profile[:, :] = np.take_along_axis(profile, IDX, axis=1)
-    indices[:, :] = np.take_along_axis(indices, IDX, axis=1)
+        profile = np.column_stack((PA, PB))
+        indices = np.column_stack((IA, IB))
+        IDX = np.argsort(profile, axis=1, kind="mergesort")
+        profile[:, :] = np.take_along_axis(profile, IDX, axis=1)
+        indices[:, :] = np.take_along_axis(indices, IDX, axis=1)

-    PA[:, :] = profile[:, :k]
-    IA[:, :] = indices[:, :k]
+        PA[:, :] = profile[:, :k]
+        IA[:, :] = indices[:, :k]
+
+    return


 def merge_topk_ρI(ρA, ρB, IA, IB):
-    # this is to merge two pearson profiles `ρA` and `ρB`, where each is a 2D array
-    # and each row is sorted ascendingly. we want to keep top-k largest values in
+    # This function merges two pearson profiles `ρA` and `ρB`, and updates `ρA`
+    # and `IA` accordingly. When the inputs are 1D, `ρA[i]` is updated if
+    # `ρA[i] < ρB[i]` and IA[i] != IB[i]. When the inputs are 2D, each row in
+    # `ρA` and `ρB` is sorted ascendingly. We want to keep top-k largest values in
     # merging row `ρA[i]` and `ρB[i]`.

     # In case of ties between `ρA` and `ρB`, the priority is with `ρA`. In case
@@ -1879,8 +1884,8 @@ def merge_topk_ρI(ρA, ρB, IA, IB):
     # For the same example:
     # merging `ρB` and `ρA` ascendingly while choosing `ρB` over `ρA` in case of
-    # ties: [0_B, 0_A, 0'_A, 1_B, 1'_B, 1_A], and we just need to keep the second
-    # half of this array, and discard the first half.
+    # ties: [0_B, 0_A, 0'_A, 1_B, 1'_B, 1_A], and the second half of this array
+    # is the desirable outcome.
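As an aside, the tie-breaking behavior described in the comments above can be seen in a tiny standalone example; the row values, `k`, and indices below are made up for illustration and overlapping indices are assumed to have been masked out already:

    import numpy as np

    # One row of hypothetical top-2 (k = 2) pearson profiles.
    ρA = np.array([[0.8, 0.9]]); IA = np.array([[10, 12]])
    ρB = np.array([[0.9, 0.95]]); IB = np.array([[11, 13]])

    k = ρA.shape[1]
    profile = np.column_stack((ρB, ρA))   # ρB is stacked first, then ρA
    indices = np.column_stack((IB, IA))
    idx = np.argsort(profile, axis=1, kind="mergesort")  # stable sort
    ρA[:, :] = np.take_along_axis(profile, idx, axis=1)[:, k:]   # keep largest k
    IA[:, :] = np.take_along_axis(indices, idx, axis=1)[:, k:]

    # ρA -> [[0.9, 0.95]], IA -> [[12, 13]]: for the tie at 0.9, the stable sort
    # keeps ρA's entry (index 12) in the second half and discards ρB's equal
    # value (index 11), which is exactly the priority rule described above.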
     if ρA.ndim == 1:
         for i in range(ρA.shape[0]):
             if ρB[i] > ρA[i] and IB[i] != IA[i]:
                 ρA[i] = ρB[i]
                 IA[i] = IB[i]
         return

-    k = ρA.shape[1]
-    for i in range(ρA.shape[0]):
-        _, _, overlap_idx_B = np.intersect1d(IA[i], IB[i], return_indices=True)
-        ρB[i, overlap_idx_B] = np.NINF
-        IB[i, overlap_idx_B] = -1
+    else:
+        k = ρA.shape[1]
+        for i in range(ρA.shape[0]):
+            _, _, overlap_idx_B = np.intersect1d(IA[i], IB[i], return_indices=True)
+            ρB[i, overlap_idx_B] = np.NINF
+            IB[i, overlap_idx_B] = -1

-    profile = np.column_stack((ρB, ρA))
-    indices = np.column_stack((IB, IA))
+        profile = np.column_stack((ρB, ρA))
+        indices = np.column_stack((IB, IA))

-    idx = np.argsort(profile, axis=1, kind="mergesort")
-    profile[:, :] = np.take_along_axis(profile, idx, axis=1)
-    indices[:, :] = np.take_along_axis(indices, idx, axis=1)
+        idx = np.argsort(profile, axis=1, kind="mergesort")
+        profile[:, :] = np.take_along_axis(profile, idx, axis=1)
+        indices[:, :] = np.take_along_axis(indices, idx, axis=1)

-    # keep the last k elements (top-k largest values)
-    ρA[:, :] = profile[:, k:]
-    IA[:, :] = indices[:, k:]
+        # keep the last k elements (top-k largest values)
+        ρA[:, :] = profile[:, k:]
+        IA[:, :] = indices[:, k:]
+
+    return


 def find_matches(D, excl_zone, max_distance, max_matches=None):

From abb45181c5f2e95acf50016339c8da425e6449c1 Mon Sep 17 00:00:00 2001
From: SolidAhmad
Date: Tue, 8 Nov 2022 01:18:00 -0700
Subject: [PATCH 412/416] fix uint

---
 stumpy/stump.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stumpy/stump.py b/stumpy/stump.py
index aeac32f3f..49657370a 100644
--- a/stumpy/stump.py
+++ b/stumpy/stump.py
@@ -222,7 +222,7 @@ def _compute_diagonal(
             # when the newly-calculated `pearson` value becomes greater than the
             # first (i.e. smallest) element in this array. Note that a higher
             # pearson value corresponds to a lower distance.
-            if pearson > ρ[thread_idx, i, 0]:
+            if pearson > ρ[thread_idx, uint64_i, 0]:
                 idx = np.searchsorted(ρ[thread_idx, uint64_i], pearson)
                 core._shift_insert_at_index(
                     ρ[thread_idx, uint64_i], idx, pearson, shift="left"
                 )

From 329889eb9172eafa3f3f7b6c25090f0ccbea Mon Sep 17 00:00:00 2001
From: ninimama
Date: Tue, 8 Nov 2022 06:59:38 -0700
Subject: [PATCH 413/416] fixed uint

---
 stumpy/stump.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stumpy/stump.py b/stumpy/stump.py
index aeac32f3f..49657370a 100644
--- a/stumpy/stump.py
+++ b/stumpy/stump.py
@@ -222,7 +222,7 @@ def _compute_diagonal(
             # when the newly-calculated `pearson` value becomes greater than the
             # first (i.e. smallest) element in this array. Note that a higher
             # pearson value corresponds to a lower distance.
-            if pearson > ρ[thread_idx, i, 0]:
+            if pearson > ρ[thread_idx, uint64_i, 0]:
                 idx = np.searchsorted(ρ[thread_idx, uint64_i], pearson)
                 core._shift_insert_at_index(
                     ρ[thread_idx, uint64_i], idx, pearson, shift="left"

From c0e9f74cb6c05aac8cadc54b05c52fa8d3374c48 Mon Sep 17 00:00:00 2001
From: SolidAhmad
Date: Tue, 8 Nov 2022 21:34:20 -0700
Subject: [PATCH 414/416] fixed test function

---
 tests/test_core.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_core.py b/tests/test_core.py
index 693920973..9130c11b3 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -1256,8 +1256,8 @@ def test_merge_topk_ρI_with_1D_input():
     ρB[IDX_rows_with_overlaps] = ρA[IDX_rows_with_overlaps] + imprecision
     IB[IDX_rows_with_overlaps] = IA[IDX_rows_with_overlaps]

-    naive.merge_topk_PI(ref_ρ, ρB.copy(), ref_I, IB.copy())
-    core._merge_topk_PI(comp_ρ, ρB.copy(), comp_I, IB.copy())
+    naive._merge_topk_ρI(ref_ρ, ρB.copy(), ref_I, IB.copy())
+    core._merge_topk_ρI(comp_ρ, ρB.copy(), comp_I, IB.copy())

     npt.assert_almost_equal(ref_ρ, comp_ρ)
     npt.assert_almost_equal(ref_I, comp_I)

From 27c05c35cd3a798a64f72de5caecf3c0982fcc9d Mon Sep 17 00:00:00 2001
From: SolidAhmad
Date: Tue, 8 Nov 2022 21:50:52 -0700
Subject: [PATCH 415/416] fixed calling function

---
 tests/test_core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_core.py b/tests/test_core.py
index 9130c11b3..16d2d0fd2 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -1256,7 +1256,7 @@ def test_merge_topk_ρI_with_1D_input():
     ρB[IDX_rows_with_overlaps] = ρA[IDX_rows_with_overlaps] + imprecision
     IB[IDX_rows_with_overlaps] = IA[IDX_rows_with_overlaps]

-    naive._merge_topk_ρI(ref_ρ, ρB.copy(), ref_I, IB.copy())
+    naive.merge_topk_ρI(ref_ρ, ρB.copy(), ref_I, IB.copy())
     core._merge_topk_ρI(comp_ρ, ρB.copy(), comp_I, IB.copy())

     npt.assert_almost_equal(ref_ρ, comp_ρ)

From c45b8a4b6fee5a64304b2bf602cfa41e60b4e6e4 Mon Sep 17 00:00:00 2001
From: SolidAhmad
Date: Tue, 8 Nov 2022 22:25:58 -0700
Subject: [PATCH 416/416] Removed redundant return statement

---
 stumpy/scrump.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/stumpy/scrump.py b/stumpy/scrump.py
index 0482ecbec..f874966e2 100644
--- a/stumpy/scrump.py
+++ b/stumpy/scrump.py
@@ -525,8 +525,6 @@ def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0, k=1):
     else:
         return P, I

-    return P, I

 @core.non_normalized(
     scraamp.scraamp,
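As a closing aside, several of the patches above maintain a sorted top-k row by inserting each new value at its sorted position and dropping the current worst entry. A minimal standalone sketch of that `searchsorted` plus insert-and-truncate pattern (the values below are made up; this is not part of any patch):

    import numpy as np

    P_row = np.array([1.0, 2.5, np.inf])   # ascending top-k distances (k = 3)
    I_row = np.array([4, 9, -1])           # matching nearest-neighbor indices

    d, j = 2.0, 17                         # newly computed distance and its index
    if d < P_row[-1]:
        # Insert at the sorted position, then drop the worst (last) entry so the
        # row stays sorted in ascending order with exactly k elements.
        pos = np.searchsorted(P_row, d, side="right")
        P_row[:] = np.insert(P_row, pos, d)[:-1]
        I_row[:] = np.insert(I_row, pos, j)[:-1]

    # P_row -> [1.0, 2.0, 2.5], I_row -> [4, 17, 9]

For the pearson variant the same idea applies with the array kept in ascending order of ρ and the smallest (first) element being the one displaced, which is why a higher pearson value corresponds to a lower distance in the comments above.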