[MNT] Purging similarity search from distance as argument numba funct…

…ions (#2176) * WIP purging similarity search from naive functions * Update Docs and testing configs * Removes channel independence, add img, fix and add functions and tests * Update notebook to avoid timeout and remove test from removed func * Update normalize to normalise and tests * Update normalize to normalise and tests * Adjust test to not execute those caused by a known issue and uncomment others * remove float_ and typo * Add similarity search test structure * Add test for data generators similarity search * Add similarity search test structure * Fix tests * Fix tests * test adding back expected results rdst * Add back exclusion for RDSTRegressor * Empty commit for CI * Fix buggy tests * Empty commit for CI --------- Co-authored-by: MatthewMiddlehurst <[email protected]> Co-authored-by: Matthew Middlehurst <[email protected]>
aeon-toolkit · Nov 9, 2024 · 01495e7 · 01495e7
1 parent d59bd25
commit 01495e7
Show file tree

Hide file tree

Showing 41 changed files with 1,553 additions and 2,182 deletions.
diff --git a/aeon/classification/shapelet_based/_rdst.py b/aeon/classification/shapelet_based/_rdst.py
@@ -132,7 +132,6 @@ class RDSTClassifier(BaseClassifier):
         "capability:unequal_length": True,
         "capability:multithreading": True,
         "X_inner_type": ["np-list", "numpy3D"],
-        "non_deterministic": True,  # due to random_state bug in MacOS #324
         "algorithm_type": "shapelet",
     }
 

diff --git a/aeon/regression/shapelet_based/_rdst.py b/aeon/regression/shapelet_based/_rdst.py
@@ -113,7 +113,6 @@ class RDSTRegressor(BaseRegressor):
         "capability:unequal_length": True,
         "capability:multithreading": True,
         "X_inner_type": ["np-list", "numpy3D"],
-        "non_deterministic": True,  # due to random_state bug in MacOS #324
         "algorithm_type": "shapelet",
     }
 

diff --git a/aeon/similarity_search/_commons.py b/aeon/similarity_search/_commons.py
@@ -6,8 +6,152 @@
 
 import numpy as np
 from numba import njit, prange
+from numba.typed import List
 from scipy.signal import convolve
 
+from aeon.utils.numba.general import (
+    get_all_subsequences,
+    normalise_subsequences,
+    sliding_mean_std_one_series,
+    z_normalise_series_2d,
+)
+
+
+@njit(cache=True, fastmath=True)
+def _compute_dist_profile(X_subs, q):
+    """
+    Compute the distance profile between subsequences and a query.
+
+    Parameters
+    ----------
+    X_subs : array, shape=(n_samples, n_channels, query_length)
+        Input subsequences extracted from a time series.
+    q : array, shape=(n_channels, query_length)
+        Query used for the distance computation
+
+    Returns
+    -------
+    dist_profile : np.ndarray, 1D array of shape (n_samples)
+        The distance between the query and all subsequences.
+
+    """
+    n_candidates, n_channels, q_length = X_subs.shape
+    dist_profile = np.zeros(n_candidates)
+    for i in range(n_candidates):
+        for j in range(n_channels):
+            for k in range(q_length):
+                dist_profile[i] += (X_subs[i, j, k] - q[j, k]) ** 2
+    return dist_profile
+
+
+@njit(cache=True, fastmath=True)
+def naive_squared_distance_profile(
+    X,
+    q,
+    mask,
+    normalise=False,
+    X_means=None,
+    X_stds=None,
+):
+    """
+    Compute a squared euclidean distance profile.
+
+    Parameters
+    ----------
+    X : array, shape=(n_samples, n_channels, n_timepoints)
+        Input time series dataset to search in.
+    q : array, shape=(n_channels, query_length)
+        Query used during the search.
+    mask : array, shape=(n_samples,  n_timepoints - query_length + 1)
+        Boolean mask indicating candidates for which the distance
+        profiles computed for each query should be set to infinity.
+    normalise : bool
+        Whether to use a z-normalised distance.
+    X_means : array, shape=(n_samples, n_channels, n_timepoints - query_length + 1)
+        Mean of each candidate (subsequence) of length query_length in X. The
+        default is None, meaning that these values will be computed if normalise
+        is True. If provided, the computations will be skipped.
+    X_stds : array, shape=(n_samples, n_channels, n_timepoints - query_length + 1)
+        Standard deviation of each candidate (subsequence) of length query_length
+        in X. The default is None, meaning that these values will be computed if
+        normalise is True. If provided, the computations will be skipped.
+
+    Returns
+    -------
+    out : List of 1D np.ndarray, each of shape (n_timepoints - query_length + 1)
+        The distance between the query and all candidates in X.
+
+    """
+    query_length = q.shape[1]
+    dist_profiles = List()
+    # Init distance profile array with unequal length support
+    for i in range(len(X)):
+        dist_profiles.append(np.zeros(X[i].shape[1] - query_length + 1))
+    if normalise:
+        q = z_normalise_series_2d(q)
+    else:
+        q = q.astype(np.float64)
+    for i in range(len(X)):
+        # Cast to float64: Numba does not seem to support strides on integer arrays?
+
+        X_subs = get_all_subsequences(X[i].astype(np.float64), query_length, 1)
+        if normalise:
+            if X_means is None and X_stds is None:
+                _X_means, _X_stds = sliding_mean_std_one_series(X[i], query_length, 1)
+            else:
+                _X_means, _X_stds = X_means[i], X_stds[i]
+            X_subs = normalise_subsequences(X_subs, _X_means, _X_stds)
+        dist_profile = _compute_dist_profile(X_subs, q)
+        dist_profile[~mask[i]] = np.inf
+        dist_profiles[i] = dist_profile
+    return dist_profiles
+
+
+@njit(cache=True, fastmath=True)
+def naive_squared_matrix_profile(X, T, query_length, mask, normalise=False):
+    """
+    Compute a squared euclidean matrix profile.
+
+    Parameters
+    ----------
+    X : array, shape=(n_samples, n_channels, n_timepoints_x)
+        Input time series dataset to search in.
+    T : array, shape=(n_channels, n_timepoints_t)
+        Time series from which queries are extracted.
+    query_length : int
+        Length of the queries to extract from T.
+    mask : array, shape=(n_samples, n_timepoints_x - query_length + 1)
+        Boolean mask indicating candidates for which the distance
+        profiles computed for each query should be set to infinity.
+    normalise : bool
+        Whether to use a z-normalised distance.
+
+    Returns
+    -------
+    out : np.ndarray, 1D array of shape (n_timepoints_t - query_length + 1)
+        The minimum distance between each query in T and all candidates in X.
+    """
+    X_subs = List()
+    for i in range(len(X)):
+        i_subs = get_all_subsequences(X[i].astype(np.float64), query_length, 1)
+        if normalise:
+            X_means, X_stds = sliding_mean_std_one_series(X[i], query_length, 1)
+            i_subs = normalise_subsequences(i_subs, X_means, X_stds)
+        X_subs.append(i_subs)
+
+    n_candidates = T.shape[1] - query_length + 1
+    mp = np.full(n_candidates, np.inf)
+
+    for i in range(n_candidates):
+        q = T[:, i : i + query_length]
+        if normalise:
+            q = z_normalise_series_2d(q)
+        for id_sample in range(len(X)):
+            dist_profile = _compute_dist_profile(X_subs[id_sample], q)
+            dist_profile[~mask[id_sample]] = np.inf
+            mp[i] = min(mp[i], dist_profile.min())
+    return mp
+
 
 def fft_sliding_dot_product(X, q):
     """
@@ -23,7 +167,6 @@ def fft_sliding_dot_product(X, q):
     ----------
     X : array, shape=(n_channels, n_timepoints)
         Input time series
-
     q : array, shape=(n_channels, query_length)
         Input query
 

diff --git a/aeon/similarity_search/base.py b/aeon/similarity_search/base.py
@@ -30,8 +30,8 @@ class BaseSimilaritySearch(BaseCollectionEstimator):
     inverse_distance : bool, default=False
         If True, the matching will be made on the inverse of the distance, and thus, the
         worst matches to the query will be returned instead of the best ones.
-    normalize : bool, default=False
-        Whether the distance function should be z-normalized.
+    normalise : bool, default=False
+        Whether the distance function should be z-normalised.
     speed_up : str, default='fastest'
         Which speed up technique to use for the selected distance
         function. By default, the fastest algorithm is used. A list of available
@@ -56,6 +56,7 @@ class BaseSimilaritySearch(BaseCollectionEstimator):
         "capability:multivariate": True,
         "capability:unequal_length": True,
         "capability:multithreading": True,
+        "fit_is_empty": False,
         "X_inner_type": ["np-list", "numpy3D"],
     }
 
@@ -64,14 +65,14 @@ def __init__(
         distance: str = "euclidean",
         distance_args: Optional[dict] = None,
         inverse_distance: bool = False,
-        normalize: bool = False,
+        normalise: bool = False,
         speed_up: str = "fastest",
         n_jobs: int = 1,
     ):
         self.distance = distance
         self.distance_args = distance_args
         self.inverse_distance = inverse_distance
-        self.normalize = normalize
+        self.normalise = normalise
         self.n_jobs = n_jobs
         self.speed_up = speed_up
         super().__init__()
@@ -108,6 +109,7 @@ def fit(self, X: np.ndarray, y=None):
         set_num_threads(self._n_jobs)
         self._fit(X, y)
         set_num_threads(prev_threads)
+        self.is_fitted = True
         return self
 
     def _store_mean_std_from_inputs(self, query_length: int) -> None:

diff --git a/aeon/similarity_search/distance_profiles/__init__.py b/aeon/similarity_search/distance_profiles/__init__.py
@@ -1,24 +1,18 @@
 """Distance profiles."""
 
 __all__ = [
-    "naive_distance_profile",
-    "normalized_naive_distance_profile",
     "euclidean_distance_profile",
-    "normalized_euclidean_distance_profile",
+    "normalised_euclidean_distance_profile",
     "squared_distance_profile",
-    "normalized_squared_distance_profile",
+    "normalised_squared_distance_profile",
 ]
 
 
 from aeon.similarity_search.distance_profiles.euclidean_distance_profile import (
     euclidean_distance_profile,
-    normalized_euclidean_distance_profile,
-)
-from aeon.similarity_search.distance_profiles.naive_distance_profile import (
-    naive_distance_profile,
-    normalized_naive_distance_profile,
+    normalised_euclidean_distance_profile,
 )
 from aeon.similarity_search.distance_profiles.squared_distance_profile import (
-    normalized_squared_distance_profile,
+    normalised_squared_distance_profile,
     squared_distance_profile,
 )
diff --git a/aeon/similarity_search/distance_profiles/euclidean_distance_profile.py b/aeon/similarity_search/distance_profiles/euclidean_distance_profile.py
@@ -9,7 +9,7 @@
 from numba.typed import List
 
 from aeon.similarity_search.distance_profiles.squared_distance_profile import (
-    normalized_squared_distance_profile,
+    normalised_squared_distance_profile,
     squared_distance_profile,
 )
 
@@ -39,18 +39,18 @@ def euclidean_distance_profile(
     Returns
     -------
     distance_profiles : np.ndarray
-        3D array of shape (n_cases, n_channels, n_timepoints - query_length + 1)
-        The distance profile between q and the input time series X independently
-        for each channel.
+        2D array of shape (n_cases, n_timepoints - query_length + 1)
+        The distance profile between q and the input time series X.
 
     """
     distance_profiles = squared_distance_profile(X, q, mask)
+    # Loop needed: the result may be a list of np arrays in the unequal length case
     for i in range(len(distance_profiles)):
         distance_profiles[i] = distance_profiles[i] ** 0.5
     return distance_profiles
 
 
-def normalized_euclidean_distance_profile(
+def normalised_euclidean_distance_profile(
     X: Union[np.ndarray, List],
     q: np.ndarray,
     mask: np.ndarray,
@@ -89,14 +89,14 @@ def normalized_euclidean_distance_profile(
     Returns
     -------
     distance_profiles : np.ndarray
-        3D array of shape (n_cases, n_channels, n_timepoints - query_length + 1)
-        The distance profile between q and the input time series X independently
-        for each channel.
+        2D array of shape (n_cases, n_timepoints - query_length + 1)
+        The distance profile between q and the input time series X.
 
     """
-    distance_profiles = normalized_squared_distance_profile(
+    distance_profiles = normalised_squared_distance_profile(
         X, q, mask, X_means, X_stds, q_means, q_stds
     )
+    # Loop needed: the result may be a list of np arrays in the unequal length case
     for i in range(len(distance_profiles)):
         distance_profiles[i] = distance_profiles[i] ** 0.5
     return distance_profiles