Skip to content

Commit

Permalink
[MNT] Purging similarity search from distance as argument numba funct…
Browse files Browse the repository at this point in the history
…ions (#2176)

* WIP purging similarity search from naive functions

* Update Docs and testing configs

* Removes channel independence, add img, fix and add functions and tests

* Update notebook to avoid timeout and remove test from removed func

* Update normalize to normalise and tests

* Update normalize to normalise and tests

* Adjust test to not execute those caused by a known issue and uncomment others

* remove float_ and typo

* Add similarity search test structure

* Add test for data generators similarity search

* Add similarity search test structure

* Fix tests

* Fix tests

* test adding back expected results rdst

* Add back exclusion for RDSTRegressor

* Empty commit for CI

* Fix buggy tests

* Empty commit for CI

---------

Co-authored-by: MatthewMiddlehurst <[email protected]>
Co-authored-by: Matthew Middlehurst <[email protected]>
  • Loading branch information
3 people authored Nov 9, 2024
1 parent d59bd25 commit 01495e7
Show file tree
Hide file tree
Showing 41 changed files with 1,553 additions and 2,182 deletions.
1 change: 0 additions & 1 deletion aeon/classification/shapelet_based/_rdst.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,6 @@ class RDSTClassifier(BaseClassifier):
"capability:unequal_length": True,
"capability:multithreading": True,
"X_inner_type": ["np-list", "numpy3D"],
"non_deterministic": True, # due to random_state bug in MacOS #324
"algorithm_type": "shapelet",
}

Expand Down
1 change: 0 additions & 1 deletion aeon/regression/shapelet_based/_rdst.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,6 @@ class RDSTRegressor(BaseRegressor):
"capability:unequal_length": True,
"capability:multithreading": True,
"X_inner_type": ["np-list", "numpy3D"],
"non_deterministic": True, # due to random_state bug in MacOS #324
"algorithm_type": "shapelet",
}

Expand Down
145 changes: 144 additions & 1 deletion aeon/similarity_search/_commons.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,152 @@

import numpy as np
from numba import njit, prange
from numba.typed import List
from scipy.signal import convolve

from aeon.utils.numba.general import (
get_all_subsequences,
normalise_subsequences,
sliding_mean_std_one_series,
z_normalise_series_2d,
)


@njit(cache=True, fastmath=True)
def _compute_dist_profile(X_subs, q):
    """
    Compute the squared Euclidean distance of each subsequence to a query.

    Parameters
    ----------
    X_subs : array, shape=(n_samples, n_channels, query_length)
        Candidate subsequences extracted from a time series.
    q : array, shape=(n_channels, query_length)
        Query that every subsequence is compared against.

    Returns
    -------
    dist_profile : np.ndarray, 1D array of shape (n_samples)
        For each subsequence, the sum over all channels and time points of
        the squared differences with the query.
    """
    n_subs, n_dims, length = X_subs.shape
    profile = np.zeros(n_subs)
    for i_sub in range(n_subs):
        # Accumulate in a scalar and store once per subsequence.
        total = 0.0
        for i_dim in range(n_dims):
            for i_t in range(length):
                total += (X_subs[i_sub, i_dim, i_t] - q[i_dim, i_t]) ** 2
        profile[i_sub] = total
    return profile


@njit(cache=True, fastmath=True)
def naive_squared_distance_profile(
    X,
    q,
    mask,
    normalise=False,
    X_means=None,
    X_stds=None,
):
    """
    Compute a squared euclidean distance profile.

    Parameters
    ----------
    X : array, shape=(n_samples, n_channels, n_timepoints)
        Input time series dataset to search in. Each series is processed on
        its own, so the series may have different numbers of time points.
    q : array, shape=(n_channels, query_length)
        Query used during the search.
    mask : array, shape=(n_samples, n_timepoints - query_length + 1)
        Boolean mask over the candidates. Candidates whose mask value is
        False have their distance set to infinity; candidates whose mask
        value is True keep their computed distance.
    normalise : bool
        Whether to use a z-normalised distance.
    X_means : array, shape=(n_samples, n_channels, n_timepoints - query_length + 1)
        Mean of each candidate (subsequence) of length query_length in X. Only
        used if normalise is True. The means are computed here only when both
        X_means and X_stds are None; otherwise the provided values are used.
    X_stds : array, shape=(n_samples, n_channels, n_timepoints - query_length + 1)
        Standard deviation of each candidate (subsequence) of length
        query_length in X. Only used if normalise is True. The standard
        deviations are computed here only when both X_means and X_stds are
        None; otherwise the provided values are used.

    Returns
    -------
    dist_profiles : numba.typed.List of np.ndarray
        One 1D array per series in X, each of shape
        (n_timepoints - query_length + 1), holding the squared distance
        between the query and every candidate of that series.
    """
    query_length = q.shape[1]
    if normalise:
        q = z_normalise_series_2d(q)
    else:
        q = q.astype(np.float64)

    # A typed list (rather than a 2D array) keeps unequal length support.
    dist_profiles = List()
    for i in range(len(X)):
        # Cast to float64 before extracting the subsequences; the original
        # author suspected numba does not support strides on integer arrays.
        X_subs = get_all_subsequences(X[i].astype(np.float64), query_length, 1)
        if normalise:
            if X_means is None and X_stds is None:
                _X_means, _X_stds = sliding_mean_std_one_series(X[i], query_length, 1)
            else:
                _X_means, _X_stds = X_means[i], X_stds[i]
            X_subs = normalise_subsequences(X_subs, _X_means, _X_stds)
        dist_profile = _compute_dist_profile(X_subs, q)
        # Candidates excluded by the mask can never be selected as a match.
        dist_profile[~mask[i]] = np.inf
        dist_profiles.append(dist_profile)
    return dist_profiles


@njit(cache=True, fastmath=True)
def naive_squared_matrix_profile(X, T, query_length, mask, normalise=False):
    """
    Compute a squared euclidean matrix profile.

    Parameters
    ----------
    X : array, shape=(n_samples, n_channels, n_timepoints_x)
        Input time series dataset to search in.
    T : array, shape=(n_channels, n_timepoints_t)
        Time series from which queries are extracted.
    query_length : int
        Length of the queries to extract from T.
    mask : array, shape=(n_samples, n_timepoints_x - query_length + 1)
        Boolean mask over the candidates of X. Candidates whose mask value is
        False have their distance set to infinity for every query.
    normalise : bool
        Whether to use a z-normalised distance.

    Returns
    -------
    out : np.ndarray, 1D array of shape (n_timepoints_t - query_length + 1)
        The minimum distance between each query in T and all candidates in X.
    """
    # Extract (and optionally normalise) the candidates of every series once,
    # so that they are reused for all queries.
    candidates = List()
    for i_x in range(len(X)):
        subs = get_all_subsequences(X[i_x].astype(np.float64), query_length, 1)
        if normalise:
            means, stds = sliding_mean_std_one_series(X[i_x], query_length, 1)
            subs = normalise_subsequences(subs, means, stds)
        candidates.append(subs)

    n_queries = T.shape[1] - query_length + 1
    mp = np.full(n_queries, np.inf)

    for i_q in range(n_queries):
        q = T[:, i_q : i_q + query_length]
        if normalise:
            q = z_normalise_series_2d(q)
        # Running minimum over the distance profiles of all series.
        best = np.inf
        for i_x in range(len(X)):
            profile = _compute_dist_profile(candidates[i_x], q)
            profile[~mask[i_x]] = np.inf
            best = min(best, profile.min())
        mp[i_q] = best
    return mp


def fft_sliding_dot_product(X, q):
"""
Expand All @@ -23,7 +167,6 @@ def fft_sliding_dot_product(X, q):
----------
X : array, shape=(n_channels, n_timepoints)
Input time series
q : array, shape=(n_channels, query_length)
Input query
Expand Down
10 changes: 6 additions & 4 deletions aeon/similarity_search/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ class BaseSimilaritySearch(BaseCollectionEstimator):
inverse_distance : bool, default=False
If True, the matching will be made on the inverse of the distance, and thus, the
worst matches to the query will be returned instead of the best ones.
normalize : bool, default=False
Whether the distance function should be z-normalized.
normalise : bool, default=False
Whether the distance function should be z-normalised.
speed_up : str, default='fastest'
Which speed up technique to use for the selected distance
function. By default, the fastest algorithm is used. A list of available
Expand All @@ -56,6 +56,7 @@ class BaseSimilaritySearch(BaseCollectionEstimator):
"capability:multivariate": True,
"capability:unequal_length": True,
"capability:multithreading": True,
"fit_is_empty": False,
"X_inner_type": ["np-list", "numpy3D"],
}

Expand All @@ -64,14 +65,14 @@ def __init__(
distance: str = "euclidean",
distance_args: Optional[dict] = None,
inverse_distance: bool = False,
normalize: bool = False,
normalise: bool = False,
speed_up: str = "fastest",
n_jobs: int = 1,
):
self.distance = distance
self.distance_args = distance_args
self.inverse_distance = inverse_distance
self.normalize = normalize
self.normalise = normalise
self.n_jobs = n_jobs
self.speed_up = speed_up
super().__init__()
Expand Down Expand Up @@ -108,6 +109,7 @@ def fit(self, X: np.ndarray, y=None):
set_num_threads(self._n_jobs)
self._fit(X, y)
set_num_threads(prev_threads)
self.is_fitted = True
return self

def _store_mean_std_from_inputs(self, query_length: int) -> None:
Expand Down
14 changes: 4 additions & 10 deletions aeon/similarity_search/distance_profiles/__init__.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,18 @@
"""Distance profiles."""

__all__ = [
"naive_distance_profile",
"normalized_naive_distance_profile",
"euclidean_distance_profile",
"normalized_euclidean_distance_profile",
"normalised_euclidean_distance_profile",
"squared_distance_profile",
"normalized_squared_distance_profile",
"normalised_squared_distance_profile",
]


from aeon.similarity_search.distance_profiles.euclidean_distance_profile import (
euclidean_distance_profile,
normalized_euclidean_distance_profile,
)
from aeon.similarity_search.distance_profiles.naive_distance_profile import (
naive_distance_profile,
normalized_naive_distance_profile,
normalised_euclidean_distance_profile,
)
from aeon.similarity_search.distance_profiles.squared_distance_profile import (
normalized_squared_distance_profile,
normalised_squared_distance_profile,
squared_distance_profile,
)
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from numba.typed import List

from aeon.similarity_search.distance_profiles.squared_distance_profile import (
normalized_squared_distance_profile,
normalised_squared_distance_profile,
squared_distance_profile,
)

Expand Down Expand Up @@ -39,18 +39,18 @@ def euclidean_distance_profile(
Returns
-------
distance_profiles : np.ndarray
3D array of shape (n_cases, n_channels, n_timepoints - query_length + 1)
The distance profile between q and the input time series X independently
for each channel.
2D array of shape (n_cases, n_timepoints - query_length + 1)
The distance profile between q and the input time series X.
"""
distance_profiles = squared_distance_profile(X, q, mask)
# Need loop as we can return a list of np array in the unequal length case
for i in range(len(distance_profiles)):
distance_profiles[i] = distance_profiles[i] ** 0.5
return distance_profiles


def normalized_euclidean_distance_profile(
def normalised_euclidean_distance_profile(
X: Union[np.ndarray, List],
q: np.ndarray,
mask: np.ndarray,
Expand Down Expand Up @@ -89,14 +89,14 @@ def normalized_euclidean_distance_profile(
Returns
-------
distance_profiles : np.ndarray
3D array of shape (n_cases, n_channels, n_timepoints - query_length + 1)
The distance profile between q and the input time series X independently
for each channel.
2D array of shape (n_cases, n_timepoints - query_length + 1)
The distance profile between q and the input time series X.
"""
distance_profiles = normalized_squared_distance_profile(
distance_profiles = normalised_squared_distance_profile(
X, q, mask, X_means, X_stds, q_means, q_stds
)
# Need loop as we can return a list of np array in the unequal length case
for i in range(len(distance_profiles)):
distance_profiles[i] = distance_profiles[i] ** 0.5
return distance_profiles
Loading

0 comments on commit 01495e7

Please sign in to comment.