From 1646895bc931f5e47553da51c1b533461cafb688 Mon Sep 17 00:00:00 2001 From: baraline Date: Wed, 4 Dec 2024 22:47:52 +0100 Subject: [PATCH 01/18] WIP remake module structure --- aeon/similarity_search/__init__.py | 6 +- aeon/similarity_search/_commons.py | 504 -------------- aeon/similarity_search/base.py | 209 ++---- .../distance_profiles/__init__.py | 18 - .../euclidean_distance_profile.py | 102 --- .../squared_distance_profile.py | 319 --------- .../distance_profiles/tests/__init__.py | 1 - .../tests/test_euclidean_distance.py | 208 ------ .../tests/test_squared_distance.py | 200 ------ .../matrix_profiles/__init__.py | 14 - .../matrix_profiles/stomp.py | 633 ------------------ .../matrix_profiles/tests/test_stomp.py | 205 ------ aeon/similarity_search/query_search.py | 428 ------------ aeon/similarity_search/series_search.py | 436 ------------ .../series_search/__init__.py | 7 + aeon/similarity_search/series_search/base.py | 22 + .../subsequence_search/__init__.py | 5 + .../subsequence_search/_brute_force.py | 284 ++++++++ .../subsequence_search/_commons.py | 138 ++++ .../subsequence_search/_stomp.py | 596 +++++++++++++++++ .../subsequence_search/base.py | 358 ++++++++++ .../tests/__init__.py | 0 .../subsequence_search/tests/test__commons.py | 64 ++ .../subsequence_search/tests/test_stomp.py | 238 +++++++ aeon/similarity_search/tests/test__commons.py | 49 -- .../tests/test_query_search.py | 176 ----- .../tests/test_series_search.py | 74 -- aeon/utils/numba/general.py | 3 + 28 files changed, 1785 insertions(+), 3512 deletions(-) delete mode 100644 aeon/similarity_search/_commons.py delete mode 100644 aeon/similarity_search/distance_profiles/__init__.py delete mode 100644 aeon/similarity_search/distance_profiles/euclidean_distance_profile.py delete mode 100644 aeon/similarity_search/distance_profiles/squared_distance_profile.py delete mode 100644 aeon/similarity_search/distance_profiles/tests/__init__.py delete mode 100644 
aeon/similarity_search/distance_profiles/tests/test_euclidean_distance.py delete mode 100644 aeon/similarity_search/distance_profiles/tests/test_squared_distance.py delete mode 100644 aeon/similarity_search/matrix_profiles/__init__.py delete mode 100644 aeon/similarity_search/matrix_profiles/stomp.py delete mode 100644 aeon/similarity_search/matrix_profiles/tests/test_stomp.py delete mode 100644 aeon/similarity_search/query_search.py delete mode 100644 aeon/similarity_search/series_search.py create mode 100644 aeon/similarity_search/series_search/__init__.py create mode 100644 aeon/similarity_search/series_search/base.py create mode 100644 aeon/similarity_search/subsequence_search/__init__.py create mode 100644 aeon/similarity_search/subsequence_search/_brute_force.py create mode 100644 aeon/similarity_search/subsequence_search/_commons.py create mode 100644 aeon/similarity_search/subsequence_search/_stomp.py create mode 100644 aeon/similarity_search/subsequence_search/base.py rename aeon/similarity_search/{matrix_profiles => subsequence_search}/tests/__init__.py (100%) create mode 100644 aeon/similarity_search/subsequence_search/tests/test__commons.py create mode 100644 aeon/similarity_search/subsequence_search/tests/test_stomp.py delete mode 100644 aeon/similarity_search/tests/test__commons.py delete mode 100644 aeon/similarity_search/tests/test_query_search.py delete mode 100644 aeon/similarity_search/tests/test_series_search.py diff --git a/aeon/similarity_search/__init__.py b/aeon/similarity_search/__init__.py index f576c41f03..53f80b2cdf 100644 --- a/aeon/similarity_search/__init__.py +++ b/aeon/similarity_search/__init__.py @@ -1,7 +1,3 @@ """Similarity search module.""" -__all__ = ["BaseSimilaritySearch", "QuerySearch", "SeriesSearch"] - -from aeon.similarity_search.base import BaseSimilaritySearch -from aeon.similarity_search.query_search import QuerySearch -from aeon.similarity_search.series_search import SeriesSearch +__all__ = [] diff --git 
a/aeon/similarity_search/_commons.py b/aeon/similarity_search/_commons.py deleted file mode 100644 index 1d20a6a5b0..0000000000 --- a/aeon/similarity_search/_commons.py +++ /dev/null @@ -1,504 +0,0 @@ -"""Helper and common function for similarity search estimators and functions.""" - -__maintainer__ = ["baraline"] - -import warnings - -import numpy as np -from numba import njit, prange -from numba.typed import List -from scipy.signal import convolve - -from aeon.utils.numba.general import ( - get_all_subsequences, - normalise_subsequences, - sliding_mean_std_one_series, - z_normalise_series_2d, -) - - -@njit(cache=True, fastmath=True) -def _compute_dist_profile(X_subs, q): - """ - Compute the distance profile between subsequences and a query. - - Parameters - ---------- - X_subs : array, shape=(n_samples, n_channels, query_length) - Input subsequences extracted from a time series. - q : array, shape=(n_channels, query_length) - Query used for the distance computation - - Returns - ------- - dist_profile : np.ndarray, 1D array of shape (n_samples) - The distance between the query all subsequences. - - """ - n_candidates, n_channels, q_length = X_subs.shape - dist_profile = np.zeros(n_candidates) - for i in range(n_candidates): - for j in range(n_channels): - for k in range(q_length): - dist_profile[i] += (X_subs[i, j, k] - q[j, k]) ** 2 - return dist_profile - - -@njit(cache=True, fastmath=True) -def naive_squared_distance_profile( - X, - q, - mask, - normalise=False, - X_means=None, - X_stds=None, -): - """ - Compute a squared euclidean distance profile. - - Parameters - ---------- - X : array, shape=(n_samples, n_channels, n_timepoints) - Input time series dataset to search in. - q : array, shape=(n_channels, query_length) - Query used during the search. - mask : array, shape=(n_samples, n_timepoints - query_length + 1) - Boolean mask indicating candidates for which the distance - profiles computed for each query should be set to infinity. 
- normalise : bool - Wheter to use a z-normalised distance. - X_means : array, shape=(n_samples, n_channels, n_timepoints - query_length + 1) - Mean of each candidate (subsequence) of length query_length in X. The - default is None, meaning that these values will be computed if normalise - is True. If provided, the computations will be skipped. - X_stds : array, shape=(n_samples, n_channels, n_timepoints - query_length + 1) - Standard deviation of each candidate (subsequence) of length query_length - in X. The default is None, meaning that these values will be computed if - normalise is True. If provided, the computations will be skipped. - - Returns - ------- - out : np.ndarray, 1D array of shape (n_samples, n_timepoints_t - query_length + 1) - The distance between the query and all candidates in X. - - """ - query_length = q.shape[1] - dist_profiles = List() - # Init distance profile array with unequal length support - for i in range(len(X)): - dist_profiles.append(np.zeros(X[i].shape[1] - query_length + 1)) - if normalise: - q = z_normalise_series_2d(q) - else: - q = q.astype(np.float64) - for i in range(len(X)): - # Numba don't support strides with integers ? - - X_subs = get_all_subsequences(X[i].astype(np.float64), query_length, 1) - if normalise: - if X_means is None and X_stds is None: - _X_means, _X_stds = sliding_mean_std_one_series(X[i], query_length, 1) - else: - _X_means, _X_stds = X_means[i], X_stds[i] - X_subs = normalise_subsequences(X_subs, _X_means, _X_stds) - dist_profile = _compute_dist_profile(X_subs, q) - dist_profile[~mask[i]] = np.inf - dist_profiles[i] = dist_profile - return dist_profiles - - -@njit(cache=True, fastmath=True) -def naive_squared_matrix_profile(X, T, query_length, mask, normalise=False): - """ - Compute a squared euclidean matrix profile. - - Parameters - ---------- - X : array, shape=(n_samples, n_channels, n_timepoints_x) - Input time series dataset to search in. 
- T : array, shape=(n_channels, n_timepoints_t) - Time series from which queries are extracted. - query_length : int - Length of the queries to extract from T. - mask : array, shape=(n_samples, n_timepoints_x - query_length + 1) - Boolean mask indicating candidates for which the distance - profiles computed for each query should be set to infinity. - normalise : bool - Wheter to use a z-normalised distance. - - Returns - ------- - out : np.ndarray, 1D array of shape (n_timepoints_t - query_length + 1) - The minimum distance between each query in T and all candidates in X. - """ - X_subs = List() - for i in range(len(X)): - i_subs = get_all_subsequences(X[i].astype(np.float64), query_length, 1) - if normalise: - X_means, X_stds = sliding_mean_std_one_series(X[i], query_length, 1) - i_subs = normalise_subsequences(i_subs, X_means, X_stds) - X_subs.append(i_subs) - - n_candidates = T.shape[1] - query_length + 1 - mp = np.full(n_candidates, np.inf) - - for i in range(n_candidates): - q = T[:, i : i + query_length] - if normalise: - q = z_normalise_series_2d(q) - for id_sample in range(len(X)): - dist_profile = _compute_dist_profile(X_subs[id_sample], q) - dist_profile[~mask[id_sample]] = np.inf - mp[i] = min(mp[i], dist_profile.min()) - return mp - - -def fft_sliding_dot_product(X, q): - """ - Use FFT convolution to calculate the sliding window dot product. - - This function applies the Fast Fourier Transform (FFT) to efficiently compute - the sliding dot product between the input time series `X` and the query `q`. - The dot product is computed for each channel individually. 
The sliding window - approach ensures that the dot product is calculated for every possible subsequence - of `X` that matches the length of `q` - - Parameters - ---------- - X : array, shape=(n_channels, n_timepoints) - Input time series - q : array, shape=(n_channels, query_length) - Input query - - Returns - ------- - out : np.ndarray, 2D array of shape (n_channels, n_timepoints - query_length + 1) - Sliding dot product between q and X. - """ - n_channels, n_timepoints = X.shape - query_length = q.shape[1] - out = np.zeros((n_channels, n_timepoints - query_length + 1)) - for i in range(n_channels): - out[i, :] = convolve(np.flipud(q[i, :]), X[i, :], mode="valid").real - return out - - -def get_ith_products(X, T, L, ith): - """ - Compute dot products between X and the i-th subsequence of size L in T. - - Parameters - ---------- - X : array, shape = (n_channels, n_timepoints_X) - Input data. - T : array, shape = (n_channels, n_timepoints_T) - Data containing the query. - L : int - Overall query length. - ith : int - Query starting index in T. - - Returns - ------- - np.ndarray, 2D array of shape (n_channels, n_timepoints_X - L + 1) - Sliding dot product between the i-th subsequence of size L in T and X. - - """ - return fft_sliding_dot_product(X, T[:, ith : ith + L]) - - -@njit(cache=True) -def numba_roll_1D_no_warparound(array, shift, warparound_value): - """ - Roll the rows of an array. - - Wheter to allow values at the end of the array to appear at the start after - being rolled out of the array length. - - Parameters - ---------- - array : np.ndarray of shape (n_columns) - Array to roll. - shift : int - The amount of indexes the values will be rolled on each row of the array. - Must be inferior or equal to n_columns. - warparound_value : any type - A value of the type of array to insert instead of the value that got rolled - over the array length - - Returns - ------- - rolled_array : np.ndarray of shape (n_rows, n_columns) - The rolled array. 
Can also be a TypedList in the case where n_columns changes - between rows. - - """ - length = array.shape[0] - _a1 = array[: length - shift] - array[shift:] = _a1 - array[:shift] = warparound_value - return array - - -@njit(cache=True) -def numba_roll_2D_no_warparound(array, shift, warparound_value): - """ - Roll the rows of an array. - - Wheter to allow values at the end of the array to appear at the start after - being rolled out of the array length. - - Parameters - ---------- - array : np.ndarray of shape (n_rows, n_columns) - Array to roll. Can also be a TypedList in the case where n_columns changes - between rows. - shift : int - The amount of indexes the values will be rolled on each row of the array. - Must be inferior or equal to n_columns. - warparound_value : any type - A value of the type of array to insert instead of the value that got rolled - over the array length - - Returns - ------- - rolled_array : np.ndarray of shape (n_rows, n_columns) - The rolled array. Can also be a TypedList in the case where n_columns changes - between rows. - - """ - for i in prange(len(array)): - length = len(array[i]) - _a1 = array[i][: length - shift] - array[i][shift:] = _a1 - array[i][:shift] = warparound_value - return array - - -@njit(cache=True) -def extract_top_k_and_threshold_from_distance_profiles_one_series( - distance_profiles, - id_x, - k=1, - threshold=np.inf, - exclusion_size=None, - inverse_distance=False, -): - """ - Extract the top-k smallest values from distance profiles and apply threshold. - - This function processes a distance profile and extracts the top-k smallest - distance values, optionally applying a threshold to exclude distances above - a given value. It also optionally handles exclusion zones to avoid selecting - neighboring timestamps. - - Parameters - ---------- - distance_profiles : np.ndarray, 2D array of shape (n_cases, n_candidates) - Precomputed distance profile. Can be a TypedList if n_candidates vary between - cases. 
- id_x : int - Identifier of the series or subsequence from which the distance profile - is computed. - k : int - Number of matches to returns - threshold : float - All matches below this threshold will be returned - exclusion_size : int or None, optional, default=None - Size of the exclusion zone around the current subsequence. This prevents - selecting neighboring subsequences within the specified range, useful for - avoiding trivial matches in time series data. If set to `None`, no - exclusion zone is applied. - inverse_distance : bool, optional - Wheter to return the worst matches instead of the bests. The default is False. - - Returns - ------- - top_k_dist : np.ndarray - Array of the top-k smallest distance values, potentially excluding values above - the threshold or those within the exclusion zone. - top_k : np.ndarray - Array of shape (k, 2) where each row contains the `id_x` identifier and the - index of the corresponding subsequence (or timestamp) with the top-k smallest - distances. 
- """ - if inverse_distance: - # To avoid div by 0 case - distance_profiles += 1e-8 - distance_profiles[distance_profiles != np.inf] = ( - 1 / distance_profiles[distance_profiles != np.inf] - ) - - if threshold != np.inf: - distance_profiles[distance_profiles > threshold] = np.inf - - _argsort = np.argsort(distance_profiles) - - if distance_profiles[distance_profiles <= threshold].shape[0] < k: - _k = distance_profiles[distance_profiles <= threshold].shape[0] - elif _argsort.shape[0] < k: - _k = _argsort.shape[0] - else: - _k = k - - if exclusion_size is None: - indexes = np.zeros((_k, 2), dtype=np.int_) - for i in range(_k): - indexes[i, 0] = id_x - indexes[i, 1] = _argsort[i] - return distance_profiles[_argsort[:_k]], indexes - else: - # Apply exclusion zone to avoid neighboring matches - top_k = np.zeros((_k, 2), dtype=np.int_) - exclusion_size - top_k_dist = np.zeros((_k), dtype=np.float64) - - top_k[0, 0] = id_x - top_k[0, 1] = _argsort[0] - - top_k_dist[0] = distance_profiles[_argsort[0]] - - n_inserted = 1 - i_current = 1 - - while n_inserted < _k and i_current < _argsort.shape[0]: - candidate_timestamp = _argsort[i_current] - - insert = True - LB = candidate_timestamp >= (top_k[:, 1] - exclusion_size) - UB = candidate_timestamp <= (top_k[:, 1] + exclusion_size) - if np.any(UB & LB): - insert = False - - if insert: - top_k[n_inserted, 0] = id_x - top_k[n_inserted, 1] = _argsort[i_current] - top_k_dist[n_inserted] = distance_profiles[_argsort[i_current]] - n_inserted += 1 - i_current += 1 - return top_k_dist[:n_inserted], top_k[:n_inserted] - - -def extract_top_k_and_threshold_from_distance_profiles( - distance_profiles, - k=1, - threshold=np.inf, - exclusion_size=None, - inverse_distance=False, -): - """ - Extract the best matches from a distance profile given k and threshold parameters. - - Parameters - ---------- - distance_profiles : np.ndarray, 2D array of shape (n_cases, n_candidates) - Precomputed distance profile. 
Can be a TypedList if n_candidates vary between - cases. - k : int - Number of matches to returns - threshold : float - All matches below this threshold will be returned - exclusion_size : int, optional - The size of the exclusion zone used to prevent returning as top k candidates - the ones that are close to each other (for example i and i+1). - It is used to define a region between - :math:`id_timestamp - exclusion_size` and - :math:`id_timestamp + exclusion_size` which cannot be returned - as best match if :math:`id_timestamp` was already selected. By default, - the value None means that this is not used. - inverse_distance : bool, optional - Wheter to return the worst matches instead of the bests. The default is False. - - Returns - ------- - Tuple(ndarray, ndarray) - The first array, of shape ``(n_matches)``, contains the distance between - the query and its best matches in X_. The second array, of shape - ``(n_matches, 2)``, contains the indexes of these matches as - ``(id_sample, id_timepoint)``. The corresponding match can be - retrieved as ``X_[id_sample, :, id_timepoint : id_timepoint + length]``. 
- - """ - # This whole function could be optimized and maybe made in numba to avoid stepping - # out of numba mode during distance computations - - n_cases_ = len(distance_profiles) - - id_timestamps = np.concatenate( - [np.arange(distance_profiles[i].shape[0]) for i in range(n_cases_)] - ) - id_samples = np.concatenate( - [[i] * distance_profiles[i].shape[0] for i in range(n_cases_)] - ) - - distance_profiles = np.concatenate(distance_profiles) - - if inverse_distance: - # To avoid div by 0 case - distance_profiles += 1e-8 - distance_profiles[distance_profiles != np.inf] = ( - 1 / distance_profiles[distance_profiles != np.inf] - ) - - if threshold != np.inf: - distance_profiles[distance_profiles > threshold] = np.inf - - _argsort_1d = np.argsort(distance_profiles) - _argsort = np.asarray( - [ - [id_samples[_argsort_1d[i]], id_timestamps[_argsort_1d[i]]] - for i in range(len(_argsort_1d)) - ], - dtype=int, - ) - - if distance_profiles[distance_profiles <= threshold].shape[0] < k: - _k = distance_profiles[distance_profiles <= threshold].shape[0] - warnings.warn( - f"Only {_k} matches are bellow the threshold of {threshold}, while" - f" k={k}. The number of returned match will be {_k}.", - stacklevel=2, - ) - elif _argsort.shape[0] < k: - _k = _argsort.shape[0] - warnings.warn( - f"The number of possible match is {_argsort.shape[0]}, but got" - f" k={k}. 
The number of returned match will be {_k}.", - stacklevel=2, - ) - else: - _k = k - - if exclusion_size is None: - return distance_profiles[_argsort_1d[:_k]], _argsort[:_k] - else: - # Apply exclusion zone to avoid neighboring matches - top_k = np.zeros((_k, 2), dtype=int) - top_k_dist = np.zeros((_k), dtype=float) - - top_k[0] = _argsort[0, :] - top_k_dist[0] = distance_profiles[_argsort_1d[0]] - - n_inserted = 1 - i_current = 1 - - while n_inserted < _k and i_current < _argsort.shape[0]: - candidate_sample, candidate_timestamp = _argsort[i_current] - - insert = True - is_from_same_sample = top_k[:, 0] == candidate_sample - if np.any(is_from_same_sample): - LB = candidate_timestamp >= ( - top_k[is_from_same_sample, 1] - exclusion_size - ) - UB = candidate_timestamp <= ( - top_k[is_from_same_sample, 1] + exclusion_size - ) - if np.any(UB & LB): - insert = False - - if insert: - top_k[n_inserted] = _argsort[i_current] - top_k_dist[n_inserted] = distance_profiles[_argsort_1d[i_current]] - n_inserted += 1 - i_current += 1 - return top_k_dist[:n_inserted], top_k[:n_inserted] diff --git a/aeon/similarity_search/base.py b/aeon/similarity_search/base.py index 5b0ce8c555..8a9e9547d7 100644 --- a/aeon/similarity_search/base.py +++ b/aeon/similarity_search/base.py @@ -3,15 +3,13 @@ __maintainer__ = ["baraline"] from abc import abstractmethod -from collections.abc import Iterable -from typing import Optional, final +from typing import Optional, Union, final import numpy as np from numba import get_num_threads, set_num_threads from numba.typed import List from aeon.base import BaseCollectionEstimator -from aeon.utils.numba.general import sliding_mean_std_one_series class BaseSimilaritySearch(BaseCollectionEstimator): @@ -20,36 +18,10 @@ class BaseSimilaritySearch(BaseCollectionEstimator): Parameters ---------- - distance : str, default="euclidean" - Name of the distance function to use. 
A list of valid strings can be found in - the documentation for :func:`aeon.distances.get_distance_function`. - If a callable is passed it must either be a python function or numba function - with nopython=True, that takes two 1d numpy arrays as input and returns a float. - distance_args : dict, default=None - Optional keyword arguments for the distance function. - inverse_distance : bool, default=False - If True, the matching will be made on the inverse of the distance, and thus, the - worst matches to the query will be returned instead of the best ones. - normalise : bool, default=False - Whether the distance function should be z-normalised. - speed_up : str, default='fastest' - Which speed up technique to use with for the selected distance - function. By default, the fastest algorithm is used. A list of available - algorithm for each distance can be obtained by calling the - `get_speedup_function_names` function of the child classes. - n_jobs : int, default=1 - Number of parallel jobs to use. - - Attributes - ---------- - X_ : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) - The input time series stored during the fit method. - - Notes - ----- - For now, the multivariate case is only treated as independent. - Distances are computed for each channel independently and then - summed together. + normalize : bool, optional + Whether the inputs should be z-normalized. The default is False. + n_jobs : int, optional + Number of parallel jobs to use. The default is 1. 
""" _tags = { @@ -63,30 +35,27 @@ class BaseSimilaritySearch(BaseCollectionEstimator): @abstractmethod def __init__( self, - distance: str = "euclidean", - distance_args: Optional[dict] = None, - inverse_distance: bool = False, - normalise: bool = False, - speed_up: str = "fastest", - n_jobs: int = 1, + normalize: Optional[bool] = False, + n_jobs: Optional[int] = 1, ): - self.distance = distance - self.distance_args = distance_args - self.inverse_distance = inverse_distance - self.normalise = normalise self.n_jobs = n_jobs - self.speed_up = speed_up + self.normalize = normalize super().__init__() @final - def fit(self, X: np.ndarray, y=None): + def fit( + self, + X: Union[np.ndarray, List], + y=None, + ): """ Fit method: data preprocessing and storage. Parameters ---------- X : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) - Input array to be used as database for the similarity search + Input array to be used as database for the similarity search. If it is an + unequal length collection, it should be a list of 2d numpy arrays. y : optional Not used. @@ -113,120 +82,80 @@ def fit(self, X: np.ndarray, y=None): self.is_fitted = True return self - def _store_mean_std_from_inputs(self, query_length: int) -> None: + @abstractmethod + def find_motifs( + self, + k: int, + threshold: float, + X: Optional[np.ndarray] = None, + allow_overlap: Optional[bool] = True, + ): """ - Store the mean and std of each subsequence of size query_length in X_. + Find the top-k motifs in the training data. + + Given ``k`` and ``threshold`` parameters, this methods returns the top-k motif + sets. We define a motif set as a set of candidates which all are at a distance + of at most ``threshold`` from each other. The top-k motifs sets are the + motif sets with the most candidates. Parameters ---------- - query_length : int - Length of the query. + X : np.ndarray, optional + The query in which we want to indentify motifs. 
If provided, the motifs + extracted should appear in X and in the database given in fit. If not + provided, the motifs will be extracted only from the database given in fit. + k : int, optional + Number of motifs to return + threshold : int, optional + A threshold on the similarity measure to determine which candidates will be + part of a motif set. + allow_overlap: bool, optional + Wheter a candidate can be part of multiple motif sets (True), or if motif + sets should be mutually exclusive (False). Returns ------- - None + list of ndarray, shape=(k,) + A list of at most ``k`` numpy arrays containing the indexes of the + candidates in each motif. """ - means = [] - stds = [] - - for i in range(len(self.X_)): - _mean, _std = sliding_mean_std_one_series(self.X_[i], query_length, 1) - - stds.append(_std) - means.append(_mean) - - self.X_means_ = List(means) - self.X_stds_ = List(stds) + ... - def _init_X_index_mask( + @abstractmethod + def find_neighbors( self, - X_index: Optional[Iterable[int]], - query_length: int, - exclusion_factor: Optional[float] = 2.0, - ) -> np.ndarray: + X: np.ndarray, + k: Optional[int] = 1, + threshold: Optional[float] = np.inf, + ): """ - Initiliaze the mask indicating the candidates to be evaluated in the search. + Find the top-k neighbors of X in the database. + + Given ``k`` and ``threshold`` parameters, this methods returns the top-k + neighbors of X, such as each of the ``k`` neighbors as a distance inferior or + equal to ``threshold``. By default, ``threshold`` is set to infinity. It is + possible for this method to return less than ``k`` neighbors, either if there + is less than ``k`` admissible candidate in the database, or if in the top-k + candidates, some do not meet the ``threshold`` condition. Parameters ---------- - X_index : Iterable - Any Iterable (tuple, list, array) of length two used to specify the index of - the query X if it was extracted from the input data X given during the fit - method. 
Given the tuple (id_sample, id_timestamp), the similarity search - will define an exclusion zone around the X_index in order to avoid matching - X with itself. If None, it is considered that the query is not extracted - from X_ (the training data). - query_length : int - Length of the queries. - exclusion_factor : float, optional - The exclusion factor is used to prevent candidates close or equal to the - query sample point to be returned as best matches. It is used to define a - region between :math:`id_timestamp - query_length//exclusion_factor` and - :math:`id_timestamp + query_length//exclusion_factor` which cannot be used - in the search. The default is 2.0. - - Raises - ------ - ValueError - If the length of the q_index iterable is not two, will raise a ValueError. - TypeError - If q_index is not an iterable, will raise a TypeError. + X: np.ndarray + The query for which we want to identify nearest neighbors in the database. + k : int, optional + Number of neighbors to return. + threshold : int, optional + A threshold on the distance to determine which candidates will be returned. Returns ------- - mask : np.ndarray, 2D array of shape (n_cases, n_timepoints - query_length + 1) - Boolean array which indicates the candidates that should be evaluated in the - similarity search. + ndarray, shape=(k,) + A numpy array of at most ``k`` elements containing the indexes of the + candidates in each motif. 
""" - if self.metadata_["unequal_length"]: - mask = List( - [ - np.ones(self.X_[i].shape[1] - query_length + 1, dtype=bool) - for i in range(self.n_cases_) - ] - ) - else: - mask = np.ones( - (self.n_cases_, self.min_timepoints_ - query_length + 1), - dtype=bool, - ) - if X_index is not None: - if isinstance(X_index, Iterable): - if len(X_index) != 2: - raise ValueError( - "The X_index should contain an interable of size 2 such as " - "(id_sample, id_timestamp), but got an iterable of " - "size {}".format(len(X_index)) - ) - else: - raise TypeError( - "If not None, the X_index parameter should be an iterable, here " - "X_index is of type {}".format(type(X_index)) - ) - - if exclusion_factor <= 0: - raise ValueError( - "The value of exclusion_factor should be superior to 0, but got " - "{}".format(len(exclusion_factor)) - ) - - i_instance, i_timestamp = X_index - profile_length = self.X_[i_instance].shape[1] - query_length + 1 - exclusion_LB = max(0, int(i_timestamp - query_length // exclusion_factor)) - exclusion_UB = min( - profile_length, - int(i_timestamp + query_length // exclusion_factor), - ) - mask[i_instance][exclusion_LB:exclusion_UB] = False - - return mask + ... @abstractmethod def _fit(self, X, y=None): ... - - @abstractmethod - def get_speedup_function_names(self): - """Return a dictionnary containing the name of the speedup functions.""" - ... 
diff --git a/aeon/similarity_search/distance_profiles/__init__.py b/aeon/similarity_search/distance_profiles/__init__.py deleted file mode 100644 index 4be73f9d8e..0000000000 --- a/aeon/similarity_search/distance_profiles/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -"""Distance profiles.""" - -__all__ = [ - "euclidean_distance_profile", - "normalised_euclidean_distance_profile", - "squared_distance_profile", - "normalised_squared_distance_profile", -] - - -from aeon.similarity_search.distance_profiles.euclidean_distance_profile import ( - euclidean_distance_profile, - normalised_euclidean_distance_profile, -) -from aeon.similarity_search.distance_profiles.squared_distance_profile import ( - normalised_squared_distance_profile, - squared_distance_profile, -) diff --git a/aeon/similarity_search/distance_profiles/euclidean_distance_profile.py b/aeon/similarity_search/distance_profiles/euclidean_distance_profile.py deleted file mode 100644 index 1dd781e467..0000000000 --- a/aeon/similarity_search/distance_profiles/euclidean_distance_profile.py +++ /dev/null @@ -1,102 +0,0 @@ -"""Optimized distance profile for euclidean distance.""" - -__maintainer__ = ["baraline"] - - -from typing import Union - -import numpy as np -from numba.typed import List - -from aeon.similarity_search.distance_profiles.squared_distance_profile import ( - normalised_squared_distance_profile, - squared_distance_profile, -) - - -def euclidean_distance_profile( - X: Union[np.ndarray, List], q: np.ndarray, mask: np.ndarray -) -> np.ndarray: - """ - Compute a distance profile using the squared Euclidean distance. - - It computes the distance profiles between the input time series and the query using - the squared Euclidean distance. The distance between the query and a candidate is - comptued using a dot product and a rolling sum to avoid recomputing parts of the - operation. - - Parameters - ---------- - X: np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) - The input samples. 
If X is an unquel length collection, expect a numba TypedList - of 2D arrays of shape (n_channels, n_timepoints) - q : np.ndarray, 2D array of shape (n_channels, query_length) - The query used for similarity search. - mask : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - query_length + 1) # noqa: E501 - Boolean mask of the shape of the distance profile indicating for which part - of it the distance should be computed. - - Returns - ------- - distance_profiles : np.ndarray - 3D array of shape (n_cases, n_timepoints - query_length + 1) - The distance profile between q and the input time series X. - - """ - distance_profiles = squared_distance_profile(X, q, mask) - # Need loop as we can return a list of np array in the unequal length case - for i in range(len(distance_profiles)): - distance_profiles[i] = distance_profiles[i] ** 0.5 - return distance_profiles - - -def normalised_euclidean_distance_profile( - X: Union[np.ndarray, List], - q: np.ndarray, - mask: np.ndarray, - X_means: Union[np.ndarray, List], - X_stds: Union[np.ndarray, List], - q_means: np.ndarray, - q_stds: np.ndarray, -) -> np.ndarray: - """ - Compute a distance profile in a brute force way. - - It computes the distance profiles between the input time series and the query using - the specified distance. The search is made in a brute force way without any - optimizations and can thus be slow. - - Parameters - ---------- - X: np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) - The input samples. If X is an unquel length collection, expect a numba TypedList - of 2D arrays of shape (n_channels, n_timepoints) - q : np.ndarray, 2D array of shape (n_channels, query_length) - The query used for similarity search. - mask : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - query_length + 1) # noqa: E501 - Boolean mask of the shape of the distance profile indicating for which part - of it the distance should be computed. 
- X_means : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - query_length + 1) # noqa: E501 - Means of each subsequences of X of size query_length. Should be a numba - TypedList if X is unequal length. - X_stds : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - query_length + 1) # noqa: E501 - Stds of each subsequences of X of size query_length. Should be a numba - TypedList if X is unequal length. - q_means : np.ndarray, 1D array of shape (n_channels) - Means of the query q - q_stds : np.ndarray, 1D array of shape (n_channels) - - Returns - ------- - distance_profiles : np.ndarray - 3D array of shape (n_cases, n_timepoints - query_length + 1) - The distance profile between q and the input time series X. - - """ - distance_profiles = normalised_squared_distance_profile( - X, q, mask, X_means, X_stds, q_means, q_stds - ) - # Need loop as we can return a list of np array in the unequal length case - for i in range(len(distance_profiles)): - distance_profiles[i] = distance_profiles[i] ** 0.5 - return distance_profiles diff --git a/aeon/similarity_search/distance_profiles/squared_distance_profile.py b/aeon/similarity_search/distance_profiles/squared_distance_profile.py deleted file mode 100644 index a42beeac2f..0000000000 --- a/aeon/similarity_search/distance_profiles/squared_distance_profile.py +++ /dev/null @@ -1,319 +0,0 @@ -"""Optimized distance profile for euclidean distance.""" - -__maintainer__ = ["baraline"] - - -from typing import Union - -import numpy as np -from numba import njit, prange -from numba.typed import List - -from aeon.similarity_search._commons import fft_sliding_dot_product -from aeon.utils.numba.general import AEON_NUMBA_STD_THRESHOLD - - -def squared_distance_profile( - X: Union[np.ndarray, List], q: np.ndarray, mask: np.ndarray -) -> np.ndarray: - """ - Compute a distance profile using the squared Euclidean distance. 
- - It computes the distance profiles between the input time series and the query using - the squared Euclidean distance. The distance between the query and a candidate is - comptued using a dot product and a rolling sum to avoid recomputing parts of the - operation. - - Parameters - ---------- - X : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) - The input samples. If X is an unquel length collection, expect a numba TypedList - 2D array of shape (n_channels, n_timepoints) - q : np.ndarray, 2D array of shape (n_channels, query_length) - The query used for similarity search. - mask : np.ndarray, 3D array of shape (n_cases, n_timepoints - query_length + 1) - Boolean mask of the shape of the distance profile indicating for which part - of it the distance should be computed. - - Returns - ------- - distance_profile : np.ndarray - 3D array of shape (n_cases, n_timepoints - query_length + 1) - The distance profile between q and the input time series X. - - """ - QX = [fft_sliding_dot_product(X[i], q) for i in range(len(X))] - if isinstance(X, np.ndarray): - QX = np.asarray(QX) - elif isinstance(X, List): - QX = List(QX) - distance_profiles = _squared_distance_profile(QX, X, q, mask) - if isinstance(X, np.ndarray): - distance_profiles = np.asarray(distance_profiles) - return distance_profiles - - -def normalised_squared_distance_profile( - X: Union[np.ndarray, List], - q: np.ndarray, - mask: np.ndarray, - X_means: np.ndarray, - X_stds: np.ndarray, - q_means: np.ndarray, - q_stds: np.ndarray, -) -> np.ndarray: - """ - Compute a distance profile in a brute force way. - - It computes the distance profiles between the input time series and the query using - the specified distance. The search is made in a brute force way without any - optimizations and can thus be slow. - - Parameters - ---------- - X : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) - The input samples. 
If X is an unquel length collection, expect a numba TypedList - 2D array of shape (n_channels, n_timepoints) - q : np.ndarray, 2D array of shape (n_channels, query_length) - The query used for similarity search. - mask : np.ndarray, 3D array of shape (n_cases, n_timepoints - query_length + 1) - Boolean mask of the shape of the distance profile indicating for which part - of it the distance should be computed. - X_means : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - query_length + 1) # noqa: E501 - Means of each subsequences of X of size query_length - X_stds : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - query_length + 1) # noqa: E501 - Stds of each subsequences of X of size query_length - q_means : np.ndarray, 1D array of shape (n_channels) - Means of the query q - q_stds : np.ndarray, 1D array of shape (n_channels) - Stds of the query q - - Returns - ------- - distance_profiles : np.ndarray - 3D array of shape (n_cases, n_timepoints - query_length + 1) - The distance profile between q and the input time series X. - - """ - query_length = q.shape[1] - QX = [fft_sliding_dot_product(X[i], q) for i in range(len(X))] - if isinstance(X, np.ndarray): - QX = np.asarray(QX) - elif isinstance(X, List): - QX = List(QX) - - distance_profiles = _normalised_squared_distance_profile( - QX, mask, X_means, X_stds, q_means, q_stds, query_length - ) - if isinstance(X, np.ndarray): - distance_profiles = np.asarray(distance_profiles) - return distance_profiles - - -@njit(cache=True, fastmath=True, parallel=True) -def _squared_distance_profile(QX, X, q, mask): - """ - Compute squared distance profiles between query subsequence and time series. - - Parameters - ---------- - QX : List of np.ndarray - List of precomputed dot products between queries and time series, with each - element corresponding to a different time series. - Shape of each array is (n_channels, n_timepoints - query_length + 1). 
- X : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) - The input samples. If X is an unquel length collection, expect a numba TypedList - 2D array of shape (n_channels, n_timepoints) - q : np.ndarray, 2D array of shape (n_channels, query_length) - The query used for similarity search. - mask : np.ndarray, 3D array of shape (n_cases, n_timepoints - query_length + 1) - Boolean mask of the shape of the distance profile indicating for which part - of it the distance should be computed. - - Returns - ------- - distance_profiles : np.ndarray - 3D array of shape (n_cases, n_timepoints - query_length + 1) - The distance profile between q and the input time series X. - - """ - distance_profiles = List() - query_length = q.shape[1] - - # Init distance profile array with unequal length support - for i_instance in range(len(X)): - profile_length = X[i_instance].shape[1] - query_length + 1 - distance_profiles.append(np.full((profile_length), np.inf)) - - for _i_instance in prange(len(QX)): - # prange cast iterator to unit64 with parallel=True - i_instance = np.int_(_i_instance) - - distance_profiles[i_instance][mask[i_instance]] = ( - _squared_dist_profile_one_series(QX[i_instance], X[i_instance], q)[ - mask[i_instance] - ] - ) - return distance_profiles - - -@njit(cache=True, fastmath=True) -def _squared_dist_profile_one_series(QT, T, Q): - """ - Compute squared distance profile between query subsequence and a single time series. - - This function calculates the squared distance profile for a single time series by - leveraging the dot product of the query and time series as well as precomputed sums - of squares to efficiently compute the squared distances. - - Parameters - ---------- - QT : np.ndarray, 2D array of shape (n_channels, n_timepoints - query_length + 1) - The dot product between the query and the time series. - T : np.ndarray, 2D array of shape (n_channels, series_length) - The series used for similarity search. 
Note that series_length can be equal, - superior or inferior to n_timepoints, it doesn't matter. - Q : np.ndarray - 2D array of shape (n_channels, query_length) representing query subsequence. - - Returns - ------- - distance_profile : np.ndarray - 2D array of shape (n_channels, n_timepoints - query_length + 1) - The squared distance profile between the query and the input time series. - """ - n_channels, profile_length = QT.shape - query_length = Q.shape[1] - _QT = -2 * QT - distance_profile = np.zeros(profile_length) - for k in prange(n_channels): - _sum = 0 - _qsum = 0 - for j in prange(query_length): - _sum += T[k, j] ** 2 - _qsum += Q[k, j] ** 2 - - distance_profile += _qsum + _QT[k] - distance_profile[0] += _sum - for i in prange(1, profile_length): - _sum += T[k, i + (query_length - 1)] ** 2 - T[k, i - 1] ** 2 - distance_profile[i] += _sum - return distance_profile - - -@njit(cache=True, fastmath=True, parallel=True) -def _normalised_squared_distance_profile( - QX, mask, X_means, X_stds, q_means, q_stds, query_length -): - """ - Compute the normalised squared distance profiles between query subsequence and input time series. - - Parameters - ---------- - QX : List of np.ndarray - List of precomputed dot products between queries and time series, with each element - corresponding to a different time series. - Shape of each array is (n_channels, n_timepoints - query_length + 1). - mask : np.ndarray, 3D array of shape (n_cases, n_timepoints - query_length + 1) - Boolean mask of the shape of the distance profile indicating for which part - of it the distance should be computed. 
- X_means : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - query_length + 1) # noqa: E501 - Means of each subsequences of X of size query_length - X_stds : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - query_length + 1) # noqa: E501 - Stds of each subsequences of X of size query_length - q_means : np.ndarray, 1D array of shape (n_channels) - Means of the query q - q_stds : np.ndarray, 1D array of shape (n_channels) - Stds of the query q - query_length : int - The length of the query subsequence used for the distance profile computation. - - Returns - ------- - List of np.ndarray - List of 2D arrays, each of shape (n_channels, n_timepoints - query_length + 1). - Each array contains the normalised squared distance profile between the query subsequence and the corresponding time series. - Entries in the array are set to infinity where the mask is False. - """ - distance_profiles = List() - Q_is_constant = q_stds <= AEON_NUMBA_STD_THRESHOLD - # Init distance profile array with unequal length support - for i_instance in range(len(QX)): - profile_length = QX[i_instance].shape[1] - distance_profiles.append(np.full((profile_length), np.inf)) - - for _i_instance in prange(len(QX)): - # prange cast iterator to unit64 with parallel=True - i_instance = np.int_(_i_instance) - - distance_profiles[i_instance][mask[i_instance]] = ( - _normalised_squared_dist_profile_one_series( - QX[i_instance], - X_means[i_instance], - X_stds[i_instance], - q_means, - q_stds, - query_length, - Q_is_constant, - )[mask[i_instance]] - ) - return distance_profiles - - -@njit(cache=True, fastmath=True) -def _normalised_squared_dist_profile_one_series( - QT, T_means, T_stds, Q_means, Q_stds, query_length, Q_is_constant -): - """ - Compute the z-normalised squared Euclidean distance profile for one time series. 
- - Parameters - ---------- - QT : np.ndarray, 2D array of shape (n_channels, n_timepoints - query_length + 1) - The dot product between the query and the time series. - T_means : np.ndarray, 1D array of length n_channels - The mean values of the time series for each channel. - - T_stds : np.ndarray, 2D array of shape (n_channels, profile_length) - The standard deviations of the time series for each channel and position. - Q_means : np.ndarray, 1D array of shape (n_channels) - Means of the query q - Q_stds : np.ndarray, 1D array of shape (n_channels) - Stds of the query q - query_length : int - The length of the query subsequence used for the distance profile computation. - Q_is_constant : np.ndarray - 1D array of shape (n_channels,) where each element is a Boolean indicating - whether the query standard deviation for that channel is less than or equal - to a specified threshold. - - Returns - ------- - np.ndarray - 2D array of shape (n_channels, n_timepoints - query_length + 1) containing the - z-normalised squared distance profile between the query subsequence and the time - series. Entries are computed based on the z-normalised values, with special - handling for constant values. 
- """ - n_channels, profile_length = QT.shape - distance_profile = np.zeros(profile_length) - - for i in prange(profile_length): - Sub_is_constant = T_stds[:, i] <= AEON_NUMBA_STD_THRESHOLD - for k in prange(n_channels): - # Two Constant case - if Q_is_constant[k] and Sub_is_constant[k]: - _val = 0 - # One Constant case - elif Q_is_constant[k] or Sub_is_constant[k]: - _val = query_length - else: - denom = query_length * Q_stds[k] * T_stds[k, i] - - p = (QT[k, i] - query_length * (Q_means[k] * T_means[k, i])) / denom - p = min(p, 1.0) - - _val = abs(2 * query_length * (1.0 - p)) - distance_profile[i] += _val - - return distance_profile diff --git a/aeon/similarity_search/distance_profiles/tests/__init__.py b/aeon/similarity_search/distance_profiles/tests/__init__.py deleted file mode 100644 index 566dda7367..0000000000 --- a/aeon/similarity_search/distance_profiles/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for distance profiles.""" diff --git a/aeon/similarity_search/distance_profiles/tests/test_euclidean_distance.py b/aeon/similarity_search/distance_profiles/tests/test_euclidean_distance.py deleted file mode 100644 index 2eafff78bb..0000000000 --- a/aeon/similarity_search/distance_profiles/tests/test_euclidean_distance.py +++ /dev/null @@ -1,208 +0,0 @@ -"""Tests for naive Euclidean distance profile.""" - -__maintainer__ = [] - - -import numpy as np -import pytest -from numba.typed import List -from numpy.testing import assert_array_almost_equal, assert_array_equal - -from aeon.similarity_search._commons import naive_squared_distance_profile -from aeon.similarity_search.distance_profiles.euclidean_distance_profile import ( - euclidean_distance_profile, - normalised_euclidean_distance_profile, -) -from aeon.utils.numba.general import sliding_mean_std_one_series - -DATATYPES = ["float64", "int64"] - - -@pytest.mark.parametrize("dtype", DATATYPES) -def test_euclidean_distance(dtype): - """Test Euclidean distance.""" - X = np.asarray( - [[[1, 2, 3, 4, 5, 
6, 7, 8]], [[1, 2, 4, 4, 5, 6, 5, 4]]], dtype=dtype - ) - q = np.asarray([[3, 4, 5]], dtype=dtype) - - mask = np.ones((X.shape[0], X.shape[2] - q.shape[1] + 1), dtype=bool) - expected = [T**0.5 for T in naive_squared_distance_profile(X, q, mask)] - dist_profile = euclidean_distance_profile(X, q, mask) - - assert_array_almost_equal(dist_profile, expected) - - -@pytest.mark.parametrize("dtype", DATATYPES) -def test_euclidean_constant_case(dtype): - """Test Euclidean distance profile calculation.""" - X = np.ones((2, 1, 10), dtype=dtype) - q = np.zeros((1, 3), dtype=dtype) - - mask = np.ones((X.shape[0], X.shape[2] - q.shape[1] + 1), dtype=bool) - expected = [T**0.5 for T in naive_squared_distance_profile(X, q, mask)] - dist_profile = euclidean_distance_profile(X, q, mask) - - assert_array_almost_equal(dist_profile, expected) - - -def test_non_alteration_of_inputs_euclidean(): - """Test if input is altered during Euclidean distance profile.""" - X = np.asarray([[[1, 2, 3, 4, 5, 6, 7, 8]], [[1, 2, 4, 4, 5, 6, 5, 4]]]) - X_copy = np.copy(X) - q = np.asarray([[3, 4, 5]]) - q_copy = np.copy(q) - - mask = np.ones((X.shape[0], X.shape[2] - q.shape[1] + 1), dtype=bool) - _ = euclidean_distance_profile(X, q, mask) - assert_array_equal(q, q_copy) - assert_array_equal(X, X_copy) - - -@pytest.mark.parametrize("dtype", DATATYPES) -def test_normalised_euclidean_distance(dtype): - """Test normalised Euclidean distance profile calculation.""" - X = np.asarray( - [[[1, 2, 3, 4, 5, 6, 7, 8]], [[1, 2, 4, 4, 5, 6, 5, 4]]], dtype=dtype - ) - q = np.asarray([[3, 4, 5]], dtype=dtype) - - search_space_size = X.shape[-1] - q.shape[-1] + 1 - - X_means = np.zeros((X.shape[0], X.shape[1], search_space_size)) - X_stds = np.zeros((X.shape[0], X.shape[1], search_space_size)) - - for i in range(X.shape[0]): - _mean, _std = sliding_mean_std_one_series(X[i], q.shape[-1], 1) - X_stds[i] = _std - X_means[i] = _mean - - q_means = q.mean(axis=-1) - q_stds = q.std(axis=-1) - mask = np.ones((X.shape[0], 
X.shape[2] - q.shape[1] + 1), dtype=bool) - - dist_profile = normalised_euclidean_distance_profile( - X, q, mask, X_means, X_stds, q_means, q_stds - ) - expected = [ - T**0.5 for T in naive_squared_distance_profile(X, q, mask, normalise=True) - ] - - assert_array_almost_equal(dist_profile, expected) - - -@pytest.mark.parametrize("dtype", DATATYPES) -def test_normalised_euclidean_distance_unequal_length(dtype): - """Test normalised Euclidean distance profile calculation.""" - X = List( - [ - np.array([[1, 2, 3, 4, 5, 6, 7, 8]], dtype=dtype), - np.array([[1, 2, 4, 4, 5, 6]], dtype=dtype), - ] - ) - q = np.asarray([[3, 4, 5]], dtype=dtype) - - X_means = List() - X_stds = List() - - for i in range(len(X)): - _mean, _std = sliding_mean_std_one_series(X[i], q.shape[-1], 1) - X_stds.append(_std) - X_means.append(_mean) - - q_means = q.mean(axis=-1) - q_stds = q.std(axis=-1) - mask = List( - [np.ones(X[i].shape[1] - q.shape[1] + 1, dtype=bool) for i in range(len(X))] - ) - - dist_profile = normalised_euclidean_distance_profile( - X, q, mask, X_means, X_stds, q_means, q_stds - ) - expected = [ - T**0.5 - for T in naive_squared_distance_profile( - X, q, mask, normalise=True, X_means=X_means, X_stds=X_stds - ) - ] - for i in range(len(X)): - assert_array_almost_equal(dist_profile[i], expected[i]) - - -@pytest.mark.parametrize("dtype", DATATYPES) -def test_euclidean_distance_unequal_length(dtype): - """Test normalised Euclidean distance profile calculation.""" - X = List( - [ - np.array([[1, 2, 3, 4, 5, 6, 7, 8]], dtype=dtype), - np.array([[1, 2, 4, 4, 5, 6]], dtype=dtype), - ] - ) - q = np.asarray([[3, 4, 5]], dtype=dtype) - - mask = List( - [np.ones(X[i].shape[1] - q.shape[1] + 1, dtype=bool) for i in range(len(X))] - ) - expected = [T**0.5 for T in naive_squared_distance_profile(X, q, mask)] - dist_profile = euclidean_distance_profile(X, q, mask) - for i in range(len(X)): - assert_array_almost_equal(dist_profile[i], expected[i]) - - -@pytest.mark.parametrize("dtype", 
DATATYPES) -def test_normalised_euclidean_constant_case(dtype): - """Test normalised Euclidean distance profile calculation.""" - X = np.ones((2, 2, 10), dtype=dtype) - q = np.zeros((2, 3), dtype=dtype) - - search_space_size = X.shape[-1] - q.shape[-1] + 1 - - q_means = q.mean(axis=-1) - q_stds = q.std(axis=-1) - - X_means = np.zeros((X.shape[0], X.shape[1], search_space_size)) - X_stds = np.zeros((X.shape[0], X.shape[1], search_space_size)) - for i in range(X.shape[0]): - _mean, _std = sliding_mean_std_one_series(X[i], q.shape[-1], 1) - X_stds[i] = _std - X_means[i] = _mean - - mask = np.ones((X.shape[0], X.shape[2] - q.shape[1] + 1), dtype=bool) - - dist_profile = normalised_euclidean_distance_profile( - X, q, mask, X_means, X_stds, q_means, q_stds - ) - expected = [ - T**0.5 for T in naive_squared_distance_profile(X, q, mask, normalise=True) - ] - - assert_array_almost_equal(dist_profile, expected) - - -def test_non_alteration_of_inputs_normalised_euclidean(): - """Test if input is altered during normalised Euclidean distance profile.""" - X = np.asarray([[[1, 2, 3, 4, 5, 6, 7, 8]], [[1, 2, 4, 4, 5, 6, 5, 4]]]) - X_copy = np.copy(X) - q = np.asarray([[3, 4, 5]]) - q_copy = np.copy(q) - - search_space_size = X.shape[-1] - q.shape[-1] + 1 - - X_means = np.zeros((X.shape[0], X.shape[1], search_space_size)) - X_stds = np.zeros((X.shape[0], X.shape[1], search_space_size)) - - for i in range(X.shape[0]): - _mean, _std = sliding_mean_std_one_series(X[i], q.shape[-1], 1) - X_stds[i] = _std - X_means[i] = _mean - - q_means = q.mean(axis=-1) - q_stds = q.std(axis=-1) - - mask = np.ones((X.shape[0], X.shape[2] - q.shape[1] + 1), dtype=bool) - _ = normalised_euclidean_distance_profile( - X, q, mask, X_means, X_stds, q_means, q_stds - ) - - assert_array_equal(q, q_copy) - assert_array_equal(X, X_copy) diff --git a/aeon/similarity_search/distance_profiles/tests/test_squared_distance.py b/aeon/similarity_search/distance_profiles/tests/test_squared_distance.py deleted file mode 
100644 index cdb7b35cbc..0000000000 --- a/aeon/similarity_search/distance_profiles/tests/test_squared_distance.py +++ /dev/null @@ -1,200 +0,0 @@ -"""Tests for naive Euclidean distance profile.""" - -__maintainer__ = [] - - -import numpy as np -import pytest -from numba.typed import List -from numpy.testing import assert_array_almost_equal, assert_array_equal - -from aeon.similarity_search._commons import naive_squared_distance_profile -from aeon.similarity_search.distance_profiles.squared_distance_profile import ( - normalised_squared_distance_profile, - squared_distance_profile, -) -from aeon.utils.numba.general import sliding_mean_std_one_series - -DATATYPES = ["float64", "int64"] - - -@pytest.mark.parametrize("dtype", DATATYPES) -def test_euclidean_distance(dtype): - """Test Euclidean distance.""" - X = np.asarray( - [[[1, 2, 3, 4, 5, 6, 7, 8]], [[1, 2, 4, 4, 5, 6, 5, 4]]], dtype=dtype - ) - q = np.asarray([[3, 4, 5]], dtype=dtype) - - mask = np.ones((X.shape[0], X.shape[2] - q.shape[1] + 1), dtype=bool) - expected = naive_squared_distance_profile(X, q, mask) - dist_profile = squared_distance_profile(X, q, mask) - - assert_array_almost_equal(dist_profile, expected) - - -@pytest.mark.parametrize("dtype", DATATYPES) -def test_euclidean_constant_case(dtype): - """Test Euclidean distance profile calculation.""" - X = np.ones((2, 1, 10), dtype=dtype) - q = np.zeros((1, 3), dtype=dtype) - - mask = np.ones((X.shape[0], X.shape[2] - q.shape[1] + 1), dtype=bool) - expected = naive_squared_distance_profile(X, q, mask) - dist_profile = squared_distance_profile(X, q, mask) - - assert_array_almost_equal(dist_profile, expected) - - -def test_non_alteration_of_inputs_euclidean(): - """Test if input is altered during Euclidean distance profile.""" - X = np.asarray([[[1, 2, 3, 4, 5, 6, 7, 8]], [[1, 2, 4, 4, 5, 6, 5, 4]]]) - X_copy = np.copy(X) - q = np.asarray([[3, 4, 5]]) - q_copy = np.copy(q) - - mask = np.ones((X.shape[0], X.shape[2] - q.shape[1] + 1), dtype=bool) - _ = 
squared_distance_profile(X, q, mask) - assert_array_equal(q, q_copy) - assert_array_equal(X, X_copy) - - -@pytest.mark.parametrize("dtype", DATATYPES) -def test_normalised_euclidean_distance(dtype): - """Test normalised Euclidean distance profile calculation.""" - X = np.asarray( - [[[1, 2, 3, 4, 5, 6, 7, 8]], [[1, 2, 4, 4, 5, 6, 5, 4]]], dtype=dtype - ) - q = np.asarray([[3, 4, 5]], dtype=dtype) - - search_space_size = X.shape[-1] - q.shape[-1] + 1 - - X_means = np.zeros((X.shape[0], X.shape[1], search_space_size)) - X_stds = np.zeros((X.shape[0], X.shape[1], search_space_size)) - - for i in range(X.shape[0]): - _mean, _std = sliding_mean_std_one_series(X[i], q.shape[-1], 1) - X_stds[i] = _std - X_means[i] = _mean - - q_means = q.mean(axis=-1) - q_stds = q.std(axis=-1) - mask = np.ones((X.shape[0], X.shape[2] - q.shape[1] + 1), dtype=bool) - - dist_profile = normalised_squared_distance_profile( - X, q, mask, X_means, X_stds, q_means, q_stds - ) - expected = naive_squared_distance_profile(X, q, mask, normalise=True) - - assert_array_almost_equal(dist_profile, expected) - - -@pytest.mark.parametrize("dtype", DATATYPES) -def test_normalised_euclidean_distance_unequal_length(dtype): - """Test normalised Euclidean distance profile calculation.""" - X = List( - [ - np.array([[1, 2, 3, 4, 5, 6, 7, 8]], dtype=dtype), - np.array([[1, 2, 4, 4, 5, 6]], dtype=dtype), - ] - ) - q = np.asarray([[3, 4, 5]], dtype=dtype) - - X_means = List() - X_stds = List() - - for i in range(len(X)): - _mean, _std = sliding_mean_std_one_series(X[i], q.shape[-1], 1) - X_stds.append(_std) - X_means.append(_mean) - - q_means = q.mean(axis=-1) - q_stds = q.std(axis=-1) - mask = List( - [np.ones(X[i].shape[1] - q.shape[1] + 1, dtype=bool) for i in range(len(X))] - ) - - dist_profile = normalised_squared_distance_profile( - X, q, mask, X_means, X_stds, q_means, q_stds - ) - expected = naive_squared_distance_profile(X, q, mask, normalise=True) - for i in range(len(X)): - 
assert_array_almost_equal(dist_profile[i], expected[i]) - - -@pytest.mark.parametrize("dtype", DATATYPES) -def test_euclidean_distance_unequal_length(dtype): - """Test normalised Euclidean distance profile calculation.""" - X = List( - [ - np.array([[1, 2, 3, 4, 5, 6, 7, 8]], dtype=dtype), - np.array([[1, 2, 4, 4, 5, 6]], dtype=dtype), - ] - ) - q = np.asarray([[3, 4, 5]], dtype=dtype) - - mask = List( - [np.ones(X[i].shape[1] - q.shape[1] + 1, dtype=bool) for i in range(len(X))] - ) - - expected = naive_squared_distance_profile(X, q, mask) - dist_profile = squared_distance_profile(X, q, mask) - for i in range(len(X)): - assert_array_almost_equal(dist_profile[i], expected[i]) - - -@pytest.mark.parametrize("dtype", DATATYPES) -def test_normalised_euclidean_constant_case(dtype): - """Test normalised Euclidean distance profile calculation.""" - X = np.ones((2, 2, 10), dtype=dtype) - q = np.zeros((2, 3), dtype=dtype) - - search_space_size = X.shape[-1] - q.shape[-1] + 1 - - q_means = q.mean(axis=-1) - q_stds = q.std(axis=-1) - - X_means = np.zeros((X.shape[0], X.shape[1], search_space_size)) - X_stds = np.zeros((X.shape[0], X.shape[1], search_space_size)) - for i in range(X.shape[0]): - _mean, _std = sliding_mean_std_one_series(X[i], q.shape[-1], 1) - X_stds[i] = _std - X_means[i] = _mean - - mask = np.ones((X.shape[0], X.shape[2] - q.shape[1] + 1), dtype=bool) - - dist_profile = normalised_squared_distance_profile( - X, q, mask, X_means, X_stds, q_means, q_stds - ) - expected = naive_squared_distance_profile(X, q, mask, normalise=True) - - assert_array_almost_equal(dist_profile, expected) - - -def test_non_alteration_of_inputs_normalised_euclidean(): - """Test if input is altered during normalised Euclidean distance profile.""" - X = np.asarray([[[1, 2, 3, 4, 5, 6, 7, 8]], [[1, 2, 4, 4, 5, 6, 5, 4]]]) - X_copy = np.copy(X) - q = np.asarray([[3, 4, 5]]) - q_copy = np.copy(q) - - search_space_size = X.shape[-1] - q.shape[-1] + 1 - - X_means = np.zeros((X.shape[0], 
X.shape[1], search_space_size)) - X_stds = np.zeros((X.shape[0], X.shape[1], search_space_size)) - - for i in range(X.shape[0]): - _mean, _std = sliding_mean_std_one_series(X[i], q.shape[-1], 1) - X_stds[i] = _std - X_means[i] = _mean - - q_means = q.mean(axis=-1) - q_stds = q.std(axis=-1) - - mask = np.ones((X.shape[0], X.shape[2] - q.shape[1] + 1), dtype=bool) - _ = normalised_squared_distance_profile( - X, q, mask, X_means, X_stds, q_means, q_stds - ) - - assert_array_equal(q, q_copy) - assert_array_equal(X, X_copy) diff --git a/aeon/similarity_search/matrix_profiles/__init__.py b/aeon/similarity_search/matrix_profiles/__init__.py deleted file mode 100644 index d04f1cbfd3..0000000000 --- a/aeon/similarity_search/matrix_profiles/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -"""Distance profiles.""" - -__all__ = [ - "stomp_normalised_euclidean_matrix_profile", - "stomp_euclidean_matrix_profile", - "stomp_normalised_squared_matrix_profile", - "stomp_squared_matrix_profile", -] -from aeon.similarity_search.matrix_profiles.stomp import ( - stomp_euclidean_matrix_profile, - stomp_normalised_euclidean_matrix_profile, - stomp_normalised_squared_matrix_profile, - stomp_squared_matrix_profile, -) diff --git a/aeon/similarity_search/matrix_profiles/stomp.py b/aeon/similarity_search/matrix_profiles/stomp.py deleted file mode 100644 index 509e68ad49..0000000000 --- a/aeon/similarity_search/matrix_profiles/stomp.py +++ /dev/null @@ -1,633 +0,0 @@ -"""Implementation of stomp for euclidean and squared euclidean distance profile.""" - -from typing import Optional - -__maintainer__ = ["baraline"] - - -from typing import Union - -import numpy as np -from numba import njit -from numba.typed import List - -from aeon.similarity_search._commons import ( - extract_top_k_and_threshold_from_distance_profiles_one_series, - get_ith_products, - numba_roll_1D_no_warparound, -) -from aeon.similarity_search.distance_profiles.squared_distance_profile import ( - 
_normalised_squared_dist_profile_one_series, - _squared_dist_profile_one_series, -) -from aeon.utils.numba.general import AEON_NUMBA_STD_THRESHOLD - - -def stomp_euclidean_matrix_profile( - X: Union[np.ndarray, List], - T: np.ndarray, - L: int, - mask: np.ndarray, - k: int = 1, - threshold: float = np.inf, - inverse_distance: bool = False, - exclusion_size: Optional[int] = None, -): - """ - Compute a euclidean euclidean matrix profile using STOMP [1]_. - - This improves on the naive matrix profile by updating the dot products for each - sucessive query in T instead of recomputing them. - - Parameters - ---------- - X: np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) - The input samples. If X is an unquel length collection, expect a TypedList - of 2D arrays of shape (n_channels, n_timepoints) - T : np.ndarray, 2D array of shape (n_channels, series_length) - The series used for similarity search. Note that series_length can be equal, - superior or inferior to n_timepoints, it doesn't matter. - L : int - The length of the subsequences considered during the search. This parameter - cannot be larger than n_timepoints and series_length. - mask : np.ndarray, 2D array of shape (n_cases, n_timepoints - length + 1) - Boolean mask of the shape of the distance profiles indicating for which part - of it the distance should be computed. In this context, it is the mask for the - first query of size L in T. This mask will be updated during the algorithm. - k : int, default=1 - The number of best matches to return during predict for each subsequence. - threshold : float, default=np.inf - The number of best matches to return during predict for each subsequence. - inverse_distance : bool, default=False - If True, the matching will be made on the inverse of the distance, and thus, the - worst matches to the query will be returned instead of the best ones. 
- exclusion_size : int, optional - The size of the exclusion zone used to prevent returning as top k candidates - the ones that are close to each other (for example i and i+1). - It is used to define a region between - :math:`id_timestomp - exclusion_size` and - :math:`id_timestomp + exclusion_size` which cannot be returned - as best match if :math:`id_timestomp` was already selected. By default, - the value None means that this is not used. - - References - ---------- - .. [1] Matrix Profile II: Exploiting a Novel Algorithm and GPUs to break the one - Hundred Million Barrier for Time Series Motifs and Joins. Yan Zhu, Zachary - Zimmerman, Nader Shakibay Senobari, Chin-Chia Michael Yeh, Gareth Funning, Abdullah - Mueen, Philip Berisk and Eamonn Keogh. IEEE ICDM 2016 - - Returns - ------- - Tuple(ndarray, ndarray) - The first array, of shape ``(series_length - length + 1, n_matches)``, - contains the distance between all the queries of size length and their best - matches in X_. The second array, of shape - ``(series_length - L + 1, n_matches, 2)``, contains the indexes of these - matches as ``(id_sample, id_timepoint)``. The corresponding match can be - retrieved as ``X_[id_sample, :, id_timepoint : id_timepoint + length]``. - - """ - MP, IP = stomp_squared_matrix_profile( - X, - T, - L, - mask, - k=k, - threshold=threshold, - exclusion_size=exclusion_size, - inverse_distance=inverse_distance, - ) - for i in range(len(MP)): - MP[i] = MP[i] ** 0.5 - return MP, IP - - -def stomp_squared_matrix_profile( - X: Union[np.ndarray, List], - T: np.ndarray, - L: int, - mask: np.ndarray, - k: int = 1, - threshold: float = np.inf, - inverse_distance: bool = False, - exclusion_size: Optional[int] = None, -): - """ - Compute a squared euclidean matrix profile using STOMP [1]_. - - This improves on the naive matrix profile by updating the dot products for each - sucessive query in T instead of recomputing them. 
- - Parameters - ---------- - X: np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) - The input samples. If X is an unquel length collection, expect a TypedList - of 2D arrays of shape (n_channels, n_timepoints) - T : np.ndarray, 2D array of shape (n_channels, series_length) - The series used for similarity search. Note that series_length can be equal, - superior or inferior to n_timepoints, it doesn't matter. - L : int - The length of the subsequences considered during the search. This parameter - cannot be larger than n_timepoints and series_length. - mask : np.ndarray, 2D array of shape (n_cases, n_timepoints - length + 1) - Boolean mask of the shape of the distance profiles indicating for which part - of it the distance should be computed. In this context, it is the mask for the - first query of size L in T. This mask will be updated during the algorithm. - k : int, default=1 - The number of best matches to return during predict for each subsequence. - threshold : float, default=np.inf - The number of best matches to return during predict for each subsequence. - inverse_distance : bool, default=False - If True, the matching will be made on the inverse of the distance, and thus, the - worst matches to the query will be returned instead of the best ones. - exclusion_size : int, optional - The size of the exclusion zone used to prevent returning as top k candidates - the ones that are close to each other (for example i and i+1). - It is used to define a region between - :math:`id_timestomp - exclusion_size` and - :math:`id_timestomp + exclusion_size` which cannot be returned - as best match if :math:`id_timestomp` was already selected. By default, - the value None means that this is not used. - - References - ---------- - .. [1] Matrix Profile II: Exploiting a Novel Algorithm and GPUs to break the one - Hundred Million Barrier for Time Series Motifs and Joins. 
Yan Zhu, Zachary - Zimmerman, Nader Shakibay Senobari, Chin-Chia Michael Yeh, Gareth Funning, Abdullah - Mueen, Philip Berisk and Eamonn Keogh. IEEE ICDM 2016 - - Returns - ------- - Tuple(ndarray, ndarray) - The first array, of shape ``(series_length - length + 1, n_matches)``, - contains the distance between all the queries of size length and their best - matches in X_. The second array, of shape - ``(series_length - L + 1, n_matches, 2)``, contains the indexes of these - matches as ``(id_sample, id_timepoint)``. The corresponding match can be - retrieved as ``X_[id_sample, :, id_timepoint : id_timepoint + length]``. - - """ - XdotT = [get_ith_products(X[i], T, L, 0) for i in range(len(X))] - if isinstance(X, np.ndarray): - XdotT = np.asarray(XdotT) - elif isinstance(X, List): - XdotT = List(XdotT) - - MP, IP = _stomp( - X, - T, - XdotT, - L, - mask, - k, - threshold, - exclusion_size, - inverse_distance, - ) - return MP, IP - - -def stomp_normalised_euclidean_matrix_profile( - X: Union[np.ndarray, List], - T: np.ndarray, - L: int, - X_means: Union[np.ndarray, List], - X_stds: Union[np.ndarray, List], - T_means: np.ndarray, - T_stds: np.ndarray, - mask: np.ndarray, - k: int = 1, - threshold: float = np.inf, - inverse_distance: bool = False, - exclusion_size: Optional[int] = None, -): - """ - Compute a euclidean matrix profile using STOMP [1]_. - - This improves on the naive matrix profile by updating the dot products for each - sucessive query in T instead of recomputing them. - - Parameters - ---------- - X: np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) - The input samples. If X is an unquel length collection, expect a TypedList - of 2D arrays of shape (n_channels, n_timepoints) - T : np.ndarray, 2D array of shape (n_channels, series_length) - The series used for similarity search. Note that series_length can be equal, - superior or inferior to n_timepoints, it doesn't matter. 
- L : int - The length of the subsequences considered during the search. This parameter - cannot be larger than n_timepoints and series_length. - X_means : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1) - Means of each subsequences of X of size L. Should be a numba TypedList if X is - unequal length. - X_stds : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1) - Stds of each subsequences of X of size L. Should be a numba TypedList if X is - unequal length. - T_means : np.ndarray, 2D array of shape (n_channels, n_timepoints - L + 1) - Means of each subsequences of T of size L. - T_stds : np.ndarray, 2D array of shape (n_channels, n_timepoints - L + 1) - Stds of each subsequences of T of size L. - mask : np.ndarray, 2D array of shape (n_cases, n_timepoints - length + 1) - Boolean mask of the shape of the distance profiles indicating for which part - of it the distance should be computed. In this context, it is the mask for the - first query of size L in T. This mask will be updated during the algorithm. - k : int, default=1 - The number of best matches to return during predict for each subsequence. - threshold : float, default=np.inf - The number of best matches to return during predict for each subsequence. - inverse_distance : bool, default=False - If True, the matching will be made on the inverse of the distance, and thus, the - worst matches to the query will be returned instead of the best ones. - exclusion_size : int, optional - The size of the exclusion zone used to prevent returning as top k candidates - the ones that are close to each other (for example i and i+1). - It is used to define a region between - :math:`id_timestomp - exclusion_size` and - :math:`id_timestomp + exclusion_size` which cannot be returned - as best match if :math:`id_timestomp` was already selected. By default, - the value None means that this is not used. - - References - ---------- - .. 
[1] Matrix Profile II: Exploiting a Novel Algorithm and GPUs to break the one - Hundred Million Barrier for Time Series Motifs and Joins. Yan Zhu, Zachary - Zimmerman, Nader Shakibay Senobari, Chin-Chia Michael Yeh, Gareth Funning, Abdullah - Mueen, Philip Berisk and Eamonn Keogh. IEEE ICDM 2016 - - Returns - ------- - Tuple(ndarray, ndarray) - The first array, of shape ``(series_length - length + 1, n_matches)``, - contains the distance between all the queries of size length and their best - matches in X_. The second array, of shape - ``(series_length - L + 1, n_matches, 2)``, contains the indexes of these - matches as ``(id_sample, id_timepoint)``. The corresponding match can be - retrieved as ``X_[id_sample, :, id_timepoint : id_timepoint + length]``. - - """ - MP, IP = stomp_normalised_squared_matrix_profile( - X, - T, - L, - X_means, - X_stds, - T_means, - T_stds, - mask, - k=k, - threshold=threshold, - exclusion_size=exclusion_size, - inverse_distance=inverse_distance, - ) - for i in range(len(MP)): - MP[i] = MP[i] ** 0.5 - return MP, IP - - -def stomp_normalised_squared_matrix_profile( - X: Union[np.ndarray, List], - T: np.ndarray, - L: int, - X_means: Union[np.ndarray, List], - X_stds: Union[np.ndarray, List], - T_means: np.ndarray, - T_stds: np.ndarray, - mask: np.ndarray, - k: int = 1, - threshold: float = np.inf, - inverse_distance: bool = False, - exclusion_size: Optional[int] = None, -): - """ - Compute a squared euclidean matrix profile using STOMP [1]_. - - This improves on the naive matrix profile by updating the dot products for each - sucessive query in T instead of recomputing them. - - Parameters - ---------- - X: np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) - The input samples. If X is an unquel length collection, expect a TypedList - of 2D arrays of shape (n_channels, n_timepoints) - T : np.ndarray, 2D array of shape (n_channels, series_length) - The series used for similarity search. 
Note that series_length can be equal, - superior or inferior to n_timepoints, it doesn't matter. - L : int - The length of the subsequences considered during the search. This parameter - cannot be larger than n_timepoints and series_length. - X_means : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1) - Means of each subsequences of X of size L. Should be a numba TypedList if X is - unequal length. - X_stds : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1) - Stds of each subsequences of X of size L. Should be a numba TypedList if X is - unequal length. - T_means : np.ndarray, 2D array of shape (n_channels, n_timepoints - L + 1) - Means of each subsequences of T of size L. - T_stds : np.ndarray, 2D array of shape (n_channels, n_timepoints - L + 1) - Stds of each subsequences of T of size L. - mask : np.ndarray, 2D array of shape (n_cases, n_timepoints - length + 1) - Boolean mask of the shape of the distance profiles indicating for which part - of it the distance should be computed. In this context, it is the mask for the - first query of size L in T. This mask will be updated during the algorithm. - k : int, default=1 - The number of best matches to return during predict for each subsequence. - threshold : float, default=np.inf - The number of best matches to return during predict for each subsequence. - inverse_distance : bool, default=False - If True, the matching will be made on the inverse of the distance, and thus, the - worst matches to the query will be returned instead of the best ones. - exclusion_size : int, optional - The size of the exclusion zone used to prevent returning as top k candidates - the ones that are close to each other (for example i and i+1). - It is used to define a region between - :math:`id_timestomp - exclusion_size` and - :math:`id_timestomp + exclusion_size` which cannot be returned - as best match if :math:`id_timestomp` was already selected. 
By default, - the value None means that this is not used. - - References - ---------- - .. [1] Matrix Profile II: Exploiting a Novel Algorithm and GPUs to break the one - Hundred Million Barrier for Time Series Motifs and Joins. Yan Zhu, Zachary - Zimmerman, Nader Shakibay Senobari, Chin-Chia Michael Yeh, Gareth Funning, Abdullah - Mueen, Philip Berisk and Eamonn Keogh. IEEE ICDM 2016 - - Returns - ------- - Tuple(ndarray, ndarray) - The first array, of shape ``(series_length - length + 1, n_matches)``, - contains the distance between all the queries of size length and their best - matches in X_. The second array, of shape - ``(series_length - L + 1, n_matches, 2)``, contains the indexes of these - matches as ``(id_sample, id_timepoint)``. The corresponding match can be - retrieved as ``X_[id_sample, :, id_timepoint : id_timepoint + length]``. - - """ - XdotT = [get_ith_products(X[i], T, L, 0) for i in range(len(X))] - if isinstance(X, np.ndarray): - XdotT = np.asarray(XdotT) - elif isinstance(X, List): - XdotT = List(XdotT) - - MP, IP = _stomp_normalised( - X, - T, - XdotT, - X_means, - X_stds, - T_means, - T_stds, - L, - mask, - k, - threshold, - exclusion_size, - inverse_distance, - ) - return MP, IP - - -def _stomp_normalised( - X, - T, - XdotT, - X_means, - X_stds, - T_means, - T_stds, - L, - mask, - k, - threshold, - exclusion_size, - inverse_distance, -): - """ - Compute the Matrix Profile using the STOMP algorithm with normalised distances. - - X: np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) - The input samples. If X is an unquel length collection, expect a TypedList - of 2D arrays of shape (n_channels, n_timepoints) - T : np.ndarray, 2D array of shape (n_channels, series_length) - The series used for similarity search. Note that series_length can be equal, - superior or inferior to n_timepoints, it doesn't matter. - L : int - Length of the subsequences used for the distance computation. 
- XdotT : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1) - Precomputed dot products between each time series in X and the query series T. - X_means : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1) - Means of each subsequences of X of size L. Should be a numba TypedList if X is - unequal length. - X_stds : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1) - Stds of each subsequences of X of size L. Should be a numba TypedList if X is - unequal length. - T_means : np.ndarray, 2D array of shape (n_channels, n_timepoints - L + 1) - Means of each subsequences of T of size L. - T_stds : np.ndarray, 2D array of shape (n_channels, n_timepoints - L + 1) - Stds of each subsequences of T of size L. - mask : np.ndarray, 2D array of shape (n_cases, n_timepoints - length + 1) - Boolean mask of the shape of the distance profiles indicating for which part - of it the distance should be computed. In this context, it is the mask for the - first query of size L in T. This mask will be updated during the algorithm. - k : int, default=1 - The number of best matches to return during predict for each subsequence. - threshold : float, default=np.inf - The number of best matches to return during predict for each subsequence. - inverse_distance : bool, default=False - If True, the matching will be made on the inverse of the distance, and thus, the - worst matches to the query will be returned instead of the best ones. - exclusion_size : int, optional - The size of the exclusion zone used to prevent returning as top k candidates - the ones that are close to each other (for example i and i+1). - It is used to define a region between - :math:`id_timestomp - exclusion_size` and - :math:`id_timestomp + exclusion_size` which cannot be returned - as best match if :math:`id_timestomp` was already selected. By default, - the value None means that this is not used. 
- - Returns - ------- - tuple of np.ndarray - - MP : array of shape (n_queries,) - Matrix profile distances for each query subsequence. - - IP : array of shape (n_queries,) - Indexes of the top matches for each query subsequence. - """ - n_queries = T.shape[1] - L + 1 - MP = np.empty(n_queries, dtype=object) - IP = np.empty(n_queries, dtype=object) - for i_x in range(len(X)): - for i in range(n_queries): - dist_profiles = _normalised_squared_dist_profile_one_series( - XdotT[i_x], - X_means[i_x], - X_stds[i_x], - T_means[:, i], - T_stds[:, i], - L, - T_stds[:, i] <= AEON_NUMBA_STD_THRESHOLD, - ) - dist_profiles[~mask[i_x]] = np.inf - if i + 1 < n_queries: - XdotT[i_x] = _update_dot_products_one_series( - X[i_x], T, XdotT[i_x], L, i + 1 - ) - - mask[i_x] = numba_roll_1D_no_warparound(mask[i_x], 1, True) - ( - top_dists, - top_indexes, - ) = extract_top_k_and_threshold_from_distance_profiles_one_series( - dist_profiles, - i_x, - k=k, - threshold=threshold, - exclusion_size=exclusion_size, - inverse_distance=inverse_distance, - ) - if i_x > 0: - top_dists, top_indexes = _sort_out_tops( - top_dists, MP[i], top_indexes, IP[i], k - ) - MP[i] = top_dists - IP[i] = top_indexes - else: - MP[i] = top_dists - IP[i] = top_indexes - - return MP, IP - - -def _stomp( - X, - T, - XdotT, - L, - mask, - k, - threshold, - exclusion_size, - inverse_distance, -): - n_queries = T.shape[1] - L + 1 - MP = np.empty(n_queries, dtype=object) - IP = np.empty(n_queries, dtype=object) - for i_x in range(len(X)): - for i in range(n_queries): - Q = T[:, i : i + L] - dist_profiles = _squared_dist_profile_one_series(XdotT[i_x], X[i_x], Q) - dist_profiles[~mask[i_x]] = np.inf - if i + 1 < n_queries: - XdotT[i_x] = _update_dot_products_one_series( - X[i_x], T, XdotT[i_x], L, i + 1 - ) - - mask[i_x] = numba_roll_1D_no_warparound(mask[i_x], 1, True) - ( - top_dists, - top_indexes, - ) = extract_top_k_and_threshold_from_distance_profiles_one_series( - dist_profiles, - i_x, - k=k, - threshold=threshold, - 
exclusion_size=exclusion_size, - inverse_distance=inverse_distance, - ) - if i_x > 0: - top_dists, top_indexes = _sort_out_tops( - top_dists, MP[i], top_indexes, IP[i], k - ) - MP[i] = top_dists - IP[i] = top_indexes - else: - MP[i] = top_dists - IP[i] = top_indexes - - return MP, IP - - -def _sort_out_tops(top_dists, prev_top_dists, top_indexes, prev_to_indexes, k): - """ - Sort and combine top distance results from previous and current computations. - - Parameters - ---------- - top_dists : np.ndarray - Array of distances from the current computation. Shape should be (n,). - prev_top_dists : np.ndarray - Array of distances from previous computations. Shape should be (n,). - top_indexes : np.ndarray - Array of indexes corresponding to the top distances from current computation. - Shape should be (n,). - prev_to_indexes : np.ndarray - Array of indexes corresponding to the top distances from previous computations. - Shape should be (n,). - k : int, default=1 - The number of best matches to return during predict for each subsequence. - - Returns - ------- - tuple - A tuple containing two elements: - - A 1D numpy array of sorted distances, of length min(k, - total number of distances). - - A 1D numpy array of indexes corresponding to the sorted distances, - of length min(k, total number of distances). - """ - all_dists = np.concatenate((prev_top_dists, top_dists)) - all_indexes = np.concatenate((prev_to_indexes, top_indexes)) - if k == np.inf: - return all_dists, all_indexes - else: - idx = np.argsort(all_dists)[:k] - return all_dists[idx], all_indexes[idx] - - -@njit(cache=True, fastmath=True) -def _update_dot_products_one_series( - X, - T, - XT_products, - L, - i_query, -): - """ - Update dot products of the i-th query of size L in T from the dot products of i-1. - - Parameters - ---------- - X: np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) - Input time series on which the sliding dot product is computed. 
- T: np.ndarray, 2D array of shape (n_channels, series_length) - The series used for similarity search. Note that series_length can be equal, - superior or inferior to n_timepoints, it doesn't matter. - L : int - The length of the subsequences considered during the search. This parameter - cannot be larger than n_timepoints and series_length. - i_query : int - Query starting index in T. - - Returns - ------- - XT_products : np.ndarray of shape (n_cases, n_channels, n_timepoints - L + 1) - Sliding dot product between the i-th subsequence of size L in T and X. - - """ - n_channels = T.shape[0] - Q = T[:, i_query : i_query + L] - n_candidates = X.shape[1] - L + 1 - - for i_ft in range(n_channels): - # first element of all 0 to n-1 candidates * first element of previous query - _a1 = X[i_ft, : n_candidates - 1] * T[i_ft, i_query - 1] - # last element of all 1 to n candidates * last element of current query - _a2 = X[i_ft, L : L - 1 + n_candidates] * T[i_ft, i_query + L - 1] - - XT_products[i_ft, 1:] = XT_products[i_ft, :-1] - _a1 + _a2 - - # Compute first dot product - XT_products[i_ft, 0] = np.sum(Q[i_ft] * X[i_ft, :L]) - return XT_products diff --git a/aeon/similarity_search/matrix_profiles/tests/test_stomp.py b/aeon/similarity_search/matrix_profiles/tests/test_stomp.py deleted file mode 100644 index ffcf7d0b6a..0000000000 --- a/aeon/similarity_search/matrix_profiles/tests/test_stomp.py +++ /dev/null @@ -1,205 +0,0 @@ -"""Tests for stomp algorithm.""" - -__maintainer__ = ["baraline"] - -import numpy as np -import pytest -from numba.typed import List -from numpy.testing import assert_almost_equal, assert_array_almost_equal, assert_equal - -from aeon.distances import get_distance_function -from aeon.similarity_search._commons import get_ith_products -from aeon.similarity_search.matrix_profiles.stomp import ( - _update_dot_products_one_series, - stomp_normalised_squared_matrix_profile, - stomp_squared_matrix_profile, -) -from aeon.utils.numba.general import 
sliding_mean_std_one_series - -DATATYPES = ["int64", "float64"] -K_VALUES = [1] - - -def test__update_dot_products_one_series(): - """Test the _update_dot_product function.""" - X = np.random.rand(1, 50) - T = np.random.rand(1, 25) - L = 10 - current_product = get_ith_products(X, T, L, 0) - for i_query in range(1, T.shape[1] - L + 1): - new_product = get_ith_products( - X, - T, - L, - i_query, - ) - current_product = _update_dot_products_one_series( - X, - T, - current_product, - L, - i_query, - ) - assert_array_almost_equal(new_product, current_product) - - -@pytest.mark.parametrize("dtype", DATATYPES) -@pytest.mark.parametrize("k", K_VALUES) -def test_stomp_squared_matrix_profile(dtype, k): - """Test stomp series search.""" - X = np.asarray( - [[[1, 2, 3, 4, 5, 6, 7, 8]], [[1, 2, 4, 4, 5, 6, 5, 4]]], dtype=dtype - ) - - S = np.asarray([[3, 4, 5, 4, 3, 4, 5, 3, 2, 4, 5]], dtype=dtype) - L = 3 - mask = np.ones((X.shape[0], X.shape[2] - L + 1), dtype=bool) - distance = get_distance_function("squared") - mp, ip = stomp_squared_matrix_profile(X, S, L, mask, k=k) - for i in range(S.shape[-1] - L + 1): - q = S[:, i : i + L] - - expected = np.array( - [ - [distance(q, X[j, :, _i : _i + L]) for _i in range(X.shape[-1] - L + 1)] - for j in range(X.shape[0]) - ] - ) - id_bests = np.vstack( - np.unravel_index( - np.argsort(expected.ravel(), kind="stable"), expected.shape - ) - ).T - - for j in range(k): - assert_almost_equal(mp[i][j], expected[id_bests[j, 0], id_bests[j, 1]]) - assert_equal(ip[i][j], id_bests[j]) - - -@pytest.mark.parametrize("dtype", DATATYPES) -@pytest.mark.parametrize("k", K_VALUES) -def test_stomp_normalised_squared_matrix_profile(dtype, k): - """Test stomp series search.""" - X = np.asarray( - [[[1, 2, 3, 4, 5, 6, 7, 8]], [[1, 2, 4, 4, 5, 6, 5, 4]]], dtype=dtype - ) - - S = np.asarray([[3, 4, 5, 4, 3, 4, 5, 3, 2, 4, 5]], dtype=dtype) - L = 3 - mask = np.ones((X.shape[0], X.shape[2] - L + 1), dtype=bool) - distance = get_distance_function("squared") - 
X_means = [] - X_stds = [] - - for i in range(len(X)): - _mean, _std = sliding_mean_std_one_series(X[i], L, 1) - - X_stds.append(_std) - X_means.append(_mean) - X_means = np.asarray(X_means) - X_stds = np.asarray(X_stds) - - S_means, S_stds = sliding_mean_std_one_series(S, L, 1) - - mp, ip = stomp_normalised_squared_matrix_profile( - X, S, L, X_means, X_stds, S_means, S_stds, mask, k=k - ) - - for i in range(S.shape[-1] - L + 1): - q = (S[:, i : i + L] - S_means[:, i]) / S_stds[:, i] - - expected = np.array( - [ - [ - distance( - q, - (X[j, :, _i : _i + L] - X_means[j, :, _i]) / X_stds[j, :, _i], - ) - for _i in range(X.shape[-1] - L + 1) - ] - for j in range(X.shape[0]) - ] - ) - id_bests = np.vstack( - np.unravel_index(np.argsort(expected.ravel()), expected.shape) - ).T - - for j in range(k): - assert_almost_equal(mp[i][j], expected[id_bests[j, 0], id_bests[j, 1]]) - - -@pytest.mark.parametrize("dtype", DATATYPES) -def test_stomp_squared_matrix_profile_unequal_length(dtype): - """Test stomp with unequal length.""" - X = List( - [ - np.array([[1, 2, 3, 4, 5, 6, 7, 8]], dtype=dtype), - np.array([[1, 2, 4, 4, 5, 6]], dtype=dtype), - ] - ) - L = 3 - mask = List( - [ - np.ones(X[0].shape[1] - L + 1, dtype=bool), - np.ones(X[1].shape[1] - L + 1, dtype=bool), - ] - ) - S = np.asarray([[3, 4, 5, 4, 3, 4, 5, 3, 2, 4, 5]], dtype=dtype) - - distance = get_distance_function("squared") - mp, ip = stomp_squared_matrix_profile(X, S, L, mask) - - for i in range(S.shape[-1] - L + 1): - q = S[:, i : i + L] - - expected = [ - [ - distance(q, X[j][:, _i : _i + q.shape[-1]]) - for _i in range(X[j].shape[-1] - q.shape[-1] + 1) - ] - for j in range(len(X)) - ] - assert_almost_equal(mp[i][0], np.concatenate(expected).min()) - - -@pytest.mark.parametrize("dtype", DATATYPES) -@pytest.mark.parametrize("k", K_VALUES) -def test_stomp_squared_matrix_profile_inverse(dtype, k): - """Test stomp series search for inverse distance.""" - X = np.asarray( - [[[1, 2, 3, 4, 5, 6, 7, 8]], [[1, 2, 4, 4, 
5, 6, 5, 4]]], dtype=dtype - ) - S = np.asarray([[3, 4, 5, 4, 3, 4, 5, 3, 2, 4, 5]], dtype=dtype) - L = 3 - mask = np.ones((X.shape[0], X.shape[2] - L + 1), dtype=bool) - distance = get_distance_function("squared") - mp, ip = stomp_squared_matrix_profile( - X, - S, - L, - mask, - k=k, - inverse_distance=True, - ) - - for i in range(S.shape[-1] - L + 1): - q = S[:, i : i + L] - - expected = np.array( - [ - [ - distance(q, X[j, :, _i : _i + q.shape[-1]]) - for _i in range(X.shape[-1] - q.shape[-1] + 1) - ] - for j in range(X.shape[0]) - ] - ) - expected += 1e-8 - expected = 1 / expected - id_bests = np.vstack( - np.unravel_index(np.argsort(expected.ravel()), expected.shape) - ).T - - for j in range(k): - assert_almost_equal(mp[i][j], expected[id_bests[j, 0], id_bests[j, 1]]) - assert_equal(ip[i][j], id_bests[j]) diff --git a/aeon/similarity_search/query_search.py b/aeon/similarity_search/query_search.py deleted file mode 100644 index 393439148d..0000000000 --- a/aeon/similarity_search/query_search.py +++ /dev/null @@ -1,428 +0,0 @@ -"""Base class for query search.""" - -__maintainer__ = ["baraline"] - -from typing import Optional, final - -import numpy as np -from numba import get_num_threads, set_num_threads - -from aeon.similarity_search._commons import ( - extract_top_k_and_threshold_from_distance_profiles, -) -from aeon.similarity_search.base import BaseSimilaritySearch -from aeon.similarity_search.distance_profiles.euclidean_distance_profile import ( - euclidean_distance_profile, - normalised_euclidean_distance_profile, -) -from aeon.similarity_search.distance_profiles.squared_distance_profile import ( - normalised_squared_distance_profile, - squared_distance_profile, -) - - -class QuerySearch(BaseSimilaritySearch): - """ - Query search estimator. - - The query search estimator will return a set of matches of a query in a search space - , which is defined by a time series dataset given during fit. 
Depending on the `k` - and/or `threshold` parameters, which condition what is considered a valid match - during the search, the number of matches will vary. If `k` is used, at most `k` - matches (the `k` best) will be returned, if `threshold` is used and `k` is set to - `np.inf`, all the candidates which distance to the query is inferior or equal to - `threshold` will be returned. If both are used, the `k` best matches to the query - with distance inferior to `threshold` will be returned. - - - Parameters - ---------- - k : int, default=1 - The number of best matches to return during predict for a given query. - threshold : float, default=np.inf - The number of best matches to return during predict for a given query. - distance : str, default="euclidean" - Name of the distance function to use. A list of valid strings can be found in - the documentation for :func:`aeon.distances.get_distance_function`. - If a callable is passed it must either be a python function or numba function - with nopython=True, that takes two 1d numpy arrays as input and returns a float. - distance_args : dict, default=None - Optional keyword arguments for the distance function. - normalise : bool, default=False - Whether the distance function should be z-normalised. - speed_up : str, default='fastest' - Which speed up technique to use with for the selected distance - function. By default, the fastest algorithm is used. A list of available - algorithm for each distance can be obtained by calling the - `get_speedup_function_names` function. - inverse_distance : bool, default=False - If True, the matching will be made on the inverse of the distance, and thus, the - worst matches to the query will be returned instead of the best ones. - n_jobs : int, default=1 - Number of parallel jobs to use. - store_distance_profiles : bool, default=False. - Whether to store the computed distance profiles in the attribute - "distance_profiles_" after calling the predict method. 
It will store the raw - distance profile, meaning without potential inversion or thresholding applied. - - Attributes - ---------- - X_ : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) - The input time series stored during the fit method. This is the - database we search in when given a query. - distance_profile_function : function - The function used to compute the distance profile. This is determined - during the fit method based on the distance and normalise - parameters. - - Notes - ----- - For now, the multivariate case is only treated as independent. - Distances are computed for each channel independently and then - summed together. - """ - - def __init__( - self, - k: int = 1, - threshold: float = np.inf, - distance: str = "euclidean", - distance_args: Optional[dict] = None, - inverse_distance: bool = False, - normalise: bool = False, - speed_up: str = "fastest", - n_jobs: int = 1, - store_distance_profiles: bool = False, - ): - self.k = k - self.threshold = threshold - self.store_distance_profiles = store_distance_profiles - self._previous_query_length = -1 - self.axis = 1 - - super().__init__( - distance=distance, - distance_args=distance_args, - inverse_distance=inverse_distance, - normalise=normalise, - speed_up=speed_up, - n_jobs=n_jobs, - ) - - def _fit(self, X: np.ndarray, y=None): - """ - Check input format and store it to be used as search space during predict. - - Parameters - ---------- - X : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) - Input array to used as database for the similarity search - y : optional - Not used. - - Raises - ------ - TypeError - If the input X array is not 3D raise an error. 
- - Returns - ------- - self - - """ - self.X_ = X - self.distance_profile_function_ = self._get_distance_profile_function() - return self - - @final - def predict( - self, - X: np.ndarray, - axis=1, - X_index=None, - exclusion_factor=2.0, - apply_exclusion_to_result=False, - ) -> np.ndarray: - """ - Predict method : Check the shape of X and call _predict to perform the search. - - If the distance profile function is normalised, it stores the mean and stds - from X and X_, with X_ the training data. - - Parameters - ---------- - X : np.ndarray, 2D array of shape (n_channels, query_length) - Input query used for similarity search. - axis : int - The time point axis of the input series if it is 2D. If ``axis==0``, it is - assumed each column is a time series and each row is a time point. i.e. the - shape of the data is ``(n_timepoints,n_channels)``. ``axis==1`` indicates - the time series are in rows, i.e. the shape of the data is - ``(n_channels,n_timepoints)``. - X_index : Iterable - An Interable (tuple, list, array) of length two used to specify the index of - the query X if it was extracted from the input data X given during the fit - method. Given the tuple (id_sample, id_timestamp), the similarity search - will define an exclusion zone around the X_index in order to avoid matching - X with itself. If None, it is considered that the query is not extracted - from X_. - exclusion_factor : float, default=2. - The factor to apply to the query length to define the exclusion zone. The - exclusion zone is define from - :math:`id_timestamp - query_length//exclusion_factor` to - :math:`id_timestamp + query_length//exclusion_factor`. This also applies to - the matching conditions defined by child classes. For example, with - TopKSimilaritySearch, the k best matches are also subject to the exclusion - zone, but with :math:`id_timestamp` the index of one of the k matches. 
- apply_exclusion_to_result : bool, default=False - Wheter to apply the exclusion factor to the output of the similarity search. - This means that two matches of the query from the same sample must be at - least spaced by +/- :math:`query_length//exclusion_factor`. - This can avoid pathological matching where, for example if we extract the - best two matches, there is a high chance that if the best match is located - at :math:`id_timestamp`, the second best match will be located at - :math:`id_timestamp` +/- 1, as they both share all their values except one. - - Raises - ------ - TypeError - If the input X array is not 2D raise an error. - ValueError - If the length of the query is greater - - Returns - ------- - Tuple(ndarray, ndarray) - The first array, of shape ``(n_matches)``, contains the distance between - the query and its best matches in X_. The second array, of shape - ``(n_matches, 2)``, contains the indexes of these matches as - ``(id_sample, id_timepoint)``. The corresponding match can be - retrieved as ``X_[id_sample, :, id_timepoint : id_timepoint + length]``. 
- - """ - self._check_is_fitted() - prev_threads = get_num_threads() - set_num_threads(self._n_jobs) - - query_dim, query_length = self._check_query_format(X, axis) - - mask = self._init_X_index_mask( - X_index, - query_length, - exclusion_factor=exclusion_factor, - ) - - if self.normalise: - self.query_means_ = np.mean(X, axis=-1) - self.query_stds_ = np.std(X, axis=-1) - if self._previous_query_length != query_length: - self._store_mean_std_from_inputs(query_length) - - if apply_exclusion_to_result: - exclusion_size = query_length // exclusion_factor - else: - exclusion_size = None - - self._previous_query_length = query_length - - X_preds = self._predict( - self._call_distance_profile(X, mask), - exclusion_size=exclusion_size, - ) - set_num_threads(prev_threads) - return X_preds - - def _predict( - self, distance_profiles: np.ndarray, exclusion_size: Optional[int] = None - ) -> np.ndarray: - """ - Private predict method for QuerySearch. - - It takes the distance profiles and apply the `k` and `threshold` conditions to - return the set of best matches. - - Parameters - ---------- - distance_profiles : np.ndarray, 2D array of shape (n_cases, n_timepoints - query_length + 1) # noqa: E501 - Precomputed distance profile. - exclusion_size : int, optional - The size of the exclusion zone used to prevent returning as top k candidates - the ones that are close to each other (for example i and i+1). - It is used to define a region between - :math:`id_timestamp - exclusion_size` and - :math:`id_timestamp + exclusion_size` which cannot be returned - as best match if :math:`id_timestamp` was already selected. By default, - the value None means that this is not used. - - Returns - ------- - Tuple(ndarray, ndarray) - The first array, of shape ``(n_matches)``, contains the distance between - the query and its best matches in X_. The second array, of shape - ``(n_matches, 2)``, contains the indexes of these matches as - ``(id_sample, id_timepoint)``. 
The corresponding match can be - retrieved as ``X_[id_sample, :, id_timepoint : id_timepoint + length]``. - - - """ - if self.store_distance_profiles: - self.distance_profiles_ = distance_profiles - # Define id sample and timestamp to not "loose" them due to concatenation - return extract_top_k_and_threshold_from_distance_profiles( - distance_profiles, - k=self.k, - threshold=self.threshold, - exclusion_size=exclusion_size, - inverse_distance=self.inverse_distance, - ) - - def _check_query_format(self, X, axis): - if axis not in [0, 1]: - raise ValueError("The axis argument is expected to be either 1 or 0") - if self.axis != axis: - X = X.T - if not isinstance(X, np.ndarray) or X.ndim != 2: - raise TypeError( - "Error, only supports 2D numpy for now. If the query X is univariate " - "do X = X[np.newaxis, :]." - ) - - query_dim, query_length = X.shape - if query_length >= self.min_timepoints_: - raise ValueError( - "The length of the query should be inferior or equal to the length of " - "data (X_) provided during fit, but got {} for X and {} for X_".format( - query_length, self.min_timepoints_ - ) - ) - - if query_dim != self.n_channels_: - raise ValueError( - "The number of feature should be the same for the query X and the data " - "(X_) provided during fit, but got {} for X and {} for X_".format( - query_dim, self.n_channels_ - ) - ) - return query_dim, query_length - - def _get_distance_profile_function(self): - """ - Given distance and speed_up parameters, return the distance profile function. - - Raises - ------ - ValueError - If the distance parameter given at initialization is not a string nor a - numba function or a callable, or if the speedup parameter is unknow or - unsupported, raisea ValueError. - - Returns - ------- - function - The distance profile function matching the distance argument. 
- - """ - if isinstance(self.distance, str): - distance_dict = _QUERY_SEARCH_SPEED_UP_DICT.get(self.distance) - if distance_dict is None: - raise NotImplementedError( - f"No distance profile have been implemented for {self.distance}." - ) - else: - speed_up_profile = distance_dict.get(self.normalise).get(self.speed_up) - - if speed_up_profile is None: - raise ValueError( - f"Unknown or unsupported speed up {self.speed_up} for " - f"{self.distance} distance function with" - ) - self.speed_up_ = self.speed_up - return speed_up_profile - else: - raise ValueError( - f"Expected distance argument to be str but got {type(self.distance)}" - ) - - def _call_distance_profile(self, X: np.ndarray, mask: np.ndarray) -> np.ndarray: - """ - Obtain the distance profile function and call it with the query and the mask. - - Parameters - ---------- - X : np.ndarray, 2D array of shape (n_channels, query_length) - Input query used for similarity search. - mask : np.ndarray, 2D array of shape (n_cases, n_timepoints - query_length + 1) - Boolean array which indicates the candidates that should be evaluated in - the similarity search. - - Returns - ------- - distance_profiles : np.ndarray, 2D array of shape (n_cases, n_timepoints - query_length + 1) # noqa: E501 - The distance profiles between the input time series and the query. - - """ - if self.normalise: - distance_profiles = self.distance_profile_function_( - self.X_, - X, - mask, - self.X_means_, - self.X_stds_, - self.query_means_, - self.query_stds_, - ) - else: - distance_profiles = self.distance_profile_function_(self.X_, X, mask) - - return distance_profiles - - @classmethod - def get_speedup_function_names(self) -> dict: - """ - Get available speedup for query search in aeon. - - The returned structure is a dictionnary that contains the names of all - avaialble speedups for normalised and non-normalised distance functions. 
- - Returns - ------- - dict - The available speedups name that can be used as parameters in - similarity search classes. - - """ - speedups = {} - for dist_name in _QUERY_SEARCH_SPEED_UP_DICT.keys(): - for normalise in _QUERY_SEARCH_SPEED_UP_DICT[dist_name].keys(): - speedups_names = list( - _QUERY_SEARCH_SPEED_UP_DICT[dist_name][normalise].keys() - ) - if normalise: - speedups.update({f"normalised {dist_name}": speedups_names}) - else: - speedups.update({f"{dist_name}": speedups_names}) - return speedups - - -_QUERY_SEARCH_SPEED_UP_DICT = { - "euclidean": { - True: { - "fastest": normalised_euclidean_distance_profile, - "Mueen": normalised_euclidean_distance_profile, - }, - False: { - "fastest": euclidean_distance_profile, - "Mueen": euclidean_distance_profile, - }, - }, - "squared": { - True: { - "fastest": normalised_squared_distance_profile, - "Mueen": normalised_squared_distance_profile, - }, - False: { - "fastest": squared_distance_profile, - "Mueen": squared_distance_profile, - }, - }, -} diff --git a/aeon/similarity_search/series_search.py b/aeon/similarity_search/series_search.py deleted file mode 100644 index 3c36cf9c4a..0000000000 --- a/aeon/similarity_search/series_search.py +++ /dev/null @@ -1,436 +0,0 @@ -"""Base class for series search.""" - -__maintainer__ = ["baraline"] - -from typing import Union, final - -import numpy as np -from numba import get_num_threads, set_num_threads - -from aeon.similarity_search.base import BaseSimilaritySearch -from aeon.similarity_search.matrix_profiles.stomp import ( - stomp_euclidean_matrix_profile, - stomp_normalised_euclidean_matrix_profile, - stomp_normalised_squared_matrix_profile, - stomp_squared_matrix_profile, -) -from aeon.utils.numba.general import sliding_mean_std_one_series - - -class SeriesSearch(BaseSimilaritySearch): - """ - Series search estimator. - - The series search estimator will return a set of matches for each subsequence of - size L in a time series given during predict. 
The matching of each subsequence will - be made against all subsequence of size L inside the time series given during fit, - which will represent the search space. - - Depending on the `k` and/or `threshold` parameters, which condition what is - considered a valid match during the search, the number of matches will vary. If `k` - is used, at most `k` matches (the `k` best) will be returned, if `threshold` is used - and `k` is set to `np.inf`, all the candidates which distance to the query is - inferior or equal to `threshold` will be returned. If both are used, the `k` best - matches to the query with distance inferior to `threshold` will be returned. - - - Parameters - ---------- - k : int, default=1 - The number of best matches to return during predict for each subsequence. - threshold : float, default=np.inf - The number of best matches to return during predict for each subsequence. - distance : str, default="euclidean" - Name of the distance function to use. A list of valid strings can be found in - the documentation for :func:`aeon.distances.get_distance_function`. - If a callable is passed it must either be a python function or numba function - with nopython=True, that takes two 1d numpy arrays as input and returns a float. - distance_args : dict, default=None - Optional keyword arguments for the distance function. - normalise : bool, default=False - Whether the distance function should be z-normalised. - speed_up : str, default='fastest' - Which speed up technique to use with for the selected distance - function. By default, the fastest algorithm is used. A list of available - algorithm for each distance can be obtained by calling the - `get_speedup_function_names` function. - inverse_distance : bool, default=False - If True, the matching will be made on the inverse of the distance, and thus, the - worst matches to the query will be returned instead of the best ones. - n_jobs : int, default=1 - Number of parallel jobs to use. 
- - Attributes - ---------- - X_ : array, shape (n_cases, n_channels, n_timepoints) - The input time series stored during the fit method. This is the - database we search in when given a query. - distance_profile_function : function - The function used to compute the distance profile. This is determined - during the fit method based on the distance and normalise - parameters. - - Notes - ----- - For now, the multivariate case is only treated as independent. - Distances are computed for each channel independently and then - summed together. - """ - - def __init__( - self, - k: int = 1, - threshold: float = np.inf, - distance: str = "euclidean", - distance_args: Union[None, dict] = None, - inverse_distance: bool = False, - normalise: bool = False, - speed_up: str = "fastest", - n_jobs: int = 1, - ): - self.k = k - self.threshold = threshold - self._previous_query_length = -1 - self.axis = 1 - - super().__init__( - distance=distance, - distance_args=distance_args, - inverse_distance=inverse_distance, - normalise=normalise, - speed_up=speed_up, - n_jobs=n_jobs, - ) - - def _fit(self, X, y=None): - """ - Check input format and store it to be used as search space during predict. - - Parameters - ---------- - X : array, shape (n_cases, n_channels, n_timepoints) - Input array to used as database for the similarity search - y : optional - Not used. - - Raises - ------ - TypeError - If the input X array is not 3D raise an error. - - Returns - ------- - self - - """ - self.X_ = X - self.matrix_profile_function_ = self._get_series_method_function() - return self - - @final - def predict( - self, - X: np.ndarray, - length: int, - axis: int = 1, - X_index=None, - exclusion_factor=2.0, - apply_exclusion_to_result=False, - ): - """ - Predict method : Check the shape of X and call _predict to perform the search. - - If the distance profile function is normalised, it stores the mean and stds - from X and X_, with X_ the training data. 
- - Parameters - ---------- - X : np.ndarray, 2D array of shape (n_channels, series_length) - Input time series used for the search. - length : int - The length parameter that will be used to extract queries from X. - axis : int - The time point axis of the input series if it is 2D. If ``axis==0``, it is - assumed each column is a time series and each row is a time point. i.e. the - shape of the data is ``(n_timepoints,n_channels)``. ``axis==1`` indicates - the time series are in rows, i.e. the shape of the data is - ``(n_channels,n_timepoints)``. - X_index : int - An integer indicating if X was extracted is part of the dataset that was - given during the fit method. If so, this integer should be the sample id. - The search will define an exclusion zone for the queries extarcted from X - in order to avoid matching with themself. If None, it is considered that - the query is not extracted from X_. - exclusion_factor : float, default=2. - The factor to apply to the query length to define the exclusion zone. The - exclusion zone is define from - ``id_timestamp - query_length//exclusion_factor`` to - ``id_timestamp + query_length//exclusion_factor``. This also applies to - the matching conditions defined by child classes. For example, with - TopKSimilaritySearch, the k best matches are also subject to the exclusion - zone, but with :math:`id_timestamp` the index of one of the k matches. - apply_exclusion_to_result : bool, default=False - Wheter to apply the exclusion factor to the output of the similarity search. - This means that two matches of the query from the same sample must be at - least spaced by +/- ``query_length//exclusion_factor``. - This can avoid pathological matching where, for example if we extract the - best two matches, there is a high chance that if the best match is located - at ``id_timestamp``, the second best match will be located at - ``id_timestamp`` +/- 1, as they both share all their values except one. 
- - Raises - ------ - TypeError - If the input X array is not 2D raise an error. - ValueError - If the length of the query is greater - - Returns - ------- - Tuple(ndarray, ndarray) - The first array, of shape ``(series_length - length + 1, n_matches)``, - contains the distance between all the queries of size length and their best - matches in X_. The second array, of shape - ``(series_length - L + 1, n_matches, 2)``, contains the indexes of these - matches as ``(id_sample, id_timepoint)``. The corresponding match can be - retrieved as ``X_[id_sample, :, id_timepoint : id_timepoint + length]``. - - """ - self._check_is_fitted() - prev_threads = get_num_threads() - set_num_threads(self._n_jobs) - series_dim, series_length = self._check_series_format(X, length, axis) - - mask = self._init_X_index_mask( - None if X_index is None else [X_index, 0], - length, - exclusion_factor=exclusion_factor, - ) - - if self.normalise: - _mean, _std = sliding_mean_std_one_series(X, length, 1) - self.T_means_ = _mean - self.T_stds_ = _std - if self._previous_query_length != length: - self._store_mean_std_from_inputs(length) - - if apply_exclusion_to_result: - exclusion_size = length // exclusion_factor - else: - exclusion_size = None - - self._previous_query_length = length - - X_preds = self._predict( - X, - length, - mask, - exclusion_size, - X_index, - exclusion_factor, - apply_exclusion_to_result, - ) - set_num_threads(prev_threads) - return X_preds - - def _predict( - self, - X, - length, - mask, - exclusion_size, - X_index, - exclusion_factor, - apply_exclusion_to_result, - ): - """ - Private predict method for SeriesSearch. - - This method calculates the matrix profile for a given time series dataset by - comparing all possible subsequences of a specified length against a reference - time series. It handles exclusion zones to prevent nearby matches from being - selected and supports normalization. 
- - Parameters - ---------- - X : np.ndarray, 2D array of shape (n_channels, series_length) - Input time series used for the search. - length : int - The length parameter that will be used to extract queries from X. - axis : int - The time point axis of the input series if it is 2D. If ``axis==0``, it is - assumed each column is a time series and each row is a time point. i.e. the - shape of the data is ``(n_timepoints,n_channels)``. ``axis==1`` indicates - the time series are in rows, i.e. the shape of the data is - ``(n_channels,n_timepoints)``. - mask : np.ndarray, 2D array of shape (n_cases, n_timepoints - length + 1) - Boolean mask of the shape of the distance profiles indicating for which part - of it the distance should be computed. In this context, it is the mask for - the first query of size L in T. This mask will be updated during the - algorithm. - exclusion_size : int, optional - The size of the exclusion zone used to prevent returning as top k candidates - the ones that are close to each other (for example i and i+1). - It is used to define a region between - :math:`id_timestamp - exclusion_size` and - :math:`id_timestamp + exclusion_size` which cannot be returned - as best match if :math:`id_timestamp` was already selected. By default, - the value None means that this is not used. - - Returns - ------- - Tuple(ndarray, ndarray) - The first array, of shape ``(series_length - length + 1, n_matches)``, - contains the distance between all the queries of size length and their best - matches in X_. The second array, of shape - ``(series_length - L + 1, n_matches, 2)``, contains the indexes of these - matches as ``(id_sample, id_timepoint)``. The corresponding match can be - retrieved as ``X_[id_sample, :, id_timepoint : id_timepoint + length]``. 
- - """ - if self.normalise: - return self.matrix_profile_function_( - self.X_, - X, - length, - self.X_means_, - self.X_stds_, - self.T_means_, - self.T_stds_, - mask, - k=self.k, - threshold=self.threshold, - inverse_distance=self.inverse_distance, - exclusion_size=exclusion_size, - ) - else: - return self.matrix_profile_function_( - self.X_, - X, - length, - mask, - k=self.k, - threshold=self.threshold, - inverse_distance=self.inverse_distance, - exclusion_size=exclusion_size, - ) - - def _check_series_format(self, X, length, axis): - if axis not in [0, 1]: - raise ValueError("The axis argument is expected to be either 1 or 0") - if self.axis != axis: - X = X.T - if not isinstance(X, np.ndarray) or X.ndim != 2: - raise TypeError( - "Error, only supports 2D numpy for now. If the series X is univariate " - "do X = X[np.newaxis, :]." - ) - - series_dim, series_length = X.shape - if series_length < length: - raise ValueError( - "The length of the series should be superior or equal to the length " - "parameter given during predict, but got {} < {}".format( - series_length, length - ) - ) - - if series_dim != self.n_channels_: - raise ValueError( - "The number of feature should be the same for the series X and the data" - " (X_) provided during fit, but got {} for X and {} for X_".format( - series_dim, self.n_channels_ - ) - ) - return series_dim, series_length - - def _get_series_method_function(self): - """ - Given distance and speed_up parameters, return the series method function. - - Raises - ------ - ValueError - If the distance parameter given at initialization is not a string nor a - numba function or a callable, or if the speedup parameter is unknow or - unsupported, raisea ValueError. - - Returns - ------- - function - The series method function matching the distance argument. 
- - """ - if isinstance(self.distance, str): - distance_dict = _SERIES_SEARCH_SPEED_UP_DICT.get(self.distance) - if distance_dict is None: - raise NotImplementedError( - f"No distance profile have been implemented for {self.distance}." - ) - else: - speed_up_series_method = distance_dict.get(self.normalise).get( - self.speed_up - ) - - if speed_up_series_method is None: - raise ValueError( - f"Unknown or unsupported speed up {self.speed_up} for " - f"{self.distance} distance function with" - ) - self.speed_up_ = self.speed_up - return speed_up_series_method - else: - raise ValueError( - f"Expected distance argument to be str but got {type(self.distance)}" - ) - - @classmethod - def get_speedup_function_names(self): - """ - Get available speedup for series search in aeon. - - The returned structure is a dictionnary that contains the names of all - avaialble speedups for normalised and non-normalised distance functions. - - Returns - ------- - dict - The available speedups name that can be used as parameters in - similarity search classes. 
-
-        """
-        speedups = {}
-        for dist_name in _SERIES_SEARCH_SPEED_UP_DICT.keys():
-            for normalise in _SERIES_SEARCH_SPEED_UP_DICT[dist_name].keys():
-                speedups_names = list(
-                    _SERIES_SEARCH_SPEED_UP_DICT[dist_name][normalise].keys()
-                )
-                if normalise:
-                    speedups.update({f"normalised {dist_name}": speedups_names})
-                else:
-                    speedups.update({f"{dist_name}": speedups_names})
-        return speedups
-
-
-_SERIES_SEARCH_SPEED_UP_DICT = {
-    "euclidean": {
-        True: {
-            "fastest": stomp_normalised_euclidean_matrix_profile,
-            "STOMP": stomp_normalised_euclidean_matrix_profile,
-        },
-        False: {
-            "fastest": stomp_euclidean_matrix_profile,
-            "STOMP": stomp_euclidean_matrix_profile,
-        },
-    },
-    "squared": {
-        True: {
-            "fastest": stomp_normalised_squared_matrix_profile,
-            "STOMP": stomp_normalised_squared_matrix_profile,
-        },
-        False: {
-            "fastest": stomp_squared_matrix_profile,
-            "STOMP": stomp_squared_matrix_profile,
-        },
-    },
-}
diff --git a/aeon/similarity_search/series_search/__init__.py b/aeon/similarity_search/series_search/__init__.py
new file mode 100644
index 0000000000..f576c41f03
--- /dev/null
+++ b/aeon/similarity_search/series_search/__init__.py
@@ -0,0 +1,5 @@
+"""Whole series similarity search module."""
+
+__all__ = ["BaseIndexSearch", "BaseSeriesSearch"]
+
+from aeon.similarity_search.series_search.base import BaseIndexSearch, BaseSeriesSearch
diff --git a/aeon/similarity_search/series_search/base.py b/aeon/similarity_search/series_search/base.py
new file mode 100644
index 0000000000..db83519c04
--- /dev/null
+++ b/aeon/similarity_search/series_search/base.py
@@ -0,0 +1,22 @@
+"""Base class for whole series search."""
+
+__maintainer__ = ["baraline"]
+
+from aeon.similarity_search.base import BaseSimilaritySearch
+
+
+class BaseSeriesSearch(BaseSimilaritySearch):
+    """Base class for whole series search estimators (work in progress)."""
+
+    ...
+
+
+class BaseIndexSearch(BaseSimilaritySearch):
+    """Base class for index search estimators (work in progress)."""
+
+    ...
+
+    def batch_fit(sourcefiles, batch_size):
+        """."""
+        # fit
+        # and then update
diff --git a/aeon/similarity_search/subsequence_search/__init__.py b/aeon/similarity_search/subsequence_search/__init__.py
new file mode 100644
index 0000000000..dfca8ee18c
--- /dev/null
+++ b/aeon/similarity_search/subsequence_search/__init__.py
@@ -0,0 +1,5 @@
+"""Similarity search module."""
+
+__all__ = ["StompMatrixProfile"]
+
+from aeon.similarity_search.subsequence_search._stomp import StompMatrixProfile
diff --git a/aeon/similarity_search/subsequence_search/_brute_force.py b/aeon/similarity_search/subsequence_search/_brute_force.py
new file mode 100644
index 0000000000..7f95083f3b
--- /dev/null
+++ b/aeon/similarity_search/subsequence_search/_brute_force.py
@@ -0,0 +1,301 @@
+"""Implementation of matrix profile with brute force."""
+
+from typing import Optional
+
+__maintainer__ = ["baraline"]
+
+
+import numpy as np
+from numba import njit, prange
+from numba.typed import List
+
+from aeon.similarity_search.subsequence_search._commons import (
+    _extract_top_k_from_dist_profile,
+    _inverse_distance_profile_list,
+)
+from aeon.similarity_search.subsequence_search.base import BaseMatrixProfile
+from aeon.utils.numba.general import (
+    get_all_subsequences,
+    z_normalise_series_3d,
+    z_normalize_series_2d,
+)
+
+# TODO : check function params and make docstrings
+# TODO : make tests
+
+
+class BruteForceMatrixProfile(BaseMatrixProfile):
+    """Matrix profile computed by brute force with a squared euclidean distance."""
+
+    def compute_matrix_profile(
+        self,
+        k,
+        threshold,
+        exclusion_size,
+        inverse_distance,
+        allow_overlap,
+        X: Optional[np.ndarray] = None,
+        X_index: Optional[int] = None,
+    ):
+        """
+        Compute the matrix profile of ``X`` against the fitted collection ``X_``.
+
+        Parameters
+        ----------
+        k : int
+            Maximum number of matches returned for each query.
+        threshold : float
+            Maximum distance a candidate can have to be returned as a match.
+        exclusion_size : int
+            Number of timepoints around an index inside which candidates are
+            ignored. Used around the query itself when ``X_index`` is given.
+        inverse_distance : bool
+            If True, the distance profiles are inverted before extracting matches.
+        allow_overlap : bool
+            If False, the matches returned for a query cannot be neighbours of each
+            other, with neighbourhood defined by ``exclusion_size``.
+        X : np.ndarray, 2D array of shape (n_channels, n_timepoints), optional
+            Series from which all queries of size ``length`` are extracted. If None,
+            the matrix profile of each series of ``X_`` is computed in turn.
+        X_index : int, optional
+            Index of ``X`` in ``X_`` if ``X`` is one of the fitted series, so that a
+            query cannot match with itself. The default is None.
+
+        Returns
+        -------
+        MP : List
+            For each query, the distances to its matches. If ``X`` is None, a list
+            containing one such result per series of ``X_``.
+        IP : List
+            For each query, the ``(i_case, i_timestamp)`` indexes of its matches. If
+            ``X`` is None, a list containing one such result per series of ``X_``.
+
+        """
+        # pairwise if none
+        if X is None:
+            MP = []
+            IP = []
+            for i in range(len(self.X_)):
+                _MP, _IP = self.compute_matrix_profile(
+                    k,
+                    threshold,
+                    exclusion_size,
+                    inverse_distance,
+                    allow_overlap,
+                    X=self.X_[i],
+                    X_index=i,
+                )
+                MP.append(_MP)
+                IP.append(_IP)
+        else:
+            # Arguments follow the order of the function's signature
+            MP, IP = _naive_squared_matrix_profile(
+                self.X_,
+                X,
+                self.length,
+                k,
+                X_index,
+                threshold,
+                inverse_distance,
+                allow_overlap,
+                exclusion_size,
+                normalize=self.normalize,
+            )
+
+        return MP, IP
+
+    def compute_distance_profile(self, X: np.ndarray):
+        """
+        Compute the distance profile of X to all samples in X_.
+
+        Parameters
+        ----------
+        X : np.ndarray, 2D array of shape (n_channels, length)
+            The query to use to compute the distance profiles.
+
+        Returns
+        -------
+        distance_profiles : np.ndarray, 2D array of shape (n_cases, n_candidates)
+            The distance profile of X to all samples in X_. The ``n_candidates`` value
+            is equal to ``n_timepoints - length + 1``. If X_ is an unequal length
+            collection, returns a numba typed list instead of an ndarray.
+
+        """
+        distance_profiles = _naive_squared_distance_profile(
+            self.X_, X, normalize=self.normalize
+        )
+
+        if not self.metadata_["unequal_length"]:
+            distance_profiles = np.asarray(distance_profiles)
+        return distance_profiles
+
+
+@njit(cache=True, fastmath=True)
+def _compute_dist_profile(X_subs, q):
+    """
+    Compute the distance profile between subsequences and a query.
+
+    Parameters
+    ----------
+    X_subs : array, shape=(n_samples, n_channels, query_length)
+        Input subsequences extracted from a time series.
+    q : array, shape=(n_channels, query_length)
+        Query used for the distance computation
+
+    Returns
+    -------
+    dist_profile : np.ndarray, 1D array of shape (n_samples)
+        The squared distance between the query and all subsequences.
+
+    """
+    n_candidates, n_channels, q_length = X_subs.shape
+    dist_profile = np.zeros(n_candidates)
+    for i in range(n_candidates):
+        for j in range(n_channels):
+            for k in range(q_length):
+                dist_profile[i] += (X_subs[i, j, k] - q[j, k]) ** 2
+    return dist_profile
+
+
+@njit(cache=True, fastmath=True, parallel=True)
+def _naive_squared_distance_profile(
+    X,
+    Q,
+    normalize=False,
+):
+    """
+    Compute a squared euclidean distance profile.
+
+    Parameters
+    ----------
+    X : array, shape=(n_samples, n_channels, n_timepoints)
+        Input time series dataset to search in.
+    Q : array, shape=(n_channels, query_length)
+        Query used during the search.
+    normalize : bool
+        Whether to use a z-normalized distance.
+
+    Returns
+    -------
+    dist_profiles : List of n_samples 1D arrays
+        The distances between the query and all the candidates of each series in X.
+
+    """
+    query_length = Q.shape[1]
+    dist_profiles = List()
+    # Init distance profile array with unequal length support
+    for i in range(len(X)):
+        dist_profiles.append(np.zeros(X[i].shape[1] - query_length + 1))
+    if normalize:
+        Q = z_normalize_series_2d(Q)
+    else:
+        Q = Q.astype(np.float64)
+
+    for i in prange(len(X)):
+        X_subs = get_all_subsequences(X[i], query_length, 1)
+        if normalize:
+            X_subs = z_normalise_series_3d(X_subs)
+
+        dist_profile = _compute_dist_profile(X_subs, Q)
+        dist_profiles[i] = dist_profile
+    return dist_profiles
+
+
+@njit(cache=True, fastmath=True, parallel=True)
+def _naive_squared_matrix_profile(
+    X,
+    T,
+    L,
+    k,
+    T_index,
+    threshold,
+    inverse_distance,
+    allow_overlap,
+    exclusion_size,
+    normalize=False,
+):
+    """
+    Compute a squared euclidean matrix profile.
+
+    Parameters
+    ----------
+    X : array, shape=(n_samples, n_channels, n_timepoints_x)
+        Input time series dataset to search in.
+    T : array, shape=(n_channels, n_timepoints_t)
+        Time series from which queries are extracted.
+    L : int
+        Length of the queries to extract from T.
+    k : int
+        Maximum number of matches to return for each query.
+    T_index : int or None
+        If ``T`` is a series of ``X``, its index in ``X``. If specified, the query
+        itself and its neighbouring subsequences (according to ``exclusion_size``)
+        won't be considered as admissible candidates.
+    threshold : float
+        Maximum distance a candidate can have to be returned as a match.
+    inverse_distance : bool
+        If True, the distance profiles are inverted before extracting matches.
+    allow_overlap : bool
+        If False, the matches of a query cannot be neighbours of each other.
+    exclusion_size : int
+        Size of the exclusion zone around a query or a match.
+    normalize : bool
+        Whether to use a z-normalized distance.
+
+    Returns
+    -------
+    MP : List
+        For each query in T, the distances to its matches in X.
+    IP : List
+        For each query in T, the ``(i_case, i_timestamp)`` indexes of its matches.
+    """
+    n_queries = T.shape[1] - L + 1
+    MP = List()
+    IP = List()
+
+    # Init List to allow parallel, we'll re-use it for all dist profiles
+    dist_profiles = List()
+    for i_x in range(len(X)):
+        dist_profiles.append(np.zeros(X[i_x].shape[1] - L + 1))
+
+    X_subs = List()
+    for i in range(len(X)):
+        i_subs = get_all_subsequences(X[i], L, 1)
+        if normalize:
+            # Normalize the subsequences of the current series only
+            i_subs = z_normalise_series_3d(i_subs)
+        X_subs.append(i_subs)
+
+    for i_q in range(n_queries):
+        # The i_q-th query of size L in T
+        Q = T[:, i_q : i_q + L]
+        if normalize:
+            Q = z_normalize_series_2d(Q)
+        for i_x in prange(len(X)):
+            dist_profiles[i_x][0 : X[i_x].shape[1] - L + 1] = _compute_dist_profile(
+                X_subs[i_x], Q
+            )
+
+        if inverse_distance:
+            dist_profiles = _inverse_distance_profile_list(dist_profiles)
+
+        # Deal with self-matches, after the inversion so that the excluded
+        # candidates keep an infinite distance
+        if T_index is not None:
+            _max_timestamp = X[T_index].shape[1] - L
+            ub = min(i_q + exclusion_size, _max_timestamp)
+            lb = max(0, i_q - exclusion_size)
+            dist_profiles[T_index][lb:ub] = np.inf
+
+        # The extraction function returns the indexes first, then the distances
+        top_indexes, top_dists = _extract_top_k_from_dist_profile(
+            dist_profiles,
+            k,
+            threshold,
+            allow_overlap,
+            exclusion_size,
+        )
+
+        MP.append(top_dists)
+        IP.append(top_indexes)
+    return MP, IP
diff --git a/aeon/similarity_search/subsequence_search/_commons.py b/aeon/similarity_search/subsequence_search/_commons.py
new file mode
100644 index 0000000000..e2e0aa54df --- /dev/null +++ b/aeon/similarity_search/subsequence_search/_commons.py @@ -0,0 +1,138 @@ +"""Helper and common function for similarity search estimators and functions.""" + +__maintainer__ = ["baraline"] + +import numpy as np +from numba import njit, prange +from scipy.signal import convolve + + +def fft_sliding_dot_product(X, q): + """ + Use FFT convolution to calculate the sliding window dot product. + + This function applies the Fast Fourier Transform (FFT) to efficiently compute + the sliding dot product between the input time series `X` and the query `q`. + The dot product is computed for each channel individually. The sliding window + approach ensures that the dot product is calculated for every possible subsequence + of `X` that matches the length of `q` + + Parameters + ---------- + X : array, shape=(n_channels, n_timepoints) + Input time series + q : array, shape=(n_channels, query_length) + Input query + + Returns + ------- + out : np.ndarray, 2D array of shape (n_channels, n_timepoints - query_length + 1) + Sliding dot product between q and X. + """ + n_channels, n_timepoints = X.shape + query_length = q.shape[1] + out = np.zeros((n_channels, n_timepoints - query_length + 1)) + for i in range(n_channels): + out[i, :] = convolve(np.flipud(q[i, :]), X[i, :], mode="valid").real + return out + + +def get_ith_products(X, T, L, ith): + """ + Compute dot products between X and the i-th subsequence of size L in T. + + Parameters + ---------- + X : array, shape = (n_channels, n_timepoints_X) + Input data. + T : array, shape = (n_channels, n_timepoints_T) + Data containing the query. + L : int + Overall query length. + ith : int + Query starting index in T. + + Returns + ------- + np.ndarray, 2D array of shape (n_channels, n_timepoints_X - L + 1) + Sliding dot product between the i-th subsequence of size L in T and X. 
@njit(cache=True)
def _extract_top_k_from_dist_profile(
    dist_profiles,
    k,
    threshold,
    allow_overlap,
    exclusion_size,
):
    """
    Extract the overall top-k smallest values from a list of distance profiles.

    Parameters
    ----------
    dist_profiles : sequence of 1D np.ndarray
        One distance profile per series. Profiles may have different lengths.
    k : int
        Maximum number of matches to return.
    threshold : float
        Matches with a distance strictly greater than this value are discarded.
    allow_overlap : bool
        If False, two matches extracted from the same profile cannot be within
        ``exclusion_size`` positions of each other.
    exclusion_size : int
        Half-width of the exclusion zone applied around each selected match when
        ``allow_overlap`` is False.

    Returns
    -------
    top_k_indexes : np.ndarray, 2D array of shape (n_matches, 2)
        The ``(i_profile, i_timestamp)`` identifiers of the matches, sorted by
        increasing distance. ``n_matches`` is at most ``k``.
    top_k_distances : np.ndarray, 1D array of shape (n_matches,)
        The distances of the matches.
    """
    # Buffer of size 2*k: the first k slots hold the running overall best matches,
    # the last k slots receive the candidates of the profile being processed.
    top_k_indexes = np.zeros((2 * k, 2), dtype=np.int64) - 1
    top_k_distances = np.full(2 * k, np.inf)
    for i_profile in range(len(dist_profiles)):
        profile = dist_profiles[i_profile]
        n_candidates = len(profile)
        if n_candidates == 0:
            # Nothing to extract from an empty profile
            continue

        if not allow_overlap:
            # Extract top-k without neighboring matches
            _sorted_indexes = np.argsort(profile)
            _top_k_indexes = np.zeros(k, dtype=np.int64) - 1
            _top_k_indexes[0] = _sorted_indexes[0]
            _current_k = 1
            _current_j = 1
            # Until we extract k values or explore all the array
            while _current_k < k and _current_j < n_candidates:
                _candidate = _sorted_indexes[_current_j]
                _insert = True
                # Reject the candidate if it falls INSIDE the exclusion zone of
                # any previously selected match.
                for i_k in range(_current_k):
                    if abs(_candidate - _top_k_indexes[i_k]) <= exclusion_size:
                        _insert = False
                        break

                if _insert:
                    _top_k_indexes[_current_k] = _candidate
                    _current_k += 1
                _current_j += 1

            _top_k_indexes = _top_k_indexes[:_current_k]
        elif k < n_candidates:
            # Extract top-k with neighboring matches
            _top_k_indexes = np.argpartition(profile, k)[:k]
        else:
            # argpartition requires kth < n_candidates: keep every candidate
            _top_k_indexes = np.argsort(profile)
        _top_k_distances = profile[_top_k_indexes]

        # Insert the candidates of the current profile in the second half of the
        # buffer, then keep the k best of (previous best + new candidates).
        n_found = len(_top_k_distances)
        top_k_distances[k : k + n_found] = _top_k_distances
        top_k_indexes[k : k + n_found, 1] = _top_k_indexes
        top_k_indexes[k : k + n_found, 0] = i_profile

        idx = np.argsort(top_k_distances)
        top_k_distances[:k] = top_k_distances[idx[:k]]
        top_k_indexes[:k] = top_k_indexes[idx[:k]]
        # Reset the candidate half so stale values are never selected again
        top_k_distances[k:] = np.inf

    # Get the actual number of extracted values and apply threshold
    true_k = 0
    for i in range(k):
        # If the distance is inf, it means that no value was extracted
        if top_k_distances[i] != np.inf and top_k_distances[i] <= threshold:
            true_k += 1
        else:
            break

    return top_k_indexes[:true_k], top_k_distances[:true_k]
+ X_index : Optional[int], optional + If ``X`` is a series of the database given in fit, specify its index in + ``X_``. If specified, each query of this series won't be able to match with + its neighboring subsequences. + : TYPE + DESCRIPTION. + + Returns + ------- + MP : TYPE + DESCRIPTION. + IP : TYPE + DESCRIPTION. + + """ + # pairwise if none + if X is None: + MP = [] + IP = [] + for i in range(len(self.X_)): + _MP, _IP = self.compute_matrix_profile( + k, + threshold, + exclusion_size, + inverse_distance, + X=self.X_[i], + X_index=i, + ) + MP.append(_MP) + IP.append(_IP) + else: + XdotT = [ + get_ith_products(self.X[i], X, self.length, 0) + for i in range(len(self.X_)) + ] + if isinstance(X, np.ndarray): + XdotT = np.asarray(XdotT) + elif isinstance(X, List): + XdotT = List(XdotT) + if X_index is None: + X_means, X_stds = 0 + else: + X_means, X_stds = self.X_means_[i], self.X_stds_[i] + if self.normalize: + MP, IP = _stomp_normalized( + self.X_, + X, + XdotT, + self.X_means_, + self.X_stds_, + X_means, + X_stds, + self.length, + X_index, + k, + threshold, + allow_overlap, + exclusion_size, + inverse_distance, + ) + + else: + MP, IP = _stomp( + self.X_, + X, + XdotT, + self.length, + X_index, + k, + allow_overlap, + threshold, + exclusion_size, + inverse_distance, + ) + + return MP, IP + + def compute_distance_profile(self, X: np.ndarray): + """ + Compute the distance profile of X to all samples in X_. + + Parameters + ---------- + X : np.ndarray, 2D array of shape (n_channels, length) + The query to use to compute the distance profiles. + + Returns + ------- + distance_profiles : np.ndarray, 2D array of shape (n_cases, n_candidates) + The distance profile of X to all samples in X_. The ``n_candidates`` value + is equal to ``n_timepoins - length + 1``. If X_ is an unequal length + collection, returns a numba typed list instead of an ndarray. 
+ + """ + QX = [fft_sliding_dot_product(self.X_[i], X) for i in range(len(self.X_))] + if self.metadata_["unequal_length"]: + QX = List(QX) + else: + QX = np.asarray(QX) + + if self.normalize: + distance_profiles = _normalized_squared_distance_profile( + QX, + self.X_means_, + self.X_stds_, + X.mean(axis=1), + X.std(axis=1), + self.length, + ) + else: + distance_profiles = _squared_distance_profile( + QX, + self.X_, + X, + ) + + if not self.metadata_["unequal_length"]: + distance_profiles = np.asarray(distance_profiles) + return distance_profiles + + +@njit(cache=True, parallel=True, fastmath=True) +def _stomp_normalized( + X, + T, + XdotT, + X_means, + X_stds, + T_means, + T_stds, + L, + T_index, + k, + threshold, + allow_overlap, + exclusion_size, + inverse_distance, +): + """ + Compute the Matrix Profile using the STOMP algorithm with normalized distances. + + X: np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) + The input samples. If X is an unquel length collection, expect a TypedList + of 2D arrays of shape (n_channels, n_timepoints) + T : np.ndarray, 2D array of shape (n_channels, series_length) + The series used for similarity search. Note that series_length can be equal, + superior or inferior to n_timepoints, it doesn't matter. + L : int + Length of the subsequences used for the distance computation. + XdotT : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1) + Precomputed dot products between each time series in X and the query series T. + X_means : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1) + Means of each subsequences of X of size L. Should be a numba TypedList if X is + unequal length. + X_stds : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1) + Stds of each subsequences of X of size L. Should be a numba TypedList if X is + unequal length. 
+ T_means : np.ndarray, 2D array of shape (n_channels, n_timepoints - L + 1) + Means of each subsequences of T of size L. + T_stds : np.ndarray, 2D array of shape (n_channels, n_timepoints - L + 1) + Stds of each subsequences of T of size L. + T_index : int, + If ``T`` is a series of the database given in fit, specify its index + in ``X_``. If specified, each query of this series won't be able to + match with its neighboring subsequences. + k : int, default=1 + The number of best matches to return during predict for each subsequence. + threshold : float, default=np.inf + The number of best matches to return during predict for each subsequence. + inverse_distance : bool, default=False + If True, the matching will be made on the inverse of the distance, and thus, the + worst matches to the query will be returned instead of the best ones. + exclusion_size : int, optional + The size of the exclusion zone used to prevent returning as top k candidates + the ones that are close to each other (for example i and i+1). + It is used to define a region between + :math:`id_timestomp - exclusion_size` and + :math:`id_timestomp + exclusion_size` which cannot be returned + as best match if :math:`id_timestomp` was already selected. By default, + the value None means that this is not used. + + Returns + ------- + tuple of np.ndarray + - MP : array of shape (series_length - L + 1,) + Matrix profile distances for each query subsequence. + - IP : array of shape (series_length - L + 1,) + Indexes of the top matches for each query subsequence. 
@njit(cache=True, parallel=True, fastmath=True)
def _stomp(
    X,
    T,
    XdotT,
    L,
    T_index,
    k,
    allow_overlap,
    threshold,
    exclusion_size,
    inverse_distance,
):
    """
    Compute the matrix profile of T against X with the squared Euclidean distance.

    Parameters
    ----------
    X : collection of 2D arrays of shape (n_channels, n_timepoints)
        The series in which matches are searched.
    T : np.ndarray, 2D array of shape (n_channels, series_length)
        The series from which every subsequence of size ``L`` is used as a query.
    XdotT : collection of 2D arrays
        Dot products between each series of ``X`` and the first query of ``T``.
        It is overwritten while iterating over the queries.
    L : int
        Length of the subsequences.
    T_index : int or None
        Index of ``T`` in ``X``. When not None, the candidates of ``X[T_index]``
        located within ``exclusion_size`` positions of the current query are set
        to infinity so that a query cannot match itself or its neighbors.
    k : int
        Forwarded to ``_extract_top_k_from_dist_profile``.
    allow_overlap : bool
        Forwarded to ``_extract_top_k_from_dist_profile``.
    threshold : float
        Forwarded to ``_extract_top_k_from_dist_profile``.
    exclusion_size : int
        Half-width of the exclusion zone, used for self-matches and forwarded to
        ``_extract_top_k_from_dist_profile``.
    inverse_distance : bool
        If True, the distance profiles are passed through
        ``_inverse_distance_profile_list`` before the extraction.

    Returns
    -------
    MP : numba typed List
        For each query of ``T``, the distances of its extracted matches.
    IP : numba typed List
        For each query of ``T``, the identifiers of its extracted matches.
    """
    n_queries = T.shape[1] - L + 1
    MP = List()
    IP = List()

    # Init List to allow parallel, we'll re-use it for all dist profiles
    dist_profiles = List()
    for i_x in range(len(X)):
        dist_profiles.append(np.zeros(X[i_x].shape[1] - L + 1))

    # For each query of size L in T
    for i_q in range(n_queries):
        Q = T[:, i_q : i_q + L]
        # For each series in X compute distance profile to the query
        for i_x in prange(len(X)):
            dist_profiles[i_x][0 : X[i_x].shape[1] - L + 1] = (
                _squared_dist_profile_one_series(XdotT[i_x], X[i_x], Q)
            )
            # Prepare the dot products of the next query from the current ones
            if i_q + 1 < n_queries:
                XdotT[i_x] = _update_dot_products_one_series(
                    X[i_x], T, XdotT[i_x], L, i_q + 1
                )

        if inverse_distance:
            dist_profiles = _inverse_distance_profile_list(dist_profiles)

        # Deal with self-matches
        if T_index is not None:
            n_candidates = X[T_index].shape[1] - L + 1
            lb = max(0, i_q - exclusion_size)
            # The slice end is exclusive: +1 so that i_q + exclusion_size and
            # the last candidate of the series are masked too.
            ub = min(i_q + exclusion_size + 1, n_candidates)
            dist_profiles[T_index][lb:ub] = np.inf

        top_indexes, top_dists = _extract_top_k_from_dist_profile(
            dist_profiles,
            k,
            threshold,
            allow_overlap,
            exclusion_size,
        )

        MP.append(top_dists)
        IP.append(top_indexes)

    return MP, IP
+ + """ + n_channels = T.shape[0] + Q = T[:, i_query : i_query + L] + n_candidates = X.shape[1] - L + 1 + + for i_ft in range(n_channels): + # first element of all 0 to n-1 candidates * first element of previous query + _a1 = X[i_ft, : n_candidates - 1] * T[i_ft, i_query - 1] + # last element of all 1 to n candidates * last element of current query + _a2 = X[i_ft, L : L - 1 + n_candidates] * T[i_ft, i_query + L - 1] + + XT_products[i_ft, 1:] = XT_products[i_ft, :-1] - _a1 + _a2 + + # Compute first dot product + XT_products[i_ft, 0] = np.sum(Q[i_ft] * X[i_ft, :L]) + return XT_products + + +@njit(cache=True, fastmath=True, parallel=True) +def _squared_distance_profile(QX, X, Q): + """ + Compute squared distance profiles between query subsequence and time series. + + Parameters + ---------- + QX : List of np.ndarray + List of precomputed dot products between queries and time series, with each + element corresponding to a different time series. + Shape of each array is (n_channels, n_timepoints - query_length + 1). + X : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) + The input samples. If X is an unquel length collection, expect a numba TypedList + 2D array of shape (n_channels, n_timepoints) + Q : np.ndarray, 2D array of shape (n_channels, query_length) + The query used for similarity search. + mask : np.ndarray, 3D array of shape (n_cases, n_timepoints - query_length + 1) + Boolean mask of the shape of the distance profile indicating for which part + of it the distance should be computed. + + Returns + ------- + distance_profiles : np.ndarray + 3D array of shape (n_cases, n_timepoints - query_length + 1) + The distance profile between Q and the input time series X. 
@njit(cache=True, fastmath=True)
def _squared_dist_profile_one_series(QT, T, Q):
    """
    Compute squared distance profile between query subsequence and a single time series.

    The squared distance is expanded as ``sum(T**2) - 2 * QT + sum(Q**2)``. The sum
    of squares of each window of ``T`` is obtained with a running sum, and the
    contributions of all channels are added together.

    Parameters
    ----------
    QT : np.ndarray, 2D array of shape (n_channels, n_timepoints - query_length + 1)
        The dot product between the query and the time series.
    T : np.ndarray, 2D array of shape (n_channels, n_timepoints)
        The series in which the query is searched.
    Q : np.ndarray, 2D array of shape (n_channels, query_length)
        The query subsequence.

    Returns
    -------
    distance_profile : np.ndarray
        1D array of shape (n_timepoints - query_length + 1), the squared distance
        between the query and each candidate of the series, summed over channels.
    """
    n_channels, profile_length = QT.shape
    query_length = Q.shape[1]
    _QT = -2 * QT
    distance_profile = np.zeros(profile_length)
    # Plain ``range`` on purpose: the running sum below carries a dependency from
    # one iteration to the next, so these loops must never be parallelised.
    for k in range(n_channels):
        _sum = 0.0
        _qsum = 0.0
        for j in range(query_length):
            _sum += T[k, j] ** 2
            _qsum += Q[k, j] ** 2

        distance_profile += _qsum + _QT[k]
        distance_profile[0] += _sum
        for i in range(1, profile_length):
            # Slide the window: add the entering point, remove the leaving one
            _sum += T[k, i + (query_length - 1)] ** 2 - T[k, i - 1] ** 2
            distance_profile[i] += _sum
    return distance_profile
+ """ + distance_profiles = List() + Q_is_constant = Q_stds <= AEON_NUMBA_STD_THRESHOLD + # Init distance profile array with unequal length support + for i_instance in range(len(QX)): + profile_length = QX[i_instance].shape[1] + distance_profiles.append(np.zeros(profile_length)) + + for _i_instance in prange(len(QX)): + # iterator is uint64 with prange and parallel so cast to int to avoid warnings + i_instance = np.int64(_i_instance) + distance_profiles[i_instance] = _normalized_squared_dist_profile_one_series( + QX[i_instance], + X_means[i_instance], + X_stds[i_instance], + Q_means, + Q_stds, + query_length, + Q_is_constant, + ) + return distance_profiles + + +@njit(cache=True, fastmath=True) +def _normalized_squared_dist_profile_one_series( + QT, T_means, T_stds, Q_means, Q_stds, query_length, Q_is_constant +): + """ + Compute the z-normalized squared Euclidean distance profile for one time series. + + Parameters + ---------- + QT : np.ndarray, 2D array of shape (n_channels, n_timepoints - query_length + 1) + The dot product between the query and the time series. + T_means : np.ndarray, 1D array of length n_channels + The mean values of the time series for each channel. + T_stds : np.ndarray, 2D array of shape (n_channels, profile_length) + The standard deviations of the time series for each channel and position. + Q_means : np.ndarray, 1D array of shape (n_channels) + Means of the query q + Q_stds : np.ndarray, 1D array of shape (n_channels) + Stds of the query q + query_length : int + The length of the query subsequence used for the distance profile computation. + Q_is_constant : np.ndarray + 1D array of shape (n_channels,) where each element is a Boolean indicating + whether the query standard deviation for that channel is less than or equal + to a specified threshold. 
+ + Returns + ------- + np.ndarray + 2D array of shape (n_channels, n_timepoints - query_length + 1) containing the + z-normalized squared distance profile between the query subsequence and the time + series. Entries are computed based on the z-normalized values, with special + handling for constant values. + """ + n_channels, profile_length = QT.shape + distance_profile = np.zeros(profile_length) + + for i in prange(profile_length): + Sub_is_constant = T_stds[:, i] <= AEON_NUMBA_STD_THRESHOLD + for k in prange(n_channels): + # Two Constant case + if Q_is_constant[k] and Sub_is_constant[k]: + _val = 0 + # One Constant case + elif Q_is_constant[k] or Sub_is_constant[k]: + _val = query_length + else: + denom = query_length * Q_stds[k] * T_stds[k, i] + + p = (QT[k, i] - query_length * (Q_means[k] * T_means[k, i])) / denom + p = min(p, 1.0) + + _val = abs(2 * query_length * (1.0 - p)) + distance_profile[i] += _val + + return distance_profile diff --git a/aeon/similarity_search/subsequence_search/base.py b/aeon/similarity_search/subsequence_search/base.py new file mode 100644 index 0000000000..1d49004472 --- /dev/null +++ b/aeon/similarity_search/subsequence_search/base.py @@ -0,0 +1,358 @@ +"""Base class for subsequence search.""" + +__maintainer__ = ["baraline"] + +import warnings +from abc import abstractmethod +from typing import Optional, final + +import numpy as np +from numba import get_num_threads, set_num_threads +from numba.typed import List + +from aeon.similarity_search.base import BaseSimilaritySearch +from aeon.similarity_search.subsequence_search._commons import ( + _extract_top_k_from_dist_profile, + _inverse_distance_profile_list, +) +from aeon.utils.numba.general import sliding_mean_std_one_series + + +# We can define a BaseVariableLengthSubsequenceSearch later for VALMOD and the likes. +class BaseSubsequenceSearch(BaseSimilaritySearch): + """ + Base class for similarity search on time series subsequences. 
+ + Parameters + ---------- + length : int + The length of the subsequence to be considered. + normalize : bool, optional + Whether the inputs should be z-normalized. The default is False. + n_jobs : int, optional + Number of parallel jobs to use. The default is 1. + """ + + @abstractmethod + def __init__( + self, + length: int, + normalize: Optional[bool] = False, + n_jobs: Optional[int] = 1, + ): + self.length = length + super().__init__(n_jobs=n_jobs, normalize=normalize) + + @final + def find_motifs( + self, + k: int, + threshold: float, + X: Optional[np.ndarray] = None, + allow_overlap: Optional[bool] = False, + exclusion_factor: Optional[float] = 2.0, + ): + """ + Find the top-k motifs in the training data. + + Given ``k`` and ``threshold`` parameters, this methods returns the top-k motif + sets. We define a motif set as a set of candidates which all are at a distance + of at most ``threshold`` from each other. The top-k motifs sets are the + motif sets with the most candidates. + + Parameters + ---------- + k : int, optional + Number of motifs to return + threshold : int, optional + A threshold on the similarity measure to determine which candidates will be + part of a motif set. + X : np.ndarray, 2D array of shape (n_channels, n_timestamps), optional + A series in which we want to indentify motifs. If provided, the motifs + extracted should appear in X and in the database given in fit. If not + provided, the motifs will be extracted only from the database given in fit. + allow_overlap: bool, optional + Wheter a candidate can be part of multiple motif sets (True), or if motif + sets should be mutually exclusive (False). + exclusion_factor : float, default=2. + A factor of the query length used to define the exclusion zone when + ``allow_overlap`` is set to False. For a given timestamp, the exclusion zone + starts from :math:`id_timestamp - query_length//exclusion_factor` and end at + :math:`id_timestamp + query_length//exclusion_factor`. 
+ + Returns + ------- + list of ndarray, shape=(k,) + A list of at most ``k`` numpy arrays containing the indexes of the + candidates in each motif. + + """ + self._check_is_fitted() + + @final + def find_neighbors( + self, + X: np.ndarray, + k: Optional[int] = 1, + threshold: Optional[float] = np.inf, + inverse_distance: Optional[bool] = False, + X_index: Optional[np.ndarray] = None, + allow_overlap: Optional[bool] = False, + exclusion_factor: Optional[float] = 2.0, + ): + """ + Find the top-k neighbors of X in the database. + + Given ``k`` and ``threshold`` parameters, this methods returns the top-k + neighbors of X, such as each of the ``k`` neighbors as a distance inferior or + equal to ``threshold``. By default, ``threshold`` is set to infinity. It is + possible for this method to return less than ``k`` neighbors, either if there + is less than ``k`` admissible candidate in the database, or if in the top-k + candidates, some do not meet the ``threshold`` condition. + + Parameters + ---------- + X : np.ndarray, 2D array of shape (n_channels, length) + The subsequence for which we want to identify nearest neighbors in the + database. + k : int, optional + Number of neighbors to return. + threshold : int, optional + A threshold on the distance to determine which candidates will be returned. + inverse_distance : bool, optional + Wheter to inverse the computed distance, meaning that the method will return + the k most dissimilar neighbors instead of the k most similar. + X_index : np.ndarray, shape=(2,), optional + If ``X`` is a subsequence of the database given in fit, specify its starting + index as (i_case, i_timestamp). If specified, this subsequence and the + neighboring ones (according to ``exclusion_factor``) won't be considered as + admissible candidates. + allow_overlap: bool, optional + Wheter the top-k candidates can be neighboring subsequences. + exclusion_factor : float, default=2. 
+ A factor of the query length used to define the exclusion zone when + ``allow_overlap`` is set to False. For a given timestamp, the exclusion zone + starts from :math:`id_timestamp - query_length//exclusion_factor` and end at + :math:`id_timestamp + query_length//exclusion_factor`. + + Returns + ------- + ndarray, shape=(k,) + A numpy array of at most ``k`` elements containing the indexes of the + neighbors. + ndarray, shape=(k,) + A numpy array of at most ``k`` elements containing the distances of the + neighbors to X. + + """ + self._check_is_fitted() + if self.length != X.shape[1] or self.n_channels_ != X.shape[0]: + raise ValueError( + f"Expected a subsequence of shape {(self.n_channels_, self.length)} but" + f" got {X.shape}" + ) + self._check_X_index(X_index) + prev_threads = get_num_threads() + set_num_threads(self._n_jobs) + neighbors, distances = self._find_neighbors( + X, + k=k, + threshold=threshold, + inverse_distance=inverse_distance, + X_index=X_index, + allow_overlap=allow_overlap, + exclusion_factor=exclusion_factor, + ) + set_num_threads(prev_threads) + if len(neighbors) < k: + warnings.warn( + f"The number of admissible neighbors found is {len(neighbors)}, instead" + f" of {k}", + stacklevel=2, + ) + return neighbors, distances + + def _check_X_index(self, X_index: np.ndarray): + """ + Check wheter the X_index parameter is correctly formated and is admissible. + + Parameters + ---------- + X_index : np.ndarray, 1D array of shape (2) + Array of integer containing the sample and timestamp identifiers of the + starting point of a subsequence in X_. + + Returns + ------- + X_index : np.ndarray, 1D array of shape (2) + Array of integer containing the sample and timestamp identifiers of the + starting point of a subsequence in X_. 
+ + """ + if X_index is not None: + if ( + isinstance(X_index, list) + and len(X_index) == 2 + and isinstance(X_index[0], int) + and isinstance(X_index[1], int) + ): + X_index = np.asarray(X_index, dtype=int) + elif len(X_index) != 2: + raise ValueError( + "Expected a numpy array or list of integers with 2 elements " + f"for X_index but got {X_index}" + ) + elif ( + not (isinstance(X_index[0], int) and isinstance(X_index[1], int)) + or X_index.dtype != int + ): + raise TypeError( + "Expected a numpy array or list of integers for X_index but got " + f"{X_index}" + ) + + if X_index[0] >= self.n_cases_: + raise ValueError( + "The sample ID (first element) of X_index cannot exced the number " + "of series in the collection given during fit. Expected a value " + f"between [0, {self.n_cases_ - 1}] but got {X_index[0]}" + ) + _max_timestamp = self.X_[X_index[0]].shape[1] - self.length + 1 + if X_index[1] >= _max_timestamp: + raise ValueError( + "The timestamp ID (second element) of X_index cannot exced the " + "number of timestamps minus the length parameter plus one. Expected" + f" a value between [0, {_max_timestamp - 1}] but got {X_index[1]}" + ) + return X_index + + def _compute_mean_std_from_collection(self, X: np.ndarray): + """ + Compute the mean and std of each subsequence of size ``length`` in X. + + Parameters + ---------- + X : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) + Collection of series from which we extract mean and stds. If it is an + unequal length collection, it should be a list of 2d numpy arrays. + + Returns + ------- + Tuple(np.ndarray, np.ndarray) + Both array are of shape (n_cases, n_timepoints-length+1, n_channels), + the first contains the means and the second the stds for each subsequence + of size ``length`` in X. 
class BaseMatrixProfile(BaseSubsequenceSearch):
    """Base class for Matrix Profile methods using a length parameter."""

    def _fit(self, X, y=None):
        """
        Store the collection and, if normalizing, its sliding means and stds.

        Parameters
        ----------
        X : collection of 2D arrays of shape (n_channels, n_timepoints)
            The series in which the search will be performed.
        y : ignored
            Not used by this method.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``length`` is greater than ``min_timepoints_``.
        """
        # A query as long as the shortest series is admissible (one candidate)
        if self.length > self.min_timepoints_:
            raise ValueError(
                "The length of the query should be inferior or equal to the length of "
                "data (X_) provided during fit, but got {} for X and {} for X_".format(
                    self.length, self.min_timepoints_
                )
            )

        if self.normalize:
            self.X_means_, self.X_stds_ = self._compute_mean_std_from_collection(X)
        self.X_ = X
        return self

    def _find_motifs(self):
        """Find motifs in the database. Not implemented yet."""
        raise NotImplementedError()

    def _find_neighbors(
        self,
        X: np.ndarray,
        k: Optional[int] = 1,
        threshold: Optional[float] = np.inf,
        inverse_distance: Optional[bool] = False,
        X_index=None,
        allow_overlap: Optional[bool] = False,
        exclusion_factor: Optional[float] = 2.0,
    ):
        """
        Find the top-k neighbors of X in the database.

        The distance profiles of ``X`` to every series of ``X_`` are computed with
        ``compute_distance_profile``, then the best candidates are extracted with
        ``_extract_top_k_from_dist_profile``. Less than ``k`` neighbors can be
        returned.

        Parameters
        ----------
        X : np.ndarray, 2D array of shape (n_channels, length)
            The subsequence for which we want to identify nearest neighbors in the
            database.
        k : int, optional
            Number of neighbors to return.
        threshold : float, optional
            A threshold on the distance to determine which candidates will be
            returned.
        inverse_distance : bool, optional
            Whether to inverse the computed distance, meaning that the method will
            return the k most dissimilar neighbors instead of the k most similar.
        X_index : np.ndarray, shape=(2,), optional
            If ``X`` is a subsequence of the database given in fit, specify its
            starting index as (i_case, i_timestamp). If specified, this subsequence
            and the neighboring ones (according to ``exclusion_factor``) won't be
            considered as admissible candidates.
        allow_overlap : bool, optional
            Whether the top-k candidates can be neighboring subsequences.
        exclusion_factor : float, default=2.
            A factor of the query length used to define the exclusion zone when
            ``allow_overlap`` is set to False. For a given timestamp, the exclusion
            zone starts from :math:`id_timestamp - query_length//exclusion_factor`
            and end at :math:`id_timestamp + query_length//exclusion_factor`.

        Returns
        -------
        tuple
            The indexes and the distances returned by
            ``_extract_top_k_from_dist_profile``.
        """
        # ``//`` with a float factor returns a float, which cannot index a slice
        exclusion_size = int(self.length // exclusion_factor)
        dist_profiles = self.compute_distance_profile(X)

        if inverse_distance:
            dist_profiles = _inverse_distance_profile_list(dist_profiles)

        # Deal with self-matches
        if X_index is not None:
            n_candidates = self.X_[X_index[0]].shape[1] - self.length + 1
            lb = max(0, X_index[1] - exclusion_size)
            # The slice end is exclusive: +1 so that the upper bound of the zone
            # and the last candidate of the series are masked too.
            ub = min(X_index[1] + exclusion_size + 1, n_candidates)
            dist_profiles[X_index[0]][lb:ub] = np.inf

        return _extract_top_k_from_dist_profile(
            dist_profiles,
            k,
            threshold,
            allow_overlap,
            exclusion_size,
        )

    @abstractmethod
    def compute_matrix_profile(self, X: Optional[np.ndarray] = None):
        """Compute matrix profiles between X_ and X or between all series in X_."""
        ...

    @abstractmethod
    def compute_distance_profile(self, X: np.ndarray):
        """Compute distance profiles between X_ and X (a series of size length)."""
        ...
def test_fft_sliding_dot_product():
    """Test the fft_sliding_dot_product function."""
    L = 4
    X = make_example_2d_numpy_series(n_channels=1, n_timepoints=10)
    Q = make_example_2d_numpy_series(n_channels=1, n_timepoints=L)

    values = fft_sliding_dot_product(X, Q)
    # The reference windows must have the same length as the query, otherwise
    # np.dot receives mismatched shapes.
    # Compare values[0] only as input is univariate
    assert_array_almost_equal(
        values[0],
        [np.dot(Q[0], X[0, i : i + L]) for i in range(X.shape[1] - L + 1)],
    )
assert_array_almost_equal( + values[0], + [np.dot(Q[0, 4 : 4 + L], X[0, i : i + L]) for i in range(X.shape[1] - L + 1)], + ) + + +def test__inverse_distance_profile_list(): + """Test method to inverse a TypedList of distance profiles.""" + X = make_example_2d_numpy_list(n_cases=2, return_y=False) + T = _inverse_distance_profile_list(List(X)) + assert_array_almost_equal(1 / (X[0] + 1e-8), T[0]) + assert_array_almost_equal(1 / (X[1] + 1e-8), T[1]) + + +def test__extract_top_k_from_dist_profile(): + """Test method to esxtract the top k candidates from a list of distance profiles.""" + ... diff --git a/aeon/similarity_search/subsequence_search/tests/test_stomp.py b/aeon/similarity_search/subsequence_search/tests/test_stomp.py new file mode 100644 index 0000000000..169eee135e --- /dev/null +++ b/aeon/similarity_search/subsequence_search/tests/test_stomp.py @@ -0,0 +1,238 @@ +"""Tests for stomp algorithm.""" + +__maintainer__ = ["baraline"] + +import numpy as np +import pytest +from numba.typed import List +from numpy.testing import assert_almost_equal, assert_array_almost_equal + +from aeon.similarity_search.subsequence_search._commons import get_ith_products +from aeon.similarity_search.subsequence_search._stomp import ( + _normalized_squared_dist_profile_one_series, + _normalized_squared_distance_profile, + _squared_dist_profile_one_series, + _squared_distance_profile, + _stomp, + _stomp_normalized, + _update_dot_products_one_series, +) +from aeon.testing.data_generation import ( + make_example_2d_numpy_series, + make_example_3d_numpy, + make_example_3d_numpy_list, +) +from aeon.utils.numba.general import ( + sliding_mean_std_one_series, + z_normalise_series_2d_with_mean_std, +) + +K_VALUES = [1, 3] + + +def _get_mean_sdts_inputs(X, Q, L): + X_means = [] + X_stds = [] + + for i_x in range(len(X)): + _mean, _std = sliding_mean_std_one_series(X[i_x], L, 1) + X_stds.append(_std) + X_means.append(_mean) + + Q_means = Q.mean(axis=1) + Q_stds = Q.std(axis=1) + + return 
X_means, X_stds, Q_means, Q_stds + + +def test__update_dot_products_one_series(): + """Test the _update_dot_product function.""" + X = make_example_2d_numpy_series(n_channels=1, n_timepoints=20) + T = make_example_2d_numpy_series(n_channels=1, n_timepoints=10) + L = 7 + current_product = get_ith_products(X, T, L, 0) + for i_query in range(1, T.shape[1] - L + 1): + new_product = get_ith_products( + X, + T, + L, + i_query, + ) + current_product = _update_dot_products_one_series( + X, + T, + current_product, + L, + i_query, + ) + assert_array_almost_equal(new_product, current_product) + + +def test__squared_dist_profile_one_series(): + """Test Euclidean distance.""" + L = 3 + X = make_example_2d_numpy_series(n_channels=1, n_timepoints=10) + Q = make_example_2d_numpy_series(n_channels=1, n_timepoints=L) + QX = get_ith_products(X, Q, L, 0) + dist_profile = _squared_dist_profile_one_series(QX, X, Q) + for i_t in range(X.shape[1] - L + 1): + assert_almost_equal(dist_profile[i_t], np.sum((X[:, i_t : i_t + L] - Q) ** 2)) + + +def test__normalized_squared_dist_profile_one_series(): + """Test Euclidean distance.""" + L = 3 + X = make_example_2d_numpy_series(n_channels=1, n_timepoints=10) + Q = make_example_2d_numpy_series(n_channels=1, n_timepoints=L) + QX = get_ith_products(X, Q, L, 0) + X_mean, X_std = sliding_mean_std_one_series(X, L, 1) + Q_mean = Q.mean(axis=1) + Q_std = Q.std(axis=1) + + dist_profile = _normalized_squared_dist_profile_one_series( + QX, X_mean, X_std, Q_mean, Q_std, L, Q.std(axis=1) <= 0 + ) + Q = z_normalise_series_2d_with_mean_std(Q, Q_mean, Q_std) + for i_t in range(X.shape[1] - L + 1): + S = z_normalise_series_2d_with_mean_std( + X[:, i_t : i_t + L], X_mean[:, i_t], X_std[:, i_t] + ) + assert_almost_equal(dist_profile[i_t], np.sum((S - Q) ** 2)) + + +def test__squared_distance_profile(): + """Test Euclidean distance profile calculation.""" + L = 3 + X = make_example_3d_numpy(n_cases=3, n_channels=1, n_timepoints=10, return_y=False) + Q = 
make_example_2d_numpy_series(n_channels=1, n_timepoints=L) + QX = np.asarray([get_ith_products(X[i_x], Q, L, 0) for i_x in range(len(X))]) + dist_profiles = _squared_distance_profile(QX, X, Q) + for i_x in range(len(X)): + for i_t in range(X[i_x].shape[1] - L + 1): + assert_almost_equal( + dist_profiles[i_x][i_t], np.sum((X[i_x, :, i_t : i_t + L] - Q) ** 2) + ) + + # test unequal length and multivariate + X = List( + make_example_3d_numpy_list( + n_cases=3, + n_channels=2, + min_n_timepoints=10, + max_n_timepoints=20, + return_y=False, + ) + ) + + Q = make_example_2d_numpy_series(n_channels=2, n_timepoints=L) + QX = List([get_ith_products(X[i_x], Q, L, 0) for i_x in range(len(X))]) + dist_profiles = _squared_distance_profile(QX, X, Q) + for i_x in range(len(X)): + for i_t in range(X[i_x].shape[1] - L + 1): + assert_almost_equal( + dist_profiles[i_x][i_t], np.sum((X[i_x][:, i_t : i_t + L] - Q) ** 2) + ) + + +def test__normalized_squared_distance_profile(): + """Test Euclidean distance profile calculation.""" + L = 3 + X = make_example_3d_numpy(n_cases=3, n_channels=1, n_timepoints=10, return_y=False) + Q = make_example_2d_numpy_series(n_channels=1, n_timepoints=L) + QX = np.asarray([get_ith_products(X[i_x], Q, L, 0) for i_x in range(len(X))]) + + X_means, X_stds, Q_means, Q_stds = _get_mean_sdts_inputs(X, Q, L) + + X_means = np.asarray(X_means) + X_stds = np.asarray(X_stds) + + dist_profiles = _normalized_squared_distance_profile( + QX, X_means, X_stds, Q_means, Q_stds, L + ) + + Q_norm = z_normalise_series_2d_with_mean_std(Q, Q_means, Q_stds) + for i_x in range(len(X)): + for i_t in range(X[i_x].shape[1] - L + 1): + X_sub_norm = z_normalise_series_2d_with_mean_std( + X[i_x, :, i_t : i_t + L], X_means[i_x][:, i_t], X_stds[i_x][:, i_t] + ) + assert_almost_equal( + dist_profiles[i_x][i_t], np.sum((X_sub_norm - Q_norm) ** 2) + ) + + # test unequal length and multivariate + X = List( + make_example_3d_numpy_list( + n_cases=5, + n_channels=2, + min_n_timepoints=10, + 
max_n_timepoints=20, + return_y=False, + ) + ) + Q = make_example_2d_numpy_series(n_channels=2, n_timepoints=L) + + QX = List([get_ith_products(X[i_x], Q, L, 0) for i_x in range(len(X))]) + + X_means, X_stds, Q_means, Q_stds = _get_mean_sdts_inputs(X, Q, L) + # Convert to numba typed list + X_means = List(X_means) + X_stds = List(X_stds) + + dist_profiles = _normalized_squared_distance_profile( + QX, X_means, X_stds, Q_means, Q_stds, L + ) + + Q_norm = z_normalise_series_2d_with_mean_std(Q, Q_means, Q_stds) + for i_x in range(len(X)): + for i_t in range(X[i_x].shape[1] - L + 1): + X_sub_norm = z_normalise_series_2d_with_mean_std( + X[i_x][:, i_t : i_t + L], X_means[i_x][:, i_t], X_stds[i_x][:, i_t] + ) + assert_almost_equal( + dist_profiles[i_x][i_t], np.sum((X_sub_norm - Q_norm) ** 2) + ) + + +# K_VALUES = [1, 3] +@pytest.mark.parametrize("k", K_VALUES) +def test__stomp(k): + """Test STOMP method.""" + L = 3 + X = np.array([[[1, 2, 3, 2, 1, 2, 3, 4, 5, 2, 1, 2, 2]]]) + T = np.array([[1, 1, 3, 2, 2]]) + XdotT = np.asarray([get_ith_products(X[i_x], T, L, 0) for i_x in range(len(X))]) + + T_index = None + allow_overlap = False + threshold = np.inf + exclusion_size = L + inverse_distance = False + + MP, IP = _stomp( + X, + T, + XdotT, + L, + T_index, + k, + allow_overlap, + threshold, + exclusion_size, + inverse_distance, + ) + Expected_MP = [[1, 6], [1, 2], [1, 2, 5]] + Expected_IP = [[[0, 0], [0, 1]], [[0, 1], [0, 0]], [[0, 2], [0, 1], [0, 0]]] + for i in range(len(Expected_MP)): + assert_array_almost_equal(Expected_IP[i], IP[i]) + assert_array_almost_equal(Expected_MP[i], MP[i]) + + +@pytest.mark.parametrize("k", K_VALUES) +def test__stomp_normalized(k): + """Test STOMP normalized method.""" + _stomp_normalized + ... 
+ + +# TODO : add tests for StompMatrixProfile diff --git a/aeon/similarity_search/tests/test__commons.py b/aeon/similarity_search/tests/test__commons.py deleted file mode 100644 index a97519ad31..0000000000 --- a/aeon/similarity_search/tests/test__commons.py +++ /dev/null @@ -1,49 +0,0 @@ -"""Test _commons.py functions.""" - -__maintainer__ = ["baraline"] - -import numpy as np -from numpy.testing import assert_array_almost_equal - -from aeon.similarity_search._commons import ( - fft_sliding_dot_product, - naive_squared_distance_profile, - naive_squared_matrix_profile, -) - - -def test_fft_sliding_dot_product(): - """Test the fft_sliding_dot_product function.""" - X = np.random.rand(1, 10) - q = np.random.rand(1, 5) - - values = fft_sliding_dot_product(X, q) - - assert_array_almost_equal( - values[0], - [np.dot(q[0], X[0, i : i + 5]) for i in range(X.shape[1] - 5 + 1)], - ) - - -def test_naive_squared_distance_profile(): - """Test naive squared distance profile computation is correct.""" - X = np.zeros((1, 1, 6)) - X[0, 0] = np.arange(6) - Q = np.array([[1, 2, 3]]) - query_length = Q.shape[1] - mask = np.ones((X.shape[0], X.shape[2] - query_length + 1), dtype=bool) - dist_profile = naive_squared_distance_profile(X, Q, mask) - assert_array_almost_equal(dist_profile[0], np.array([3.0, 0.0, 3.0, 12.0])) - - -def test_naive_squared_matrix_profile(): - """Test naive squared matrix profile computation is correct.""" - X = np.zeros((1, 1, 6)) - X[0, 0] = np.arange(6) - Q = np.zeros((1, 6)) - - Q[0] = np.arange(6, 12) - query_length = 3 - mask = np.ones((X.shape[0], X.shape[2] - query_length + 1), dtype=bool) - matrix_profile = naive_squared_matrix_profile(X, Q, query_length, mask) - assert_array_almost_equal(matrix_profile, np.array([27.0, 48.0, 75.0, 108.0])) diff --git a/aeon/similarity_search/tests/test_query_search.py b/aeon/similarity_search/tests/test_query_search.py deleted file mode 100644 index f97f6a50bf..0000000000 --- 
a/aeon/similarity_search/tests/test_query_search.py +++ /dev/null @@ -1,176 +0,0 @@ -"""Tests for QuerySearch.""" - -__maintainer__ = ["baraline"] - -import numpy as np -import pytest -from numpy.testing import assert_almost_equal, assert_array_equal - -from aeon.similarity_search.query_search import QuerySearch - -DATATYPES = ["int64", "float64"] - - -@pytest.mark.parametrize("dtype", DATATYPES) -def test_QuerySearch_mean_std_equal_length(dtype): - """Test the mean and std computation of QuerySearch.""" - X = np.asarray( - [[[1, 2, 3, 4, 5, 6, 7, 8]], [[1, 2, 4, 4, 5, 6, 5, 4]]], dtype=dtype - ) - q = np.asarray([[3, 4, 5]], dtype=dtype) - - search = QuerySearch(normalise=True) - search.fit(X) - _ = search.predict(q, X_index=(1, 2)) - for i in range(len(X)): - for j in range(X[i].shape[1] - q.shape[1] + 1): - subsequence = X[i, :, j : j + q.shape[1]] - assert_almost_equal(search.X_means_[i][:, j], subsequence.mean(axis=-1)) - assert_almost_equal(search.X_stds_[i][:, j], subsequence.std(axis=-1)) - - -@pytest.mark.parametrize("dtype", DATATYPES) -def test_QuerySearch_mean_std_unequal_length(dtype): - """Test the mean and std computation of QuerySearch on unequal length data.""" - X = [ - np.array([[1, 2, 3, 4, 5, 6, 7, 8]], dtype=dtype), - np.array([[1, 2, 4, 4, 5, 6, 5]], dtype=dtype), - ] - - q = np.asarray([[3, 4, 5]], dtype=dtype) - - search = QuerySearch(normalise=True) - search.fit(X) - _ = search.predict(q, X_index=(1, 2)) - for i in range(len(X)): - for j in range(X[i].shape[1] - q.shape[1] + 1): - subsequence = X[i][:, j : j + q.shape[1]] - assert_almost_equal(search.X_means_[i][:, j], subsequence.mean(axis=-1)) - assert_almost_equal(search.X_stds_[i][:, j], subsequence.std(axis=-1)) - - -@pytest.mark.parametrize("dtype", DATATYPES) -def test_QuerySearch_threshold_and_k(dtype): - """Test the k and threshold combination of QuerySearch.""" - X = np.asarray( - [[[1, 2, 3, 4, 5, 6, 7, 8]], [[1, 2, 4, 4, 5, 6, 5, 4]]], dtype=dtype - ) - q = np.asarray([[3, 4, 
5]], dtype=dtype) - - search = QuerySearch(k=3, threshold=1) - search.fit(X) - dist, idx = search.predict(q) - assert_array_equal(idx, [(0, 2), (1, 2)]) - - -@pytest.mark.parametrize("dtype", DATATYPES) -def test_QuerySearch_inverse_distance(dtype): - """Test the inverse distance parameter of QuerySearch.""" - X = np.asarray( - [[[1, 2, 3, 4, 5, 6, 7, 8]], [[1, 2, 4, 4, 5, 6, 5, 4]]], dtype=dtype - ) - q = np.asarray([[3, 4, 5]], dtype=dtype) - - search = QuerySearch(k=1, inverse_distance=True) - search.fit(X) - _, idx = search.predict(q) - assert_array_equal(idx, [(0, 5)]) - - -@pytest.mark.parametrize("dtype", DATATYPES) -def test_QuerySearch_euclidean(dtype): - """Test the functionality of QuerySearch with Euclidean distance.""" - X = np.asarray( - [[[1, 2, 3, 4, 5, 6, 7, 8]], [[1, 2, 4, 4, 5, 6, 5, 4]]], dtype=dtype - ) - q = np.asarray([[3, 4, 5]], dtype=dtype) - - search = QuerySearch(k=1) - search.fit(X) - _, idx = search.predict(q) - assert_array_equal(idx, [(0, 2)]) - - search = QuerySearch(k=3) - search.fit(X) - _, idx = search.predict(q) - assert_array_equal(idx, [(0, 2), (1, 2), (1, 1)]) - - _, idx = search.predict(q, apply_exclusion_to_result=True) - assert_array_equal(idx, [(0, 2), (1, 2), (1, 4)]) - - search = QuerySearch(k=1, normalise=True) - search.fit(X) - q = np.asarray([[8, 8, 10]], dtype=dtype) - _, idx = search.predict(q) - assert_array_equal(idx, [(1, 2)]) - - _, idx = search.predict(q, apply_exclusion_to_result=True) - assert_array_equal(idx, [(1, 2)]) - - search = QuerySearch(k=1, normalise=True) - search.fit(X) - _, idx = search.predict(q, X_index=(1, 2)) - assert_array_equal(idx, [(1, 0)]) - - -@pytest.mark.parametrize("dtype", DATATYPES) -def test_QuerySearch_euclidean_unequal_length(dtype): - """Test the functionality of QuerySearch on unequal length data.""" - X = [ - np.array([[1, 2, 3, 4, 5, 6, 7, 8]], dtype=dtype), - np.array([[1, 2, 4, 4, 5, 6, 5]], dtype=dtype), - ] - - q = np.asarray([[3, 4, 5]], dtype=dtype) - - search = 
QuerySearch(k=1) - search.fit(X) - _, idx = search.predict(q) - assert_array_equal(idx, [(0, 2)]) - - search = QuerySearch(k=3) - search.fit(X) - _, idx = search.predict(q) - assert_array_equal(idx, [(0, 2), (1, 2), (1, 1)]) - - _, idx = search.predict(q, apply_exclusion_to_result=True) - assert_array_equal(idx, [(0, 2), (1, 2), (1, 4)]) - - search = QuerySearch(k=1, normalise=True) - search.fit(X) - q = np.asarray([[8, 8, 10]], dtype=dtype) - _, idx = search.predict(q) - assert_array_equal(idx, [(1, 2)]) - - _, idx = search.predict(q, apply_exclusion_to_result=True) - assert_array_equal(idx, [(1, 2)]) - - search = QuerySearch(k=1, normalise=True) - search.fit(X) - _, idx = search.predict(q, X_index=(1, 2)) - assert_array_equal(idx, [(1, 0)]) - - -@pytest.mark.parametrize("dtype", DATATYPES) -def test_QuerySearch_speedup(dtype): - """Test the speedup functionality of QuerySearch.""" - X = np.asarray( - [[[1, 2, 3, 4, 5, 6, 7, 8]], [[1, 2, 4, 4, 5, 6, 5, 4]]], dtype=dtype - ) - q = np.asarray([[3, 4, 5]], dtype=dtype) - - search = QuerySearch(k=1, speed_up="fastest") - search.fit(X) - _, idx = search.predict(q) - assert_array_equal(idx, [(0, 2)]) - - search = QuerySearch( - k=1, - distance="euclidean", - speed_up="fastest", - normalise=True, - ) - search.fit(X) - q = np.asarray([[8, 8, 10]], dtype=dtype) - _, idx = search.predict(q) - assert_array_equal(idx, [(1, 2)]) diff --git a/aeon/similarity_search/tests/test_series_search.py b/aeon/similarity_search/tests/test_series_search.py deleted file mode 100644 index a10109359c..0000000000 --- a/aeon/similarity_search/tests/test_series_search.py +++ /dev/null @@ -1,74 +0,0 @@ -"""Tests for SeriesSearch similarity search algorithm.""" - -__maintainer__ = ["baraline"] - - -import numpy as np -import pytest - -from aeon.similarity_search.series_search import SeriesSearch - -DATATYPES = ["int64", "float64"] -K_VALUES = [1, 3] -normalise = [True, False] - -# See #2236 -# @pytest.mark.parametrize("k", K_VALUES) -# 
@pytest.mark.parametrize("normalise", normalise) -# def test_SeriesSearch_k(k, normalise): -# """Test the k and threshold combination of SeriesSearch.""" -# X = np.asarray([[[1, 2, 3, 4, 5, 6, 7, 8]], [[1, 2, 4, 4, 5, 6, 5, 4]]]) -# S = np.asarray([[3, 4, 5, 4, 3, 4]]) -# L = 3 -# -# search = SeriesSearch(k=k, normalise=normalise) -# search.fit(X) -# mp, ip = search.predict(S, L) -# -# assert mp[0].shape[0] == ip[0].shape[0] == k -# assert len(mp) == len(ip) == S.shape[1] - L + 1 -# assert ip[0].shape[1] == 2 - - -@pytest.mark.parametrize("dtype", DATATYPES) -def test_SeriesSearch_error_predict(dtype): - """Test the functionality of SeriesSearch with Euclidean distance.""" - X = np.asarray( - [[[1, 2, 3, 4, 5, 6, 7, 8]], [[1, 2, 4, 4, 5, 6, 5, 4]]], dtype=dtype - ) - S = np.asarray([[3, 4, 5, 4, 3, 4, 5]], dtype=dtype) - L = 100 - - search = SeriesSearch() - search.fit(X) - with pytest.raises(ValueError): - mp, ip = search.predict(S, L) - L = 3 - S = np.asarray( - [ - [3, 4, 5, 4, 3, 4], - [6, 5, 3, 2, 4, 5], - ], - dtype=dtype, - ) - with pytest.raises(ValueError): - mp, ip = search.predict(S, L) - - S = [6, 5, 3, 2, 4, 5] - with pytest.raises(TypeError): - mp, ip = search.predict(S, L) - - -@pytest.mark.parametrize("dtype", DATATYPES) -def test_SeriesSearch_process_unequal_length(dtype): - """Test the functionality of SeriesSearch on unequal length data.""" - X = [ - np.array([[1, 2, 3, 4, 5, 6, 7, 8]], dtype=dtype), - np.array([[1, 2, 4, 4, 5, 6, 5]], dtype=dtype), - ] - S = np.asarray([[3, 4, 5, 4, 3, 4]], dtype=dtype) - L = 3 - - search = SeriesSearch() - search.fit(X) - mp, ip = search.predict(S, L) diff --git a/aeon/utils/numba/general.py b/aeon/utils/numba/general.py index 10e96abde6..b398a8414b 100644 --- a/aeon/utils/numba/general.py +++ b/aeon/utils/numba/general.py @@ -8,7 +8,9 @@ "first_order_differences_3d", "z_normalise_series_with_mean", "z_normalise_series", + "z_normalise_series_with_mean_std", "z_normalise_series_2d", + 
"z_normalise_series_2d_with_mean_std", "z_normalise_series_3d", "set_numba_random_seed", "choice_log", @@ -20,6 +22,7 @@ "slope_derivative_2d", "slope_derivative_3d", "generate_combinations", + "get_all_subsequences", ] From 52b0692a26353f302570323d9b6d4498e09071db Mon Sep 17 00:00:00 2001 From: Antoine Guillaume Date: Thu, 5 Dec 2024 17:20:47 +0100 Subject: [PATCH 02/18] Update _brute_force.py --- aeon/similarity_search/subsequence_search/_brute_force.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/aeon/similarity_search/subsequence_search/_brute_force.py b/aeon/similarity_search/subsequence_search/_brute_force.py index 7f95083f3b..691b81367a 100644 --- a/aeon/similarity_search/subsequence_search/_brute_force.py +++ b/aeon/similarity_search/subsequence_search/_brute_force.py @@ -264,13 +264,6 @@ def _naive_squared_matrix_profile( if inverse_distance: dist_profiles = _inverse_distance_profile_list(dist_profiles) - # Deal with self-matches - if T_index is not None: - _max_timestamp = X[T_index].shape[1] - L - ub = min(i_q + exclusion_size, _max_timestamp) - lb = max(0, i_q - exclusion_size) - dist_profiles[T_index][lb:ub] = np.inf - top_dists, top_indexes = _extract_top_k_from_dist_profile( dist_profiles, k, From 7973a306c98c7c0b38fff56abd01e415732ca31d Mon Sep 17 00:00:00 2001 From: Antoine Guillaume Date: Thu, 5 Dec 2024 17:47:19 +0100 Subject: [PATCH 03/18] Update test__commons.py --- .../subsequence_search/tests/test__commons.py | 57 ++++++++++++++++++- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/aeon/similarity_search/subsequence_search/tests/test__commons.py b/aeon/similarity_search/subsequence_search/tests/test__commons.py index 23b07d78f7..b0e2764b0b 100644 --- a/aeon/similarity_search/subsequence_search/tests/test__commons.py +++ b/aeon/similarity_search/subsequence_search/tests/test__commons.py @@ -4,7 +4,7 @@ import numpy as np from numba.typed import List -from numpy.testing import assert_array_almost_equal +from 
numpy.testing import assert_array_almost_equal, assert_array_equal from aeon.similarity_search.subsequence_search._commons import ( _inverse_distance_profile_list, @@ -61,4 +61,57 @@ def test__inverse_distance_profile_list(): def test__extract_top_k_from_dist_profile(): """Test method to esxtract the top k candidates from a list of distance profiles.""" - ... + X = List([ + [5,4,3,3,1,3,2,5,1,4,1,0,1,2,2,7,8,1,5], + [5,1,0,1,0,0,5,4,3,5,6,1,4,2], + ]) + + top_k_indexes, top_k_distances = _extract_top_k_from_dist_profile( + X, + 1, + np.inf, + False, + 3 + ) + assert_array_equal(top_k_indexes, [[0,11]]) + assert_array_equal(top_k_indexes, [0]) + + top_k_indexes, top_k_distances = _extract_top_k_from_dist_profile( + X, + 5, + np.inf, + False, + 3 + ) + assert_array_equal(top_k_indexes, [[0,11],[1,2],[0,4],[0,17],[1,11]]) + assert_array_equal(top_k_indexes, [0,0,1,1,1]) + + top_k_indexes, top_k_distances = _extract_top_k_from_dist_profile( + X, + 5, + np.inf, + True, + 3 + ) + assert_array_equal(top_k_indexes, [[0,11],[1,2],[1,4],[1,5],[0,4]]) + assert_array_equal(top_k_indexes, [0,0,0,0,1]) + + top_k_indexes, top_k_distances = _extract_top_k_from_dist_profile( + X, + 5, + 0.5, + True, + 3 + ) + assert_array_equal(top_k_indexes, [[0,11],[1,2],[1,4],[1,5]]) + assert_array_equal(top_k_indexes, [0,0,0,0]) + + top_k_indexes, top_k_distances = _extract_top_k_from_dist_profile( + X, + 5, + 0.5, + False, + 3 + ) + assert_array_equal(top_k_indexes, [[0,11],[1,2]]) + assert_array_equal(top_k_indexes, [0,0]) From ad02b8466fb51e2fde63f1bcfec43166d5026116 Mon Sep 17 00:00:00 2001 From: baraline Date: Fri, 27 Dec 2024 09:05:05 +0100 Subject: [PATCH 04/18] WIP mock and test --- .../series_search/__init__.py | 8 +- .../subsequence_search/__init__.py | 8 +- .../subsequence_search/_brute_force.py | 140 +++++++----- .../subsequence_search/_commons.py | 48 +++- .../subsequence_search/_stomp.py | 211 ++++++++++-------- .../subsequence_search/base.py | 172 +++++++++++--- 
.../tests/test_brute_force.py | 155 +++++++++++++ .../{test__commons.py => test_commons.py} | 84 +++---- .../subsequence_search/tests/test_stomp.py | 166 +++++++++++--- aeon/testing/mock_estimators/__init__.py | 3 - .../_mock_similarity_search.py | 21 -- .../_mock_similarity_searchers.py | 72 ++++++ aeon/utils/base/_register.py | 14 +- docs/api_reference/utils.rst | 1 - 14 files changed, 807 insertions(+), 296 deletions(-) create mode 100644 aeon/similarity_search/subsequence_search/tests/test_brute_force.py rename aeon/similarity_search/subsequence_search/tests/{test__commons.py => test_commons.py} (59%) delete mode 100644 aeon/testing/mock_estimators/_mock_similarity_search.py create mode 100644 aeon/testing/mock_estimators/_mock_similarity_searchers.py diff --git a/aeon/similarity_search/series_search/__init__.py b/aeon/similarity_search/series_search/__init__.py index f576c41f03..2f69dab51a 100644 --- a/aeon/similarity_search/series_search/__init__.py +++ b/aeon/similarity_search/series_search/__init__.py @@ -1,7 +1,5 @@ -"""Similarity search module.""" +"""Series search module.""" -__all__ = ["BaseSimilaritySearch", "QuerySearch", "SeriesSearch"] +__all__ = ["BaseSeriesSearch", "BaseIndexSearch"] -from aeon.similarity_search.base import BaseSimilaritySearch -from aeon.similarity_search.query_search import QuerySearch -from aeon.similarity_search.series_search import SeriesSearch +from aeon.similarity_search.series_search.base import BaseIndexSearch, BaseSeriesSearch diff --git a/aeon/similarity_search/subsequence_search/__init__.py b/aeon/similarity_search/subsequence_search/__init__.py index dfca8ee18c..c5de805eb6 100644 --- a/aeon/similarity_search/subsequence_search/__init__.py +++ b/aeon/similarity_search/subsequence_search/__init__.py @@ -1,5 +1,9 @@ -"""Similarity search module.""" +"""Subsequence search module.""" -__all__ = ["StompMatrixProfile"] +__all__ = ["BaseSubsequenceSearch", "BaseMatrixProfile", "StompMatrixProfile"] from 
aeon.similarity_search.subsequence_search._stomp import StompMatrixProfile +from aeon.similarity_search.subsequence_search.base import ( + BaseMatrixProfile, + BaseSubsequenceSearch, +) diff --git a/aeon/similarity_search/subsequence_search/_brute_force.py b/aeon/similarity_search/subsequence_search/_brute_force.py index 691b81367a..6c26925a32 100644 --- a/aeon/similarity_search/subsequence_search/_brute_force.py +++ b/aeon/similarity_search/subsequence_search/_brute_force.py @@ -16,8 +16,8 @@ from aeon.similarity_search.subsequence_search.base import BaseMatrixProfile from aeon.utils.numba.general import ( get_all_subsequences, + z_normalise_series_2d, z_normalise_series_3d, - z_normalize_series_2d, ) # TODO : check function params and make docstrings @@ -25,7 +25,7 @@ class BruteForceMatrixProfile(BaseMatrixProfile): - """.""" + """Estimator to compute matrix profile and distance profile using brute force.""" def compute_matrix_profile( self, @@ -33,37 +33,51 @@ def compute_matrix_profile( threshold, exclusion_size, inverse_distance, - allow_overlap, + allow_neighboring_matches, X: Optional[np.ndarray] = None, X_index: Optional[int] = None, ): """ - . + Compute matrix profiles. + + The matrix profiles are computed on the collection given in fit. If ``X`` is + not given, computes the matrix profile of each series in the collection. If it + is given, only computes it for ``X``. Parameters ---------- - k : TYPE - DESCRIPTION. - threshold : TYPE - DESCRIPTION. - exclusion_size : TYPE - DESCRIPTION. - inverse_distance : TYPE - DESCRIPTION. + k : int + The number of best matches to return during predict for each subsequence. + threshold : float + The number of best matches to return during predict for each subsequence. + inverse_distance : bool + If True, the matching will be made on the inverse of the distance, and thus, + the worst matches to the query will be returned instead of the best ones. 
+ exclusion_size : int + The size of the exclusion zone used to prevent returning as top k candidates + the ones that are close to each other (for example i and i+1). + It is used to define a region between + :math:`id_timestomp - exclusion_size` and + :math:`id_timestomp + exclusion_size` which cannot be returned + as best match if :math:`id_timestomp` was already selected. By default, + the value None means that this is not used. X : Optional[np.ndarray], optional - DESCRIPTION. The default is None. + The time series on which the matrix profile will be compute. + The default is None, meaning that the series in the collection given in fit + will be used instead. X_index : Optional[int], optional - DESCRIPTION. The default is None. - : TYPE - DESCRIPTION. + If ``X`` is a series of the database given in fit, specify its index in + ``X_``. If specified, each query of this series won't be able to match with + its neighboring subsequences. Returns ------- - MP : TYPE - DESCRIPTION. - IP : TYPE - DESCRIPTION. - + MP : array of shape (series_length - L + 1,) + Matrix profile distances for each query subsequence. If X is none, this + will be a list of MP for each series in X_. + IP : array of shape (series_length - L + 1,) + Indexes of the top matches for each query subsequence. If X is none, this + will be a list of MP for each series in X_. 
""" # pairwise if none if X is None: @@ -87,11 +101,11 @@ def compute_matrix_profile( self.length, X_index, k, - allow_overlap, threshold, + allow_neighboring_matches, exclusion_size, inverse_distance, - normalize=self.normalize, + normalise=self.normalise, ) return MP, IP @@ -114,7 +128,7 @@ def compute_distance_profile(self, X: np.ndarray): """ distance_profiles = _naive_squared_distance_profile( - self.X_, X, normalize=self.normalize + self.X_, X, normalise=self.normalise ) if not self.metadata_["unequal_length"]: @@ -153,7 +167,7 @@ def _compute_dist_profile(X_subs, q): def _naive_squared_distance_profile( X, Q, - normalize=False, + normalise=False, ): """ Compute a squared euclidean distance profile. @@ -164,8 +178,8 @@ def _naive_squared_distance_profile( Input time series dataset to search in. Q : array, shape=(n_channels, query_length) Query used during the search. - normalize : bool - Wheter to use a z-normalized distance. + normalise : bool + Wheter to use a z-normalised distance. Returns ------- @@ -178,14 +192,16 @@ def _naive_squared_distance_profile( # Init distance profile array with unequal length support for i in range(len(X)): dist_profiles.append(np.zeros(X[i].shape[1] - query_length + 1)) - if normalize: - Q = z_normalize_series_2d(Q) + if normalise: + Q = z_normalise_series_2d(Q) else: Q = Q.astype(np.float64) - for i in prange(len(X)): + for _i in prange(len(X)): + # cast uint64 due to parallel prange + i = np.int64(_i) X_subs = get_all_subsequences(X[i], query_length, 1) - if normalize: + if normalise: X_subs = z_normalise_series_3d(X_subs) dist_profile = _compute_dist_profile(X_subs, Q) @@ -198,32 +214,50 @@ def _naive_squared_matrix_profile( X, T, L, - k, T_index, + k, threshold, - inverse_distance, - allow_overlap, + allow_neighboring_matches, exclusion_size, - normalize=False, + inverse_distance, + normalise=False, ): """ Compute a squared euclidean matrix profile. 
Parameters ---------- - X : array, shape=(n_samples, n_channels, n_timepoints_x) - Input time series dataset to search in. - T : array, shape=(n_channels, n_timepoints_t) - Time series from which queries are extracted. - query_length : int - Length of the queries to extract from T. + X: np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) + The input samples. If X is an unquel length collection, expect a TypedList + of 2D arrays of shape (n_channels, n_timepoints) + T : np.ndarray, 2D array of shape (n_channels, series_length) + The series used for similarity search. Note that series_length can be equal, + superior or inferior to n_timepoints, it doesn't matter. + L : int + Length of the subsequences used for the distance computation. T_index : int, - If ``X`` is a subsequence of the database given in fit, specify its starting - index as (i_case, i_timestamp). If specified, this subsequence and the - neighboring ones (according to ``exclusion_factor``) won't be considered as - admissible candidates. - normalize : bool - Wheter to use a z-normalized distance. + If ``T`` is a series of ``X``, specify its index + in ``X``. If specified, each query of this series won't be able to + match with its neighboring subsequences. + k : int + The number of best matches to return during predict for each subsequence. + threshold : float + The number of best matches to return during predict for each subsequence. + allow_neighboring_matches : bool + Wheter the top-k candidates can be neighboring subsequences. + exclusion_size : int + The size of the exclusion zone used to prevent returning as top k candidates + the ones that are close to each other (for example i and i+1). + It is used to define a region between + :math:`id_timestomp - exclusion_size` and + :math:`id_timestomp + exclusion_size` which cannot be returned + as best match if :math:`id_timestomp` was already selected. By default, + the value None means that this is not used. 
+ inverse_distance : bool + If True, the matching will be made on the inverse of the distance, and thus, the + worst matches to the query will be returned instead of the best ones. + normalise : bool + Wheter to use a z-normalised distance. Returns ------- @@ -242,14 +276,14 @@ def _naive_squared_matrix_profile( X_subs = List() for i in range(len(X)): i_subs = get_all_subsequences(X[i], L, 1) - if normalize: - i_subs = z_normalise_series_3d(X_subs) + if normalise: + i_subs = z_normalise_series_3d(i_subs) X_subs.append(i_subs) for i_q in range(n_queries): Q = T[:, i : i + L] - if normalize: - Q = z_normalize_series_2d(Q) + if normalise: + Q = z_normalise_series_2d(Q) for i_x in prange(len(X)): dist_profiles[i_x][0 : X[i_x].shape[1] - L + 1] = _compute_dist_profile( X_subs[i_x], Q @@ -264,11 +298,11 @@ def _naive_squared_matrix_profile( if inverse_distance: dist_profiles = _inverse_distance_profile_list(dist_profiles) - top_dists, top_indexes = _extract_top_k_from_dist_profile( + top_indexes, top_dists = _extract_top_k_from_dist_profile( dist_profiles, k, threshold, - allow_overlap, + allow_neighboring_matches, exclusion_size, ) diff --git a/aeon/similarity_search/subsequence_search/_commons.py b/aeon/similarity_search/subsequence_search/_commons.py index e2e0aa54df..03e0ee1ac3 100644 --- a/aeon/similarity_search/subsequence_search/_commons.py +++ b/aeon/similarity_search/subsequence_search/_commons.py @@ -73,29 +73,61 @@ def _extract_top_k_from_dist_profile( dist_profiles, k, threshold, - allow_overlap, + allow_neighboring_matches, exclusion_size, ): + """ + Given an array (or list) of distance profiles, extract the top k lower distances. 
+ + Parameters + ---------- + dist_profiles : np.ndarray, shape = (n_samples, n_timepoints - length + 1) + A collection of distance profiles computed from ``n_samples`` time series of + size ``n_timepoints``, giving distance profiles of length + ``n_timepoints - length + 1``, with ``length`` the size of the query used to + compute the distance profiles. + k : int + Number of best matches to return + threshold : float + A threshold on the distances of the best matches. To be returned, a candidate + must have a distance bellow this threshold. This can reduce the number of + returned matches to be bellow ``k`` + allow_neighboring_matches : bool + Wheter to allow returning matches that are in the same neighborhood. + exclusion_size : int + The size of the exlusion size to apply when ``allow_neighboring_matches`` is + False. It is applied on both side of existing matches (+/- their indexes). + + Returns + ------- + top_k_indexes : np.ndarray, shape = (k, 2) + The indexes of the best matches in ``distance_profiles``. + top_k_distances : np.ndarray, shape = (k) + The distances of the best matches. 
+ + """ top_k_indexes = np.zeros((2 * k, 2), dtype=np.int64) - 1 top_k_distances = np.full(2 * k, np.inf) for i_profile in range(len(dist_profiles)): # Extract top-k without neighboring matches - if not allow_overlap: + if not allow_neighboring_matches: _sorted_indexes = np.argsort(dist_profiles[i_profile]) _top_k_indexes = np.zeros(k, dtype=np.int64) - 1 - _current_k = 1 - _top_k_indexes[0] = _sorted_indexes[0] - _current_j = 1 + _current_k = 0 + _current_j = 0 # Until we extract k value or explore all the array while _current_k < k and _current_j < len(_sorted_indexes): _insert = True # Check for validity with each previously inserted for i_k in range(_current_k): - ub = min(_top_k_indexes[i_k] + exclusion_size, len(dist_profiles)) + ub = min( + _top_k_indexes[i_k] + exclusion_size, + len(dist_profiles[i_profile]), + ) lb = max(_top_k_indexes[i_k] - exclusion_size, 0) if ( - _sorted_indexes[_current_j] < lb - or _sorted_indexes[_current_j] > ub + _sorted_indexes[_current_j] >= lb + and _sorted_indexes[_current_j] <= ub ): _insert = False break diff --git a/aeon/similarity_search/subsequence_search/_stomp.py b/aeon/similarity_search/subsequence_search/_stomp.py index 8986a41926..0dd75971f8 100644 --- a/aeon/similarity_search/subsequence_search/_stomp.py +++ b/aeon/similarity_search/subsequence_search/_stomp.py @@ -1,10 +1,10 @@ """Implementation of STOMP with squared euclidean distance.""" -from typing import Optional - __maintainer__ = ["baraline"] +from typing import Optional + import numpy as np from numba import njit, prange from numba.typed import List @@ -16,15 +16,14 @@ get_ith_products, ) from aeon.similarity_search.subsequence_search.base import BaseMatrixProfile -from aeon.utils.numba.general import AEON_NUMBA_STD_THRESHOLD - -# TODO : check and order parameters of functions in base and here -# TODO : check function params and make docstrings to be consistent with brute force -# TODO : validate tests +from aeon.utils.numba.general import ( + 
AEON_NUMBA_STD_THRESHOLD, + sliding_mean_std_one_series, +) class StompMatrixProfile(BaseMatrixProfile): - """.""" + """Estimator to compute matrix profile and distance profile using STOMP.""" def compute_matrix_profile( self, @@ -32,41 +31,53 @@ def compute_matrix_profile( threshold, exclusion_size, inverse_distance, - allow_overlap, + allow_neighboring_matches, X: Optional[np.ndarray] = None, X_index: Optional[int] = None, ): """ - . + Compute matrix profiles. + + The matrix profiles are computed on the collection given in fit. If ``X`` is + not given, computes the matrix profile of each series in the collection. If it + is given, only computes it for ``X``. Parameters ---------- - k : TYPE - DESCRIPTION. - threshold : TYPE - DESCRIPTION. - exclusion_size : TYPE - DESCRIPTION. - inverse_distance : TYPE - DESCRIPTION. + k : int + The number of best matches to return during predict for each subsequence. + threshold : float + The number of best matches to return during predict for each subsequence. + inverse_distance : bool + If True, the matching will be made on the inverse of the distance, and thus, + the worst matches to the query will be returned instead of the best ones. + exclusion_size : int + The size of the exclusion zone used to prevent returning as top k candidates + the ones that are close to each other (for example i and i+1). + It is used to define a region between + :math:`id_timestomp - exclusion_size` and + :math:`id_timestomp + exclusion_size` which cannot be returned + as best match if :math:`id_timestomp` was already selected. By default, + the value None means that this is not used. X : Optional[np.ndarray], optional - DESCRIPTION. The default is None. + The time series on which the matrix profile will be compute. + The default is None, meaning that the series in the collection given in fit + will be used instead. X_index : Optional[int], optional If ``X`` is a series of the database given in fit, specify its index in ``X_``. 
If specified, each query of this series won't be able to match with its neighboring subsequences. - : TYPE - DESCRIPTION. Returns ------- - MP : TYPE - DESCRIPTION. - IP : TYPE - DESCRIPTION. - + MP : array of shape (series_length - L + 1,) + Matrix profile distances for each query subsequence. If X is none, this + will be a list of MP for each series in X_. + IP : array of shape (series_length - L + 1,) + Indexes of the top matches for each query subsequence. If X is none, this + will be a list of MP for each series in X_. """ - # pairwise if none + # If we compute matrix profiles for each series in X_ if X is None: MP = [] IP = [] @@ -81,6 +92,7 @@ def compute_matrix_profile( ) MP.append(_MP) IP.append(_IP) + # else we compute matrix profiles using X on the series in X_ else: XdotT = [ get_ith_products(self.X[i], X, self.length, 0) @@ -90,12 +102,13 @@ def compute_matrix_profile( XdotT = np.asarray(XdotT) elif isinstance(X, List): XdotT = List(XdotT) + if X_index is None: - X_means, X_stds = 0 + X_means, X_stds = sliding_mean_std_one_series(X, self.length, 1) else: X_means, X_stds = self.X_means_[i], self.X_stds_[i] - if self.normalize: - MP, IP = _stomp_normalized( + if self.normalise: + MP, IP = _stomp_normalised( self.X_, X, XdotT, @@ -107,11 +120,10 @@ def compute_matrix_profile( X_index, k, threshold, - allow_overlap, + allow_neighboring_matches, exclusion_size, inverse_distance, ) - else: MP, IP = _stomp( self.X_, @@ -120,12 +132,11 @@ def compute_matrix_profile( self.length, X_index, k, - allow_overlap, threshold, + allow_neighboring_matches, exclusion_size, inverse_distance, ) - return MP, IP def compute_distance_profile(self, X: np.ndarray): @@ -151,8 +162,8 @@ def compute_distance_profile(self, X: np.ndarray): else: QX = np.asarray(QX) - if self.normalize: - distance_profiles = _normalized_squared_distance_profile( + if self.normalise: + distance_profiles = _normalised_squared_distance_profile( QX, self.X_means_, self.X_stds_, @@ -172,8 +183,8 @@ def 
compute_distance_profile(self, X: np.ndarray): return distance_profiles -@njit(cache=True, parallel=True, fastmath=True) -def _stomp_normalized( +@njit(cache=True, fastmath=True) +def _stomp_normalised( X, T, XdotT, @@ -185,12 +196,12 @@ def _stomp_normalized( T_index, k, threshold, - allow_overlap, + allow_neighboring_matches, exclusion_size, inverse_distance, ): """ - Compute the Matrix Profile using the STOMP algorithm with normalized distances. + Compute the Matrix Profile using the STOMP algorithm with normalised distances. X: np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) The input samples. If X is an unquel length collection, expect a TypedList @@ -198,8 +209,6 @@ def _stomp_normalized( T : np.ndarray, 2D array of shape (n_channels, series_length) The series used for similarity search. Note that series_length can be equal, superior or inferior to n_timepoints, it doesn't matter. - L : int - Length of the subsequences used for the distance computation. XdotT : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1) Precomputed dot products between each time series in X and the query series T. X_means : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1) @@ -212,18 +221,19 @@ def _stomp_normalized( Means of each subsequences of T of size L. T_stds : np.ndarray, 2D array of shape (n_channels, n_timepoints - L + 1) Stds of each subsequences of T of size L. + L : int + Length of the subsequences used for the distance computation. T_index : int, If ``T`` is a series of the database given in fit, specify its index in ``X_``. If specified, each query of this series won't be able to match with its neighboring subsequences. - k : int, default=1 + k : int, The number of best matches to return during predict for each subsequence. - threshold : float, default=np.inf + threshold : float The number of best matches to return during predict for each subsequence. 
- inverse_distance : bool, default=False - If True, the matching will be made on the inverse of the distance, and thus, the - worst matches to the query will be returned instead of the best ones. - exclusion_size : int, optional + allow_neighboring_matches : bool + Wheter the top-k candidates can be neighboring subsequences. + exclusion_size : int The size of the exclusion zone used to prevent returning as top k candidates the ones that are close to each other (for example i and i+1). It is used to define a region between @@ -231,6 +241,9 @@ def _stomp_normalized( :math:`id_timestomp + exclusion_size` which cannot be returned as best match if :math:`id_timestomp` was already selected. By default, the value None means that this is not used. + inverse_distance : bool + If True, the matching will be made on the inverse of the distance, and thus, the + worst matches to the query will be returned instead of the best ones. Returns ------- @@ -244,25 +257,12 @@ def _stomp_normalized( MP = List() IP = List() - # Init List to allow parallel, we'll re-use it for all dist profiles - dist_profiles = List() - for i_x in range(len(X)): - dist_profiles.append(np.zeros(X[i_x].shape[1] - L + 1)) - for i_q in range(n_queries): - for i_x in prange(len(X)): - dist_profiles[i_x][0 : X[i_x].shape[1] - L + 1] = ( - _normalized_squared_dist_profile_one_series( - XdotT[i_x], - X_means[i_x], - X_stds[i_x], - T_means[:, i_q], - T_stds[:, i_q], - L, - T_stds[:, i_q] <= AEON_NUMBA_STD_THRESHOLD, - ) - ) - if i_q + 1 < n_queries: + dist_profiles = _normalised_squared_distance_profile( + XdotT, X_means, X_stds, T_means[:, i_q], T_stds[:, i_q], L + ) + if i_q + 1 < n_queries: + for i_x in range(len(X)): XdotT[i_x] = _update_dot_products_one_series( X[i_x], T, XdotT[i_x], L, i_q + 1 ) @@ -281,7 +281,7 @@ def _stomp_normalized( dist_profiles, k, threshold, - allow_overlap, + allow_neighboring_matches, exclusion_size, ) @@ -291,7 +291,7 @@ def _stomp_normalized( return MP, IP -@njit(cache=True, 
parallel=True, fastmath=True) +@njit(cache=True, fastmath=True) def _stomp( X, T, @@ -299,28 +299,65 @@ def _stomp( L, T_index, k, - allow_overlap, threshold, + allow_neighboring_matches, exclusion_size, inverse_distance, ): + """ + Compute the Matrix Profile using the STOMP algorithm with non-normalised distances. + + X: np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) + The input samples. If X is an unquel length collection, expect a TypedList + of 2D arrays of shape (n_channels, n_timepoints) + T : np.ndarray, 2D array of shape (n_channels, series_length) + The series used for similarity search. Note that series_length can be equal, + superior or inferior to n_timepoints, it doesn't matter. + XdotT : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1) + Precomputed dot products between each time series in X and the query series T. + L : int + Length of the subsequences used for the distance computation. + T_index : int, + If ``T`` is a series of the database given in fit, specify its index + in ``X_``. If specified, each query of this series won't be able to + match with its neighboring subsequences. + k : int, + The number of best matches to return during predict for each subsequence. + threshold : float + The number of best matches to return during predict for each subsequence. + allow_neighboring_matches : bool + Wheter the top-k candidates can be neighboring subsequences. + exclusion_size : int + The size of the exclusion zone used to prevent returning as top k candidates + the ones that are close to each other (for example i and i+1). + It is used to define a region between + :math:`id_timestomp - exclusion_size` and + :math:`id_timestomp + exclusion_size` which cannot be returned + as best match if :math:`id_timestomp` was already selected. By default, + the value None means that this is not used. 
+ inverse_distance : bool + If True, the matching will be made on the inverse of the distance, and thus, the + worst matches to the query will be returned instead of the best ones. + + Returns + ------- + tuple of np.ndarray + - MP : array of shape (series_length - L + 1,) + Matrix profile distances for each query subsequence. + - IP : array of shape (series_length - L + 1,) + Indexes of the top matches for each query subsequence. + """ n_queries = T.shape[1] - L + 1 MP = List() IP = List() - # Init List to allow parallel, we'll re-use it for all dist profiles - dist_profiles = List() - for i_x in range(len(X)): - dist_profiles.append(np.zeros(X[i_x].shape[1] - L + 1)) # For each query of size L in T for i_q in range(n_queries): Q = T[:, i_q : i_q + L] + dist_profiles = _squared_distance_profile(XdotT, X, Q) # For each series in X compute distance profile to the query - for i_x in prange(len(X)): - dist_profiles[i_x][0 : X[i_x].shape[1] - L + 1] = ( - _squared_dist_profile_one_series(XdotT[i_x], X[i_x], Q) - ) - if i_q + 1 < n_queries: + if i_q + 1 < n_queries: + for i_x in range(len(X)): XdotT[i_x] = _update_dot_products_one_series( X[i_x], T, XdotT[i_x], L, i_q + 1 ) @@ -339,7 +376,7 @@ def _stomp( dist_profiles, k, threshold, - allow_overlap, + allow_neighboring_matches, exclusion_size, ) @@ -486,11 +523,11 @@ def _squared_dist_profile_one_series(QT, T, Q): @njit(cache=True, fastmath=True, parallel=True) -def _normalized_squared_distance_profile( +def _normalised_squared_distance_profile( QX, X_means, X_stds, Q_means, Q_stds, query_length ): """ - Compute the normalized squared distance profiles between query subsequence and input time series. + Compute the normalised squared distance profiles between query subsequence and input time series. Parameters ---------- @@ -513,7 +550,7 @@ def _normalized_squared_distance_profile( ------- List of np.ndarray List of 2D arrays, each of shape (n_channels, n_timepoints - query_length + 1). 
- Each array contains the normalized squared distance profile between the query subsequence and the corresponding time series. + Each array contains the normalised squared distance profile between the query subsequence and the corresponding time series. Entries in the array are set to infinity where the mask is False. """ distance_profiles = List() @@ -526,7 +563,7 @@ def _normalized_squared_distance_profile( for _i_instance in prange(len(QX)): # iterator is uint64 with prange and parallel so cast to int to avoid warnings i_instance = np.int64(_i_instance) - distance_profiles[i_instance] = _normalized_squared_dist_profile_one_series( + distance_profiles[i_instance] = _normalised_squared_dist_profile_one_series( QX[i_instance], X_means[i_instance], X_stds[i_instance], @@ -539,11 +576,11 @@ def _normalized_squared_distance_profile( @njit(cache=True, fastmath=True) -def _normalized_squared_dist_profile_one_series( +def _normalised_squared_dist_profile_one_series( QT, T_means, T_stds, Q_means, Q_stds, query_length, Q_is_constant ): """ - Compute the z-normalized squared Euclidean distance profile for one time series. + Compute the z-normalised squared Euclidean distance profile for one time series. Parameters ---------- @@ -568,8 +605,8 @@ def _normalized_squared_dist_profile_one_series( ------- np.ndarray 2D array of shape (n_channels, n_timepoints - query_length + 1) containing the - z-normalized squared distance profile between the query subsequence and the time - series. Entries are computed based on the z-normalized values, with special + z-normalised squared distance profile between the query subsequence and the time + series. Entries are computed based on the z-normalised values, with special handling for constant values. 
""" n_channels, profile_length = QT.shape diff --git a/aeon/similarity_search/subsequence_search/base.py b/aeon/similarity_search/subsequence_search/base.py index 1d49004472..7df358e5ec 100644 --- a/aeon/similarity_search/subsequence_search/base.py +++ b/aeon/similarity_search/subsequence_search/base.py @@ -17,8 +17,9 @@ ) from aeon.utils.numba.general import sliding_mean_std_one_series - # We can define a BaseVariableLengthSubsequenceSearch later for VALMOD and the likes. + + class BaseSubsequenceSearch(BaseSimilaritySearch): """ Base class for similarity search on time series subsequences. @@ -27,8 +28,8 @@ class BaseSubsequenceSearch(BaseSimilaritySearch): ---------- length : int The length of the subsequence to be considered. - normalize : bool, optional - Whether the inputs should be z-normalized. The default is False. + normalise : bool, optional + Whether the inputs should be z-normalised. The default is False. n_jobs : int, optional Number of parallel jobs to use. The default is 1. """ @@ -37,19 +38,21 @@ class BaseSubsequenceSearch(BaseSimilaritySearch): def __init__( self, length: int, - normalize: Optional[bool] = False, + normalise: Optional[bool] = False, n_jobs: Optional[int] = 1, ): self.length = length - super().__init__(n_jobs=n_jobs, normalize=normalize) + super().__init__(n_jobs=n_jobs, normalise=normalise) @final def find_motifs( self, - k: int, - threshold: float, + k: Optional[int] = 1, + threshold: Optional[float] = np.inf, X: Optional[np.ndarray] = None, - allow_overlap: Optional[bool] = False, + X_index: Optional[int] = None, + inverse_distance: Optional[bool] = False, + allow_neighboring_matches: Optional[bool] = False, exclusion_factor: Optional[float] = 2.0, ): """ @@ -71,13 +74,21 @@ def find_motifs( A series in which we want to indentify motifs. If provided, the motifs extracted should appear in X and in the database given in fit. If not provided, the motifs will be extracted only from the database given in fit. 
- allow_overlap: bool, optional + X_index : Optional[int], optional + If ``X`` is a series of the database given in fit, specify its index in + ``X_``. If specified, each query of this series won't be able to match with + its neighboring subsequences. + inverse_distance : bool, optional + Wheter to inverse the computed distance, meaning that the method will return + the anomalies instead of motifs. + allow_neighboring_matches: bool, optional Wheter a candidate can be part of multiple motif sets (True), or if motif sets should be mutually exclusive (False). exclusion_factor : float, default=2. A factor of the query length used to define the exclusion zone when - ``allow_overlap`` is set to False. For a given timestamp, the exclusion zone - starts from :math:`id_timestamp - query_length//exclusion_factor` and end at + ``allow_neighboring_matches`` is set to False. For a given timestamp, + the exclusion zone starts from + :math:`id_timestamp - query_length//exclusion_factor` and end at :math:`id_timestamp + query_length//exclusion_factor`. Returns @@ -88,6 +99,19 @@ def find_motifs( """ self._check_is_fitted() + prev_threads = get_num_threads() + X_index = self._check_X_index_int(X_index) + motifs = self._find_motifs( + k=k, + threshold=threshold, + exclusion_factor=exclusion_factor, + inverse_distance=inverse_distance, + allow_neighboring_matches=allow_neighboring_matches, + X=X, + X_index=X_index, + ) + set_num_threads(prev_threads) + return motifs @final def find_neighbors( @@ -97,7 +121,7 @@ def find_neighbors( threshold: Optional[float] = np.inf, inverse_distance: Optional[bool] = False, X_index: Optional[np.ndarray] = None, - allow_overlap: Optional[bool] = False, + allow_neighboring_matches: Optional[bool] = False, exclusion_factor: Optional[float] = 2.0, ): """ @@ -127,12 +151,13 @@ def find_neighbors( index as (i_case, i_timestamp). 
If specified, this subsequence and the neighboring ones (according to ``exclusion_factor``) won't be considered as admissible candidates. - allow_overlap: bool, optional + allow_neighboring_matches: bool, optional Wheter the top-k candidates can be neighboring subsequences. exclusion_factor : float, default=2. A factor of the query length used to define the exclusion zone when - ``allow_overlap`` is set to False. For a given timestamp, the exclusion zone - starts from :math:`id_timestamp - query_length//exclusion_factor` and end at + ``allow_neighboring_matches`` is set to False. For a given timestamp, + the exclusion zone starts from + :math:`id_timestamp - query_length//exclusion_factor` and end at :math:`id_timestamp + query_length//exclusion_factor`. Returns @@ -151,7 +176,7 @@ def find_neighbors( f"Expected a subsequence of shape {(self.n_channels_, self.length)} but" f" got {X.shape}" ) - self._check_X_index(X_index) + X_index = self._check_X_index_array(X_index) prev_threads = get_num_threads() set_num_threads(self._n_jobs) neighbors, distances = self._find_neighbors( @@ -160,7 +185,7 @@ def find_neighbors( threshold=threshold, inverse_distance=inverse_distance, X_index=X_index, - allow_overlap=allow_overlap, + allow_neighboring_matches=allow_neighboring_matches, exclusion_factor=exclusion_factor, ) set_num_threads(prev_threads) @@ -172,10 +197,41 @@ def find_neighbors( ) return neighbors, distances - def _check_X_index(self, X_index: np.ndarray): + def _check_X_index_int(self, X_index: int): """ Check wheter the X_index parameter is correctly formated and is admissible. + This check is made for motif search functions. + + Parameters + ---------- + X_index : int + Index of a series in X_. 
+ + Returns + ------- + X_index : int + Index of a series in X_ + + """ + if X_index is not None: + if not isinstance(X_index, int): + raise TypeError("Expected an integer for X_index but got {X_index}") + + if X_index >= self.n_cases_ or X_index < 0: + raise ValueError( + "The value of X_index cannot exced the number " + "of series in the collection given during fit. Expected a value " + f"between [0, {self.n_cases_ - 1}] but got {X_index}" + ) + return X_index + + def _check_X_index_array(self, X_index: np.ndarray): + """ + Check wheter the X_index parameter is correctly formated and is admissible. + + This check is made for neighbour search functions. + Parameters ---------- X_index : np.ndarray, 1D array of shape (2) @@ -198,12 +254,12 @@ def _check_X_index(self, X_index: np.ndarray): ): X_index = np.asarray(X_index, dtype=int) elif len(X_index) != 2: - raise ValueError( + raise TypeError( "Expected a numpy array or list of integers with 2 elements " f"for X_index but got {X_index}" ) elif ( - not (isinstance(X_index[0], int) and isinstance(X_index[1], int)) + not (isinstance(X_index[0], int) or not isinstance(X_index[1], int)) or X_index.dtype != int ): raise TypeError( @@ -211,7 +267,7 @@ def _check_X_index(self, X_index: np.ndarray): f"{X_index}" ) - if X_index[0] >= self.n_cases_: + if X_index[0] >= self.n_cases_ or X_index[0] < 0: raise ValueError( "The sample ID (first element) of X_index cannot exced the number " "of series in the collection given during fit. Expected a value " @@ -260,6 +316,30 @@ def _compute_mean_std_from_collection(self, X: np.ndarray): @abstractmethod def _fit(self, X, y=None): ... + @abstractmethod + def _find_motifs( + self, + X: np.ndarray, + k: Optional[int] = 1, + threshold: Optional[float] = np.inf, + inverse_distance: Optional[bool] = False, + X_index=None, + allow_neighboring_matches: Optional[bool] = False, + exclusion_factor: Optional[float] = 2.0, + ): ... 
+ + @abstractmethod + def _find_neighbors( + self, + X: np.ndarray, + k: Optional[int] = 1, + threshold: Optional[float] = np.inf, + inverse_distance: Optional[bool] = False, + X_index=None, + allow_neighboring_matches: Optional[bool] = False, + exclusion_factor: Optional[float] = 2.0, + ): ... + class BaseMatrixProfile(BaseSubsequenceSearch): """Base class for Matrix Profile methods using a length parameter.""" @@ -273,13 +353,33 @@ def _fit(self, X, y=None): ) ) - if self.normalize: + if self.normalise: self.X_means_, self.X_stds_ = self._compute_mean_std_from_collection(X) self.X_ = X return self - def _find_motifs(): - raise NotImplementedError() + def _find_motifs( + self, + X: np.ndarray, + k: Optional[int] = 1, + threshold: Optional[float] = np.inf, + inverse_distance: Optional[bool] = False, + X_index=None, + allow_neighboring_matches: Optional[bool] = False, + exclusion_factor: Optional[float] = 2.0, + ): + exclusion_size = self.length // exclusion_factor + + MP, IP = self.compute_matrix_profile( + k, + threshold, + exclusion_size, + inverse_distance, + allow_neighboring_matches, + X=X, + X_index=X_index, + ) + # TODO : implement logic here def _find_neighbors( self, @@ -288,7 +388,7 @@ def _find_neighbors( threshold: Optional[float] = np.inf, inverse_distance: Optional[bool] = False, X_index=None, - allow_overlap: Optional[bool] = False, + allow_neighboring_matches: Optional[bool] = False, exclusion_factor: Optional[float] = 2.0, ): """ @@ -318,12 +418,13 @@ def _find_neighbors( index as (i_case, i_timestamp). If specified, this subsequence and the neighboring ones (according to ``exclusion_factor``) won't be considered as admissible candidates. - allow_overlap: bool, optional + allow_neighboring_matches: bool, optional Wheter the top-k candidates can be neighboring subsequences. exclusion_factor : float, default=2. A factor of the query length used to define the exclusion zone when - ``allow_overlap`` is set to False. 
For a given timestamp, the exclusion zone - starts from :math:`id_timestamp - query_length//exclusion_factor` and end at + ``allow_neighboring_matches`` is set to False. For a given timestamp, the + exclusion zone starts from + :math:`id_timestamp - query_length//exclusion_factor` and end at :math:`id_timestamp + query_length//exclusion_factor`. """ exclusion_size = self.length // exclusion_factor @@ -343,16 +444,25 @@ def _find_neighbors( dist_profiles, k, threshold, - allow_overlap, + allow_neighboring_matches, exclusion_size, ) @abstractmethod - def compute_matrix_profile(X: Optional[np.ndarray] = None): + def compute_matrix_profile( + self, + k, + threshold, + exclusion_size, + inverse_distance, + allow_neighboring_matches, + X: Optional[np.ndarray] = None, + X_index: Optional[int] = None, + ): """Compute matrix profiles between X_ and X or between all series in X_.""" ... @abstractmethod - def compute_distance_profile(X: np.ndarray): + def compute_distance_profile(self, X: np.ndarray): """Compute distrance profiles between X_ and X (a series of size length).""" ... diff --git a/aeon/similarity_search/subsequence_search/tests/test_brute_force.py b/aeon/similarity_search/subsequence_search/tests/test_brute_force.py new file mode 100644 index 0000000000..3c92138b3d --- /dev/null +++ b/aeon/similarity_search/subsequence_search/tests/test_brute_force.py @@ -0,0 +1,155 @@ +""" +Tests for stomp algorithm. + +We do not test equality for returned indexes due to the unstable nature of argsort +and the fact that the "kind=stable" parameter is not yet supported in numba. We instead +test that the returned index match the expected distance value. 
+""" + +__maintainer__ = ["baraline"] + +import numpy as np +import pytest +from numba.typed import List +from numpy.testing import assert_almost_equal, assert_array_almost_equal + +from aeon.similarity_search.subsequence_search._brute_force import ( + _compute_dist_profile, + _naive_squared_distance_profile, + _naive_squared_matrix_profile, +) +from aeon.similarity_search.subsequence_search._commons import ( + _extract_top_k_from_dist_profile, + _inverse_distance_profile_list, +) +from aeon.testing.data_generation import ( + make_example_2d_numpy_series, + make_example_3d_numpy, + make_example_3d_numpy_list, +) +from aeon.utils.numba.general import sliding_mean_std_one_series + +K_VALUES = [1, 3, 5] +NN_MATCHES = [True, False] +INVERSE = [True, False] +NORMALISE = [True, False] + + +def _get_mean_sdts_inputs(X, Q, L): + X_means = [] + X_stds = [] + + for i_x in range(len(X)): + _mean, _std = sliding_mean_std_one_series(X[i_x], L, 1) + X_stds.append(_std) + X_means.append(_mean) + + Q_means = Q.mean(axis=1) + Q_stds = Q.std(axis=1) + + return X_means, X_stds, Q_means, Q_stds + + +def test__compute_dist_profile(): + """Test Euclidean distance.""" + L = 3 + X = make_example_2d_numpy_series(n_channels=1, n_timepoints=10) + Q = make_example_2d_numpy_series(n_channels=1, n_timepoints=L) + dist_profile = _compute_dist_profile(X, Q) + for i_t in range(X.shape[1] - L + 1): + assert_almost_equal(dist_profile[i_t], np.sum((X[:, i_t : i_t + L] - Q) ** 2)) + + +def test__naive_squared_distance_profile(normalise): + """Test Euclidean distance profile calculation.""" + L = 3 + X = make_example_3d_numpy(n_cases=3, n_channels=1, n_timepoints=10, return_y=False) + Q = make_example_2d_numpy_series(n_channels=1, n_timepoints=L) + dist_profiles = _naive_squared_distance_profile(X, Q) + for i_x in range(len(X)): + for i_t in range(X[i_x].shape[1] - L + 1): + assert_almost_equal( + dist_profiles[i_x][i_t], np.sum((X[i_x, :, i_t : i_t + L] - Q) ** 2) + ) + + # test unequal length and 
multivariate + X = List( + make_example_3d_numpy_list( + n_cases=3, + n_channels=2, + min_n_timepoints=10, + max_n_timepoints=20, + return_y=False, + ) + ) + + Q = make_example_2d_numpy_series(n_channels=2, n_timepoints=L) + dist_profiles = _naive_squared_distance_profile(X, Q) + for i_x in range(len(X)): + for i_t in range(X[i_x].shape[1] - L + 1): + assert_almost_equal( + dist_profiles[i_x][i_t], np.sum((X[i_x][:, i_t : i_t + L] - Q) ** 2) + ) + + +@pytest.mark.parametrize( + [ + ("k", K_VALUES), + ("allow_neighboring_matches", NN_MATCHES), + ("inverse_distance", INVERSE), + ("normalise", NORMALISE), + ] +) +def test__naive_squared_matrix_profile( + k, allow_neighboring_matches, inverse_distance, normalise +): + """Test STOMP method.""" + L = 3 + + X = make_example_3d_numpy_list( + n_cases=3, + n_channels=2, + min_n_timepoints=6, + max_n_timepoints=8, + return_y=False, + ) + T = make_example_2d_numpy_series(n_channels=2, n_timepoints=5) + + T_index = None + threshold = np.inf + exclusion_size = L + # MP : distances to best matches for each query + # IP : Indexes of best matches for each query + MP, IP = _naive_squared_matrix_profile( + X, + T, + L, + T_index, + k, + threshold, + allow_neighboring_matches, + exclusion_size, + inverse_distance, + normalise=normalise, + ) + # For each query of size L in T + for i in range(T.shape[1] - L + 1): + dist_profiles = _naive_squared_distance_profile( + X, T[:, i : i + L], normalise=normalise + ) + # Check that the top matches extracted have the same value that the + # top matches in the distance profile + if inverse_distance: + dist_profiles = _inverse_distance_profile_list(dist_profiles) + + top_k_indexes, top_k_distances = _extract_top_k_from_dist_profile( + dist_profiles, k, threshold, allow_neighboring_matches, exclusion_size + ) + # Check that the top matches extracted have the same value that the + # top matches in the distance profile + assert_array_almost_equal(MP[i], top_k_distances) + + # Check that the index in 
IP correspond to a distance profile point + # with value equal to the corresponding MP point. + for j, index in enumerate(top_k_indexes): + assert_almost_equal(MP[i][j], dist_profiles[index[0]][index[1]]) diff --git a/aeon/similarity_search/subsequence_search/tests/test__commons.py b/aeon/similarity_search/subsequence_search/tests/test_commons.py similarity index 59% rename from aeon/similarity_search/subsequence_search/tests/test__commons.py rename to aeon/similarity_search/subsequence_search/tests/test_commons.py index b0e2764b0b..70e443a78c 100644 --- a/aeon/similarity_search/subsequence_search/tests/test__commons.py +++ b/aeon/similarity_search/subsequence_search/tests/test_commons.py @@ -1,12 +1,13 @@ """Test _commons.py functions.""" __maintainer__ = ["baraline"] - import numpy as np +import pytest from numba.typed import List -from numpy.testing import assert_array_almost_equal, assert_array_equal +from numpy.testing import assert_, assert_array_almost_equal from aeon.similarity_search.subsequence_search._commons import ( + _extract_top_k_from_dist_profile, _inverse_distance_profile_list, fft_sliding_dot_product, get_ith_products, @@ -59,59 +60,38 @@ def test__inverse_distance_profile_list(): assert_array_almost_equal(1 / (X[1] + 1e-8), T[1]) -def test__extract_top_k_from_dist_profile(): - """Test method to esxtract the top k candidates from a list of distance profiles.""" - X = List([ - [5,4,3,3,1,3,2,5,1,4,1,0,1,2,2,7,8,1,5], - [5,1,0,1,0,0,5,4,3,5,6,1,4,2], - ]) - - top_k_indexes, top_k_distances = _extract_top_k_from_dist_profile( - X, - 1, - np.inf, - False, - 3 - ) - assert_array_equal(top_k_indexes, [[0,11]]) - assert_array_equal(top_k_indexes, [0]) - - top_k_indexes, top_k_distances = _extract_top_k_from_dist_profile( - X, - 5, - np.inf, - False, - 3 - ) - assert_array_equal(top_k_indexes, [[0,11],[1,2],[0,4],[0,17],[1,11]]) - assert_array_equal(top_k_indexes, [0,0,1,1,1]) +K_VALUES = [1, 3, 5] +THRESHOLDS = [np.inf, 0.7] +NN_MATCHES = [False, True] 
- top_k_indexes, top_k_distances = _extract_top_k_from_dist_profile( - X, - 5, - np.inf, - True, - 3 - ) - assert_array_equal(top_k_indexes, [[0,11],[1,2],[1,4],[1,5],[0,4]]) - assert_array_equal(top_k_indexes, [0,0,0,0,1]) - top_k_indexes, top_k_distances = _extract_top_k_from_dist_profile( - X, - 5, - 0.5, - True, - 3 +@pytest.mark.parametrize( + [("k", K_VALUES), ("threshold", THRESHOLDS), ("allow_nn_matches", NN_MATCHES)] +) +def test__extract_top_k_from_dist_profile(k, threshold, allow_nn_matches): + """Test method to esxtract the top k candidates from a list of distance profiles.""" + X = make_example_2d_numpy_list( + n_cases=2, min_n_timepoints=5, max_n_timepoints=7, return_y=False ) - assert_array_equal(top_k_indexes, [[0,11],[1,2],[1,4],[1,5]]) - assert_array_equal(top_k_indexes, [0,0,0,0]) + X_sort = [X[i][np.argsort(X[i])] for i in range(len(X))] top_k_indexes, top_k_distances = _extract_top_k_from_dist_profile( - X, - 5, - 0.5, - False, - 3 + X, k, threshold, allow_nn_matches, 3 ) - assert_array_equal(top_k_indexes, [[0,11],[1,2]]) - assert_array_equal(top_k_indexes, [0,0]) + for i, index in enumerate(top_k_indexes): + assert_(X[index[0]][index[1]] == top_k_distances[i]) + assert_(np.all(top_k_distances <= threshold)) + if allow_nn_matches: + for i in range(len(X)): + assert_(np.all(top_k_distances <= X_sort[i][k - 1])) + if not allow_nn_matches: + for i_x in range(len(X)): + # test same index X respect exclusion + same_X = [ + top_k_indexes[i][1] + for i in range(len(top_k_indexes)) + if top_k_indexes[i][0] == i_x + ] + same_X = np.sort(same_X) + if len(same_X) > 1: + assert_(np.all(np.diff(same_X) >= 3)) diff --git a/aeon/similarity_search/subsequence_search/tests/test_stomp.py b/aeon/similarity_search/subsequence_search/tests/test_stomp.py index 169eee135e..7a655035a5 100644 --- a/aeon/similarity_search/subsequence_search/tests/test_stomp.py +++ b/aeon/similarity_search/subsequence_search/tests/test_stomp.py @@ -1,4 +1,10 @@ -"""Tests for stomp 
algorithm.""" +""" +Tests for stomp algorithm. + +We do not test equality for returned indexes due to the unstable nature of argsort +and the fact that the "kind=stable" parameter is not yet supported in numba. We instead +test that the returned index match the expected distance value. +""" __maintainer__ = ["baraline"] @@ -7,14 +13,18 @@ from numba.typed import List from numpy.testing import assert_almost_equal, assert_array_almost_equal -from aeon.similarity_search.subsequence_search._commons import get_ith_products +from aeon.similarity_search.subsequence_search._commons import ( + _extract_top_k_from_dist_profile, + _inverse_distance_profile_list, + get_ith_products, +) from aeon.similarity_search.subsequence_search._stomp import ( - _normalized_squared_dist_profile_one_series, - _normalized_squared_distance_profile, + _normalised_squared_dist_profile_one_series, + _normalised_squared_distance_profile, _squared_dist_profile_one_series, _squared_distance_profile, _stomp, - _stomp_normalized, + _stomp_normalised, _update_dot_products_one_series, ) from aeon.testing.data_generation import ( @@ -27,7 +37,9 @@ z_normalise_series_2d_with_mean_std, ) -K_VALUES = [1, 3] +K_VALUES = [1, 3, 5] +NN_MATCHES = [True, False] +INVERSE = [True, False] def _get_mean_sdts_inputs(X, Q, L): @@ -79,7 +91,7 @@ def test__squared_dist_profile_one_series(): assert_almost_equal(dist_profile[i_t], np.sum((X[:, i_t : i_t + L] - Q) ** 2)) -def test__normalized_squared_dist_profile_one_series(): +def test__normalised_squared_dist_profile_one_series(): """Test Euclidean distance.""" L = 3 X = make_example_2d_numpy_series(n_channels=1, n_timepoints=10) @@ -89,7 +101,7 @@ def test__normalized_squared_dist_profile_one_series(): Q_mean = Q.mean(axis=1) Q_std = Q.std(axis=1) - dist_profile = _normalized_squared_dist_profile_one_series( + dist_profile = _normalised_squared_dist_profile_one_series( QX, X_mean, X_std, Q_mean, Q_std, L, Q.std(axis=1) <= 0 ) Q = z_normalise_series_2d_with_mean_std(Q, 
Q_mean, Q_std) @@ -134,7 +146,7 @@ def test__squared_distance_profile(): ) -def test__normalized_squared_distance_profile(): +def test__normalised_squared_distance_profile(): """Test Euclidean distance profile calculation.""" L = 3 X = make_example_3d_numpy(n_cases=3, n_channels=1, n_timepoints=10, return_y=False) @@ -146,7 +158,7 @@ def test__normalized_squared_distance_profile(): X_means = np.asarray(X_means) X_stds = np.asarray(X_stds) - dist_profiles = _normalized_squared_distance_profile( + dist_profiles = _normalised_squared_distance_profile( QX, X_means, X_stds, Q_means, Q_stds, L ) @@ -179,7 +191,7 @@ def test__normalized_squared_distance_profile(): X_means = List(X_means) X_stds = List(X_stds) - dist_profiles = _normalized_squared_distance_profile( + dist_profiles = _normalised_squared_distance_profile( QX, X_means, X_stds, Q_means, Q_stds, L ) @@ -194,21 +206,32 @@ def test__normalized_squared_distance_profile(): ) -# K_VALUES = [1, 3] -@pytest.mark.parametrize("k", K_VALUES) -def test__stomp(k): +@pytest.mark.parametrize( + [ + ("k", K_VALUES), + ("allow_neighboring_matches", NN_MATCHES), + ("inverse_distance", INVERSE), + ] +) +def test__stomp(k, allow_neighboring_matches, inverse_distance): """Test STOMP method.""" L = 3 - X = np.array([[[1, 2, 3, 2, 1, 2, 3, 4, 5, 2, 1, 2, 2]]]) - T = np.array([[1, 1, 3, 2, 2]]) - XdotT = np.asarray([get_ith_products(X[i_x], T, L, 0) for i_x in range(len(X))]) + + X = make_example_3d_numpy_list( + n_cases=3, + n_channels=2, + min_n_timepoints=6, + max_n_timepoints=8, + return_y=False, + ) + T = make_example_2d_numpy_series(n_channels=2, n_timepoints=5) + XdotT = List([get_ith_products(X[i_x], T, L, 0) for i_x in range(len(X))]) T_index = None - allow_overlap = False threshold = np.inf exclusion_size = L - inverse_distance = False - + # MP : distances to best matches for each query + # IP : Indexes of best matches for each query MP, IP = _stomp( X, T, @@ -216,23 +239,102 @@ def test__stomp(k): L, T_index, k, - 
allow_overlap, threshold, + allow_neighboring_matches, exclusion_size, inverse_distance, ) - Expected_MP = [[1, 6], [1, 2], [1, 2, 5]] - Expected_IP = [[[0, 0], [0, 1]], [[0, 1], [0, 0]], [[0, 2], [0, 1], [0, 0]]] - for i in range(len(Expected_MP)): - assert_array_almost_equal(Expected_IP[i], IP[i]) - assert_array_almost_equal(Expected_MP[i], MP[i]) + # For each query of size L in T + for i in range(T.shape[1] - L + 1): + dist_profiles = _squared_distance_profile( + List([get_ith_products(X[i_x], T, L, i) for i_x in range(len(X))]), + X, + T[:, i : i + L], + ) + # Check that the top matches extracted have the same value that the + # top matches in the distance profile + if inverse_distance: + dist_profiles = _inverse_distance_profile_list(dist_profiles) + top_k_indexes, top_k_distances = _extract_top_k_from_dist_profile( + dist_profiles, k, threshold, allow_neighboring_matches, exclusion_size + ) + # Check that the top matches extracted have the same value that the + # top matches in the distance profile + assert_array_almost_equal(MP[i], top_k_distances) + + # Check that the index in IP correspond to a distance profile point + # with value equal to the corresponding MP point. + for j, index in enumerate(top_k_indexes): + assert_almost_equal(MP[i][j], dist_profiles[index[0]][index[1]]) + + +@pytest.mark.parametrize( + [ + ("k", K_VALUES), + ("allow_neighboring_matches", NN_MATCHES), + ("inverse_distance", INVERSE), + ] +) +def test__stomp_normalised(k, allow_neighboring_matches, inverse_distance): + """Test STOMP normalised method.""" + L = 3 + X = make_example_3d_numpy_list( + n_cases=3, + n_channels=2, + min_n_timepoints=6, + max_n_timepoints=8, + return_y=False, + ) + T = make_example_2d_numpy_series(n_channels=2, n_timepoints=5) -@pytest.mark.parametrize("k", K_VALUES) -def test__stomp_normalized(k): - """Test STOMP normalized method.""" - _stomp_normalized - ... 
+ XdotT = List([get_ith_products(X[i_x], T, L, 0) for i_x in range(len(X))]) + T_index = None + threshold = np.inf + exclusion_size = L + X_means, X_stds, _, _ = _get_mean_sdts_inputs(X, T, L) + T_means, T_stds = sliding_mean_std_one_series(T, L, 1) + # MP : distances to best matches for each query + # IP : Indexes of best matches for each query + MP, IP = _stomp_normalised( + X, + T, + XdotT, + X_means, + X_stds, + T_means, + T_stds, + L, + T_index, + k, + threshold, + allow_neighboring_matches, + exclusion_size, + inverse_distance, + ) + # For each query of size L in T + for i in range(T.shape[1] - L + 1): + dist_profiles = _normalised_squared_distance_profile( + List([get_ith_products(X[i_x], T, L, i) for i_x in range(len(X))]), + X_means, + X_stds, + T_means[:, i], + T_stds[:, i], + L, + ) + + if inverse_distance: + dist_profiles = _inverse_distance_profile_list(dist_profiles) -# TODO : add tests for StompMatrixProfile + top_k_indexes, top_k_distances = _extract_top_k_from_dist_profile( + dist_profiles, k, threshold, allow_neighboring_matches, exclusion_size + ) + # Check that the top matches extracted have the same value that the + # top matches in the distance profile + assert_array_almost_equal(MP[i], top_k_distances) + + # Check that the index in IP correspond to a distance profile point + # with value equal to the corresponding MP point. 
+ for j, index in enumerate(top_k_indexes): + assert_almost_equal(MP[i][j], dist_profiles[index[0]][index[1]]) diff --git a/aeon/testing/mock_estimators/__init__.py b/aeon/testing/mock_estimators/__init__.py index 219fc3e987..e517e07ca0 100644 --- a/aeon/testing/mock_estimators/__init__.py +++ b/aeon/testing/mock_estimators/__init__.py @@ -29,8 +29,6 @@ "MockUnivariateSeriesTransformer", "MockMultivariateSeriesTransformer", "MockSeriesTransformerNoFit", - # similarity search - "MockSimilaritySearch", ] from aeon.testing.mock_estimators._mock_anomaly_detectors import ( @@ -64,4 +62,3 @@ MockSeriesTransformerNoFit, MockUnivariateSeriesTransformer, ) -from aeon.testing.mock_estimators._mock_similarity_search import MockSimilaritySearch diff --git a/aeon/testing/mock_estimators/_mock_similarity_search.py b/aeon/testing/mock_estimators/_mock_similarity_search.py deleted file mode 100644 index 55c9c435c7..0000000000 --- a/aeon/testing/mock_estimators/_mock_similarity_search.py +++ /dev/null @@ -1,21 +0,0 @@ -"""Mock similarity searchers useful for testing and debugging.""" - -__maintainer__ = ["baraline"] -__all__ = [ - "MockSimilaritySearch", -] - -from aeon.similarity_search.base import BaseSimilaritySearch - - -class MockSimilaritySearch(BaseSimilaritySearch): - """Mock similarity search for testing base class predict.""" - - def _fit(self, X, y=None): - """_fit dummy.""" - self.X_ = X - return self - - def predict(self, X): - """Predict dummy.""" - return [(0, 0)] diff --git a/aeon/testing/mock_estimators/_mock_similarity_searchers.py b/aeon/testing/mock_estimators/_mock_similarity_searchers.py new file mode 100644 index 0000000000..f7c0acfc98 --- /dev/null +++ b/aeon/testing/mock_estimators/_mock_similarity_searchers.py @@ -0,0 +1,72 @@ +"""Mock series transformers useful for testing and debugging.""" + +__maintainer__ = [] +__all__ = [ + "MockSubsequenceSearch", + "MockMatrixProfile", +] + +import numpy as np + +from aeon.similarity_search.subsequence_search.base 
import ( + BaseMatrixProfile, + BaseSubsequenceSearch, +) + + +class MockMatrixProfile(BaseMatrixProfile): + def __init__(self, length=3): + super().__init__(length=length) + + def compute_matrix_profile( + self, + k, + threshold, + exclusion_size, + inverse_distance, + allow_neighboring_matches, + X=None, + X_index=None, + ): + """Compute matrix profiles between X_ and X or between all series in X_.""" + return np.zeros((X.shape[1] - self.length + 1, k)), np.zeros( + (X.shape[1] - self.length + 1, k, 2) + ) + + def compute_distance_profile(self, X): + """Compute distrance profiles between X_ and X (a series of size length).""" + return np.zeros(X.shape[1] - self.length + 1) + + +class MockSubsequenceSearch(BaseSubsequenceSearch): + """MockSeriesTransformer to set tags.""" + + def __init__(self, length=3): + super().__init__(length=length) + + def _fit(self, X, y=None): + return self + + def _find_motifs( + self, + X, + k=1, + threshold=np.inf, + inverse_distance=False, + X_index=None, + allow_neighboring_matches=False, + exclusion_factor=2.0, + ): + return [[0, 0]], self.X_[0][0:1] # TODO: update after logic is implemented + + def _find_neighbors( + self, + X, + k=1, + threshold=np.inf, + inverse_distance=False, + X_index=None, + allow_neighboring_matches=False, + exclusion_factor=2.0, + ): + return [[0, 0]], self.X_[0][0:1] diff --git a/aeon/utils/base/_register.py b/aeon/utils/base/_register.py index 1d81c2512c..5bf4045ad5 100644 --- a/aeon/utils/base/_register.py +++ b/aeon/utils/base/_register.py @@ -25,6 +25,10 @@ from aeon.regression.base import BaseRegressor from aeon.segmentation.base import BaseSegmenter from aeon.similarity_search.base import BaseSimilaritySearch +from aeon.similarity_search.subsequence_search.base import ( + BaseMatrixProfile, + BaseSubsequenceSearch, +) from aeon.transformations.base import BaseTransformer from aeon.transformations.collection import BaseCollectionTransformer from aeon.transformations.series import BaseSeriesTransformer 
@@ -45,6 +49,8 @@ "regressor": BaseRegressor, "segmenter": BaseSegmenter, "similarity_searcher": BaseSimilaritySearch, + "subsequence_searcher": BaseSubsequenceSearch, + "matrixprofile_searcher": BaseMatrixProfile, "series-transformer": BaseSeriesTransformer, "forecaster": BaseForecaster, } @@ -53,5 +59,11 @@ VALID_ESTIMATOR_BASES = { k: BASE_CLASS_REGISTER[k] for k in BASE_CLASS_REGISTER.keys() - - {"estimator", "collection-estimator", "series-estimator", "transformer"} + - { + "estimator", + "collection-estimator", + "series-estimator", + "transformer", + "similarity_searcher", + } } diff --git a/docs/api_reference/utils.rst b/docs/api_reference/utils.rst index 40dea9f67c..6f43398a44 100644 --- a/docs/api_reference/utils.rst +++ b/docs/api_reference/utils.rst @@ -87,7 +87,6 @@ Mock Estimators MockUnivariateSeriesTransformer MockMultivariateSeriesTransformer MockSeriesTransformerNoFit - MockSimilaritySearch Utilities ^^^^^^^^^ From bb2aa33820b914b8a5dd390e4f087c61925c7a0c Mon Sep 17 00:00:00 2001 From: baraline Date: Wed, 1 Jan 2025 09:36:10 +0100 Subject: [PATCH 05/18] Add test for base subsequence --- aeon/similarity_search/__init__.py | 4 +- aeon/similarity_search/base.py | 59 ++++- .../similarity_search/series_search/_r_lsh.py | 239 ++++++++++++++++++ .../subsequence_search/_stomp.py | 145 +++++------ .../subsequence_search/base.py | 89 ++++--- .../subsequence_search/tests/test_base.py | 78 ++++++ aeon/similarity_search/tests/test_base.py | 1 + .../_mock_similarity_searchers.py | 15 +- 8 files changed, 487 insertions(+), 143 deletions(-) create mode 100644 aeon/similarity_search/series_search/_r_lsh.py create mode 100644 aeon/similarity_search/subsequence_search/tests/test_base.py create mode 100644 aeon/similarity_search/tests/test_base.py diff --git a/aeon/similarity_search/__init__.py b/aeon/similarity_search/__init__.py index 53f80b2cdf..cdf5cebd84 100644 --- a/aeon/similarity_search/__init__.py +++ b/aeon/similarity_search/__init__.py @@ -1,3 +1,5 @@ 
"""Similarity search module.""" -__all__ = [] +__all__ = ["BaseSimilaritySearch"] + +from aeon.similarity_search.base import BaseSimilaritySearch diff --git a/aeon/similarity_search/base.py b/aeon/similarity_search/base.py index 8a9e9547d7..d486491fa4 100644 --- a/aeon/similarity_search/base.py +++ b/aeon/similarity_search/base.py @@ -18,8 +18,8 @@ class BaseSimilaritySearch(BaseCollectionEstimator): Parameters ---------- - normalize : bool, optional - Whether the inputs should be z-normalized. The default is False. + normalise : bool, optional + Whether the inputs should be z-normalised. The default is False. n_jobs : int, optional Number of parallel jobs to use. The default is 1. """ @@ -35,11 +35,11 @@ class BaseSimilaritySearch(BaseCollectionEstimator): @abstractmethod def __init__( self, - normalize: Optional[bool] = False, + normalise: Optional[bool] = False, n_jobs: Optional[int] = 1, ): self.n_jobs = n_jobs - self.normalize = normalize + self.normalise = normalise super().__init__() @final @@ -68,7 +68,9 @@ def fit( ------- self """ + self.reset() prev_threads = get_num_threads() + self._check_fit_format(X) X = self._preprocess_collection(X) # Store minimum number of n_timepoints for unequal length collections self.min_timepoints_ = min([X[i].shape[-1] for i in range(len(X))]) @@ -85,9 +87,9 @@ def fit( @abstractmethod def find_motifs( self, + X: np.ndarray, k: int, threshold: float, - X: Optional[np.ndarray] = None, allow_overlap: Optional[bool] = True, ): """ @@ -100,10 +102,8 @@ def find_motifs( Parameters ---------- - X : np.ndarray, optional - The query in which we want to indentify motifs. If provided, the motifs - extracted should appear in X and in the database given in fit. If not - provided, the motifs will be extracted only from the database given in fit. + X : np.ndarray, + A series in which we want to indentify motifs. 
k : int, optional Number of motifs to return threshold : int, optional @@ -115,9 +115,12 @@ def find_motifs( Returns ------- - list of ndarray, shape=(k,) - A list of at most ``k`` numpy arrays containing the indexes of the - candidates in each motif. + ndarray, shape=(k,) + A numpy array of at most ``k`` elements containing the indexes of the + motifs. + ndarray, shape=(k,) + A numpy array of at most ``k`` elements containing the distances of the + motifs to . """ ... @@ -157,5 +160,37 @@ def find_neighbors( """ ... + def _check_fit_format(self, X): + if isinstance(X, np.ndarray): # "numpy3D" or numpy2D + if X.ndim != 3: + raise TypeError( + f"A np.ndarray given in fit must be 3D but found {X.ndim}D" + ) + elif isinstance(X, list): # np-list or df-list + if isinstance(X[0], np.ndarray): # if one a numpy they must all be 2D numpy + for a in X: + if not (isinstance(a, np.ndarray) and a.ndim == 2): + raise TypeError( + "A np-list given in fit must contain 2D np.ndarray but" + f" found {a.ndim}D" + ) + + def _check_find_neighbors_motif_format(self, X): + if isinstance(X, np.ndarray): + if X.ndim != 2: + raise TypeError( + "A np.ndarray given in find_neighbors must be 2D" + f"(n_channels, n_timepoints) but found {X.ndim}D." + ) + else: + raise TypeError( + "Expected a 2D np.ndarray in find_neighbors but found" f" {type(X)}." + ) + if self.n_channels_ != X.shape[0]: + raise ValueError( + f"Expected X to have {self.n_channels_} channels but" + f" got {X.shape[0]} channels." + ) + @abstractmethod def _fit(self, X, y=None): ... 
diff --git a/aeon/similarity_search/series_search/_r_lsh.py b/aeon/similarity_search/series_search/_r_lsh.py new file mode 100644 index 0000000000..9921aeeba1 --- /dev/null +++ b/aeon/similarity_search/series_search/_r_lsh.py @@ -0,0 +1,239 @@ +"""Random projection LSH index.""" + +import numpy as np +from numba import njit, prange + +TPB = 16 + + +@njit(cache=True) +def _hamming_dist(X, Y): + d = 0 + for i in prange(X.shape[0]): + d += X[i] ^ Y[i] + return d + + +@njit(cache=True, parallel=True) +def _hamming_dist_matrix(bool_hashes_value_list, bool_hashes): + n_hashes = bool_hashes.shape[0] + res = np.zeros((n_hashes, bool_hashes_value_list.shape[0]), dtype=np.int64) + for i in prange(n_hashes): + for j in prange(bool_hashes_value_list.shape[0]): + res[i, j] = _hamming_dist(bool_hashes_value_list[j], bool_hashes[i]) + return res + + +@njit(cache=True, fastmath=True, parallel=True) +def _series_to_bool(X, hash_funcs, start_points, length): + n_hash_funcs = hash_funcs.shape[0] + res = np.empty(n_hash_funcs, dtype=np.bool_) + for j in prange(n_hash_funcs): + res[j] = ( + np.dot(X[start_points[j] : start_points[j] + length], hash_funcs[j]) >= 0 + ) + return res + + +@njit(cache=True, fastmath=True, parallel=True) +def _collection_to_bool(X, hash_funcs, start_points, length): + n_hash_funcs = hash_funcs.shape[0] + n_samples = X.shape[0] + res = np.empty((n_samples, n_hash_funcs), dtype=np.bool_) + for i in prange(n_samples): + for j in prange(n_hash_funcs): + res[i, j] = ( + np.dot(X[i, start_points[j] : start_points[j] + length], hash_funcs[j]) + >= 0 + ) + return res + + +class LSH: + """ + . + + Parameters + ---------- + n_vectors : TYPE + DESCRIPTION. + custom_table : TYPE, optional + DESCRIPTION. The default is None. + + Returns + ------- + None. + + """ + + def __init__(self, n_hash_funcs=128, window_length=1.0, seed=None): + self.n_hash_funcs = n_hash_funcs + self.window_length = window_length + self.seed = seed + + def fit(self, X): + """ + . 
+ + Parameters + ---------- + X : TYPE + DESCRIPTION. + + Returns + ------- + TYPE + DESCRIPTION. + + """ + self.rng_ = np.random.default_rng(self.seed) + self.X_ = np.array( + [X[i].flatten() for i in range(len(X))] + ) # n_samples, n_channels * n_timepoints + + self.window_length_ = max(1, int(self.X_.shape[1] * self.window_length)) + # Can replace with choice [-1, 1] + self.hash_funcs_ = self.rng_.uniform( + low=-1, high=1.0, size=(self.n_hash_funcs, self.window_length_) + ) + self.start_points_ = self.rng_.choice( + self.X_.shape[1] - self.window_length_ + 1, + size=self.n_hash_funcs, + replace=True, + ) + + bool_hashes = _collection_to_bool( + self.X_, self.hash_funcs_, self.start_points_, self.window_length_ + ) + # could yield this + str_hashes = [hash(bool_hashes[i].tobytes()) for i in range(len(bool_hashes))] + self.dict_X_index = {} + self.dict_bool_hashes = {} + for i in range(len(str_hashes)): + if str_hashes[i] in self.dict_X_index: + self.dict_X_index[str_hashes[i]].append(i) + else: + self.dict_X_index[str_hashes[i]] = [i] + self.dict_bool_hashes[str_hashes[i]] = bool_hashes[i] + + self.bool_hashes_value_list = np.asarray(list(self.dict_bool_hashes.values())) + self.bool_hashes_key_list = np.asarray(list(self.dict_bool_hashes.keys())) + + return self + + def update(self, X): + """ + . + + Parameters + ---------- + X : TYPE + DESCRIPTION. + + Returns + ------- + TYPE + DESCRIPTION. 
+ + """ + X_ = np.array( + [X[i].flatten() for i in range(len(X))] + ) # n_samples, n_channels * n_timepoints + bool_hashes = _collection_to_bool( + X_, self.hash_funcs_, self.start_points_, self.window_length_ + ) + + str_hashes = [hash(bool_hashes[i].tobytes()) for i in range(len(bool_hashes))] + base_index = self.X_.shape[0] + for i in range(len(str_hashes)): + if str_hashes[i] in self.dict_X_index: + self.dict_X_index[str_hashes[i]].append(i + base_index) + else: + self.dict_X_index[str_hashes[i]] = [i + base_index] + self.dict_bool_hashes[str_hashes[i]] = bool_hashes[i] + self.X_ = np.concatenate((self.X_, X_)) + + self.bool_hashes_value_list = np.asarray(list(self.dict_bool_hashes.values())) + self.bool_hashes_key_list = np.asarray(list(self.dict_bool_hashes.keys())) + return self + + def get_bucket_collection_indexes(self, X): + """ + . + + Parameters + ---------- + X : TYPE + DESCRIPTION. + + Returns + ------- + TYPE + DESCRIPTION. + + """ + bool_hash = _series_to_bool( + X.flatten(), self.hash_funcs_, self.start_points_, self.window_length_ + ) + str_hash = hash(bool_hash.tobytes()) + if str_hash in self.dict_X_index: + return self.dict_X_index[str_hash] + else: + return [] + + def predict(self, X, k=1): + """ + . + + Parameters + ---------- + X : TYPE + DESCRIPTION. + k : TYPE, optional + DESCRIPTION. The default is 1. + + Returns + ------- + top_k : TYPE + DESCRIPTION. + + """ + X_ = np.array([X[i].flatten() for i in range(len(X))]) + bool_hashes = _collection_to_bool( + X_, self.hash_funcs_, self.start_points_, self.window_length_ + ) + top_k = np.zeros((len(X), k), dtype=int) + dists = _hamming_dist_matrix(self.bool_hashes_value_list, bool_hashes) + self.h_dists = dists + # Deal with equality by merging bucket contents ? 
+ for i_x in range(len(X)): + ids = np.argsort(dists[i_x]) + _i = 0 + c = k + while c > 0: + candidates = self.dict_X_index[self.bool_hashes_key_list[ids[_i]]] + # Can do exact search by computing distances here + if len(candidates) > c: + candidates = candidates[:c] + top_k[i_x, k - c : k - c + len(candidates)] = candidates + c -= len(candidates) + _i += 1 + return top_k + + def find_motif(Index, X=None): + """ + . + + Parameters + ---------- + Index : TYPE + DESCRIPTION. + X : TYPE, optional + DESCRIPTION. The default is None. + + Returns + ------- + None. + + """ + pass diff --git a/aeon/similarity_search/subsequence_search/_stomp.py b/aeon/similarity_search/subsequence_search/_stomp.py index 0dd75971f8..21fdacdfcd 100644 --- a/aeon/similarity_search/subsequence_search/_stomp.py +++ b/aeon/similarity_search/subsequence_search/_stomp.py @@ -2,9 +2,6 @@ __maintainer__ = ["baraline"] - -from typing import Optional - import numpy as np from numba import njit, prange from numba.typed import List @@ -27,13 +24,13 @@ class StompMatrixProfile(BaseMatrixProfile): def compute_matrix_profile( self, - k, - threshold, - exclusion_size, - inverse_distance, - allow_neighboring_matches, - X: Optional[np.ndarray] = None, - X_index: Optional[int] = None, + X: np.ndarray, + k: int, + threshold: float, + exclusion_size: int, + inverse_distance: bool, + allow_neighboring_matches: bool, + X_index=None, ): """ Compute matrix profiles. @@ -44,6 +41,8 @@ def compute_matrix_profile( Parameters ---------- + X : np.ndarray, shape = (n_channels, n_timepoints) + A 2D array time series on which the matrix profile will be computed. k : int The number of best matches to return during predict for each subsequence. threshold : float @@ -55,14 +54,10 @@ def compute_matrix_profile( The size of the exclusion zone used to prevent returning as top k candidates the ones that are close to each other (for example i and i+1). 
It is used to define a region between - :math:`id_timestomp - exclusion_size` and - :math:`id_timestomp + exclusion_size` which cannot be returned - as best match if :math:`id_timestomp` was already selected. By default, + :math:`id_timestamp - exclusion_size` and + :math:`id_timestamp + exclusion_size` which cannot be returned + as best match if :math:`id_timestamp` was already selected. By default, the value None means that this is not used. - X : Optional[np.ndarray], optional - The time series on which the matrix profile will be compute. - The default is None, meaning that the series in the collection given in fit - will be used instead. X_index : Optional[int], optional If ``X`` is a series of the database given in fit, specify its index in ``X_``. If specified, each query of this series won't be able to match with @@ -70,73 +65,55 @@ def compute_matrix_profile( Returns ------- - MP : array of shape (series_length - L + 1,) + MP : array of shape (n_timepoints - L + 1,) Matrix profile distances for each query subsequence. If X is none, this will be a list of MP for each series in X_. - IP : array of shape (series_length - L + 1,) + IP : array of shape (n_timepoints - L + 1,) Indexes of the top matches for each query subsequence. If X is none, this will be a list of MP for each series in X_. 
""" - # If we compute matrix profiles for each series in X_ - if X is None: - MP = [] - IP = [] - for i in range(len(self.X_)): - _MP, _IP = self.compute_matrix_profile( - k, - threshold, - exclusion_size, - inverse_distance, - X=self.X_[i], - X_index=i, - ) - MP.append(_MP) - IP.append(_IP) - # else we compute matrix profiles using X on the series in X_ + XdotT = [ + get_ith_products(self.X[i], X, self.length, 0) for i in range(len(self.X_)) + ] + if isinstance(X, np.ndarray): + XdotT = np.asarray(XdotT) + elif isinstance(X, List): + XdotT = List(XdotT) + + if X_index is None: + X_means, X_stds = sliding_mean_std_one_series(X, self.length, 1) else: - XdotT = [ - get_ith_products(self.X[i], X, self.length, 0) - for i in range(len(self.X_)) - ] - if isinstance(X, np.ndarray): - XdotT = np.asarray(XdotT) - elif isinstance(X, List): - XdotT = List(XdotT) - - if X_index is None: - X_means, X_stds = sliding_mean_std_one_series(X, self.length, 1) - else: - X_means, X_stds = self.X_means_[i], self.X_stds_[i] - if self.normalise: - MP, IP = _stomp_normalised( - self.X_, - X, - XdotT, - self.X_means_, - self.X_stds_, - X_means, - X_stds, - self.length, - X_index, - k, - threshold, - allow_neighboring_matches, - exclusion_size, - inverse_distance, - ) - else: - MP, IP = _stomp( - self.X_, - X, - XdotT, - self.length, - X_index, - k, - threshold, - allow_neighboring_matches, - exclusion_size, - inverse_distance, - ) + X_means, X_stds = self.X_means_[X_index], self.X_stds_[X_index] + if self.normalise: + MP, IP = _stomp_normalised( + self.X_, + X, + XdotT, + self.X_means_, + self.X_stds_, + X_means, + X_stds, + self.length, + X_index, + k, + threshold, + allow_neighboring_matches, + exclusion_size, + inverse_distance, + ) + else: + MP, IP = _stomp( + self.X_, + X, + XdotT, + self.length, + X_index, + k, + threshold, + allow_neighboring_matches, + exclusion_size, + inverse_distance, + ) return MP, IP def compute_distance_profile(self, X: np.ndarray): @@ -237,9 +214,9 @@ def 
_stomp_normalised( The size of the exclusion zone used to prevent returning as top k candidates the ones that are close to each other (for example i and i+1). It is used to define a region between - :math:`id_timestomp - exclusion_size` and - :math:`id_timestomp + exclusion_size` which cannot be returned - as best match if :math:`id_timestomp` was already selected. By default, + :math:`id_timestamp - exclusion_size` and + :math:`id_timestamp + exclusion_size` which cannot be returned + as best match if :math:`id_timestamp` was already selected. By default, the value None means that this is not used. inverse_distance : bool If True, the matching will be made on the inverse of the distance, and thus, the @@ -331,9 +308,9 @@ def _stomp( The size of the exclusion zone used to prevent returning as top k candidates the ones that are close to each other (for example i and i+1). It is used to define a region between - :math:`id_timestomp - exclusion_size` and - :math:`id_timestomp + exclusion_size` which cannot be returned - as best match if :math:`id_timestomp` was already selected. By default, + :math:`id_timestamp - exclusion_size` and + :math:`id_timestamp + exclusion_size` which cannot be returned + as best match if :math:`id_timestamp` was already selected. By default, the value None means that this is not used. 
inverse_distance : bool If True, the matching will be made on the inverse of the distance, and thus, the diff --git a/aeon/similarity_search/subsequence_search/base.py b/aeon/similarity_search/subsequence_search/base.py index 7df358e5ec..dcc3f17dc8 100644 --- a/aeon/similarity_search/subsequence_search/base.py +++ b/aeon/similarity_search/subsequence_search/base.py @@ -47,9 +47,9 @@ def __init__( @final def find_motifs( self, + X: np.ndarray, k: Optional[int] = 1, threshold: Optional[float] = np.inf, - X: Optional[np.ndarray] = None, X_index: Optional[int] = None, inverse_distance: Optional[bool] = False, allow_neighboring_matches: Optional[bool] = False, @@ -65,15 +65,13 @@ def find_motifs( Parameters ---------- + X : np.ndarray, 2D array of shape (n_channels, n_timestamps) + A series in which we want to indentify motifs. k : int, optional Number of motifs to return threshold : int, optional A threshold on the similarity measure to determine which candidates will be part of a motif set. - X : np.ndarray, 2D array of shape (n_channels, n_timestamps), optional - A series in which we want to indentify motifs. If provided, the motifs - extracted should appear in X and in the database given in fit. If not - provided, the motifs will be extracted only from the database given in fit. X_index : Optional[int], optional If ``X`` is a series of the database given in fit, specify its index in ``X_``. If specified, each query of this series won't be able to match with @@ -93,25 +91,30 @@ def find_motifs( Returns ------- - list of ndarray, shape=(k,) - A list of at most ``k`` numpy arrays containing the indexes of the - candidates in each motif. + ndarray, shape=(k,) + A numpy array of at most ``k`` elements containing the indexes of the + motifs in X. + ndarray, shape=(k,) + A numpy array of at most ``k`` elements containing the distances of the + motifs macthes to the motif in X. 
""" self._check_is_fitted() + if X is not None: + self._check_find_neighbors_motif_format(X) prev_threads = get_num_threads() X_index = self._check_X_index_int(X_index) - motifs = self._find_motifs( + motifs_indexes, distances = self._find_motifs( + X, k=k, threshold=threshold, exclusion_factor=exclusion_factor, inverse_distance=inverse_distance, allow_neighboring_matches=allow_neighboring_matches, - X=X, X_index=X_index, ) set_num_threads(prev_threads) - return motifs + return motifs_indexes, distances @final def find_neighbors( @@ -171,11 +174,14 @@ def find_neighbors( """ self._check_is_fitted() - if self.length != X.shape[1] or self.n_channels_ != X.shape[0]: + + self._check_find_neighbors_motif_format(X) + if self.length != X.shape[1]: raise ValueError( - f"Expected a subsequence of shape {(self.n_channels_, self.length)} but" - f" got {X.shape}" + f"Expected X to be of shape {(self.n_channels_, self.length)} but" + f" got {X.shape} in find_neighbors." ) + X_index = self._check_X_index_array(X_index) prev_threads = get_num_threads() set_num_threads(self._n_jobs) @@ -313,8 +319,19 @@ def _compute_mean_std_from_collection(self, X: np.ndarray): else: return np.asarray(means), np.asarray(stds) - @abstractmethod - def _fit(self, X, y=None): ... 
+ def _fit(self, X, y=None): + if self.length >= self.min_timepoints_ or self.length < 1: + raise ValueError( + "The length of the query should be inferior or equal to the length of " + "data (X_) provided during fit, but got {} for X and {} for X_".format( + self.length, self.min_timepoints_ + ) + ) + + if self.normalise: + self.X_means_, self.X_stds_ = self._compute_mean_std_from_collection(X) + self.X_ = X + return self @abstractmethod def _find_motifs( @@ -322,8 +339,8 @@ def _find_motifs( X: np.ndarray, k: Optional[int] = 1, threshold: Optional[float] = np.inf, + X_index: Optional[int] = None, inverse_distance: Optional[bool] = False, - X_index=None, allow_neighboring_matches: Optional[bool] = False, exclusion_factor: Optional[float] = 2.0, ): ... @@ -344,27 +361,13 @@ def _find_neighbors( class BaseMatrixProfile(BaseSubsequenceSearch): """Base class for Matrix Profile methods using a length parameter.""" - def _fit(self, X, y=None): - if self.length >= self.min_timepoints_: - raise ValueError( - "The length of the query should be inferior or equal to the length of " - "data (X_) provided during fit, but got {} for X and {} for X_".format( - self.length, self.min_timepoints_ - ) - ) - - if self.normalise: - self.X_means_, self.X_stds_ = self._compute_mean_std_from_collection(X) - self.X_ = X - return self - def _find_motifs( self, X: np.ndarray, k: Optional[int] = 1, threshold: Optional[float] = np.inf, + X_index: Optional[int] = None, inverse_distance: Optional[bool] = False, - X_index=None, allow_neighboring_matches: Optional[bool] = False, exclusion_factor: Optional[float] = 2.0, ): @@ -379,7 +382,15 @@ def _find_motifs( X=X, X_index=X_index, ) - # TODO : implement logic here + # TODO check motif extraction logic, sure its not this one + MP_avg = np.array([np.mean(MP[i]) for i in range(len(MP))]) + return _extract_top_k_from_dist_profile( + MP_avg, + k, + threshold, + allow_neighboring_matches, + exclusion_size, + ) def _find_neighbors( self, @@ -451,12 
+462,12 @@ def _find_neighbors( @abstractmethod def compute_matrix_profile( self, - k, - threshold, - exclusion_size, - inverse_distance, - allow_neighboring_matches, - X: Optional[np.ndarray] = None, + X: np.ndarray, + k: int, + threshold: float, + exclusion_size: int, + inverse_distance: bool, + allow_neighboring_matches: bool, X_index: Optional[int] = None, ): """Compute matrix profiles between X_ and X or between all series in X_.""" diff --git a/aeon/similarity_search/subsequence_search/tests/test_base.py b/aeon/similarity_search/subsequence_search/tests/test_base.py new file mode 100644 index 0000000000..a037879992 --- /dev/null +++ b/aeon/similarity_search/subsequence_search/tests/test_base.py @@ -0,0 +1,78 @@ +"""Test for subsequence search base class.""" + +import pytest + +from aeon.testing.mock_estimators._mock_similarity_searchers import ( + MockMatrixProfile, + MockSubsequenceSearch, +) +from aeon.testing.testing_data import ( + make_example_1d_numpy, + make_example_2d_numpy_series, + make_example_3d_numpy, + make_example_3d_numpy_list, +) + +BASES = [MockMatrixProfile, MockSubsequenceSearch] + + +@pytest.mark.parametrize("base", BASES) +def test_input_shape_fit_neighbord_motifs(base): + """Test input shapes.""" + estimator = base() + # dummy data to pass to fit when testing predict/predict_proba + X_3D_uni = make_example_3d_numpy(n_channels=1, return_y=False) + X_3D_multi = make_example_3d_numpy(n_channels=2, return_y=False) + X_3D_uni_list = make_example_3d_numpy_list(n_channels=1, return_y=False) + X_3D_multi_list = make_example_3d_numpy_list(n_channels=2, return_y=False) + X_2D_uni = make_example_2d_numpy_series(n_channels=1) + X_2D_multi = make_example_2d_numpy_series(n_channels=2) + X_1D = make_example_1d_numpy() + + valid_inputs_fit = [X_3D_uni, X_3D_multi, X_3D_uni_list, X_3D_multi_list] + # Valid inputs + for _input in valid_inputs_fit: + estimator.fit(_input) + + invalid_inputs_fit = [X_2D_uni, X_2D_multi, X_1D] + for _input in 
invalid_inputs_fit: + with pytest.raises(TypeError): + estimator.fit(_input) + + valid_inputs_neighboord_motifs_uni = [X_2D_uni] + invalid_inputs_neighboord_motifs_uni = [ + X_1D, + X_3D_uni, + X_3D_uni_list, + ] + invalid_inputs_neighboord_motifs_multi = [ + X_3D_multi, + X_3D_multi_list, + ] + L = 5 + estimator_multi = base(length=L).fit(X_3D_multi) + estimator_uni = base(length=L).fit(X_3D_uni) + + for _input in valid_inputs_neighboord_motifs_uni: + estimator_uni.find_neighbors(_input[:, :L]) + estimator_uni.find_motifs(X=_input) + with pytest.raises(ValueError): + # Wrong number of channels + estimator_multi.find_neighbors(_input) + estimator_multi.find_motifs(X=_input) + # X length not of size L + estimator_uni.find_neighbors(X=_input[:, : L + 2]) + + for _input in invalid_inputs_neighboord_motifs_uni: + with pytest.raises(TypeError): + estimator_uni.find_neighbors(_input) + estimator_uni.find_motifs(X=_input) + estimator_multi.find_neighbors(_input) + estimator_multi.find_motifs(X=_input) + + for _input in invalid_inputs_neighboord_motifs_multi: + with pytest.raises(TypeError): + estimator_uni.find_neighbors(_input) + estimator_uni.find_motifs(X=_input) + estimator_multi.find_neighbors(_input) + estimator_multi.find_motifs(X=_input) diff --git a/aeon/similarity_search/tests/test_base.py b/aeon/similarity_search/tests/test_base.py new file mode 100644 index 0000000000..e066e14680 --- /dev/null +++ b/aeon/similarity_search/tests/test_base.py @@ -0,0 +1 @@ +"""Tests for base similarity search.""" diff --git a/aeon/testing/mock_estimators/_mock_similarity_searchers.py b/aeon/testing/mock_estimators/_mock_similarity_searchers.py index f7c0acfc98..89c6121c38 100644 --- a/aeon/testing/mock_estimators/_mock_similarity_searchers.py +++ b/aeon/testing/mock_estimators/_mock_similarity_searchers.py @@ -15,6 +15,8 @@ class MockMatrixProfile(BaseMatrixProfile): + """Mock estimator for BaseMatrixProfile.""" + def __init__(self, length=3): super().__init__(length=length) @@ 
-35,25 +37,24 @@ def compute_matrix_profile( def compute_distance_profile(self, X): """Compute distrance profiles between X_ and X (a series of size length).""" - return np.zeros(X.shape[1] - self.length + 1) + return [ + np.zeros(self.X_[i].shape[1] - self.length + 1) for i in range(len(self.X_)) + ] class MockSubsequenceSearch(BaseSubsequenceSearch): - """MockSeriesTransformer to set tags.""" + """Mock estimator for BaseSubsequenceSearch.""" def __init__(self, length=3): super().__init__(length=length) - def _fit(self, X, y=None): - return self - def _find_motifs( self, - X, k=1, threshold=np.inf, - inverse_distance=False, + X=None, X_index=None, + inverse_distance=False, allow_neighboring_matches=False, exclusion_factor=2.0, ): From f23c7203f9cc6ba295c187d1d227c8af1329daa4 Mon Sep 17 00:00:00 2001 From: baraline Date: Thu, 2 Jan 2025 11:23:49 +0100 Subject: [PATCH 06/18] Fix subsequence_search tests --- aeon/similarity_search/__init__.py | 2 +- aeon/similarity_search/{base.py => _base.py} | 0 aeon/similarity_search/series_search/base.py | 2 +- .../subsequence_search/__init__.py | 10 ++++- .../subsequence_search/base.py | 6 +-- .../subsequence_search/tests/__init__.py | 2 +- .../subsequence_search/tests/test_base.py | 24 +++++++---- .../tests/test_brute_force.py | 42 ++++++++++--------- .../subsequence_search/tests/test_commons.py | 15 ++++--- .../subsequence_search/tests/test_stomp.py | 10 ++--- .../_mock_similarity_searchers.py | 4 +- aeon/testing/utils/estimator_checks.py | 2 +- aeon/utils/base/_register.py | 14 +++---- aeon/utils/discovery.py | 1 + 14 files changed, 73 insertions(+), 61 deletions(-) rename aeon/similarity_search/{base.py => _base.py} (100%) diff --git a/aeon/similarity_search/__init__.py b/aeon/similarity_search/__init__.py index cdf5cebd84..26b79c7da2 100644 --- a/aeon/similarity_search/__init__.py +++ b/aeon/similarity_search/__init__.py @@ -2,4 +2,4 @@ __all__ = ["BaseSimilaritySearch"] -from aeon.similarity_search.base import 
BaseSimilaritySearch +from aeon.similarity_search._base import BaseSimilaritySearch diff --git a/aeon/similarity_search/base.py b/aeon/similarity_search/_base.py similarity index 100% rename from aeon/similarity_search/base.py rename to aeon/similarity_search/_base.py diff --git a/aeon/similarity_search/series_search/base.py b/aeon/similarity_search/series_search/base.py index db83519c04..bcbc92c042 100644 --- a/aeon/similarity_search/series_search/base.py +++ b/aeon/similarity_search/series_search/base.py @@ -2,7 +2,7 @@ __maintainer__ = ["baraline"] -from aeon.similarity_search.base import BaseSimilaritySearch +from aeon.similarity_search._base import BaseSimilaritySearch class BaseSeriesSearch(BaseSimilaritySearch): diff --git a/aeon/similarity_search/subsequence_search/__init__.py b/aeon/similarity_search/subsequence_search/__init__.py index c5de805eb6..5d64f901bd 100644 --- a/aeon/similarity_search/subsequence_search/__init__.py +++ b/aeon/similarity_search/subsequence_search/__init__.py @@ -1,7 +1,15 @@ """Subsequence search module.""" -__all__ = ["BaseSubsequenceSearch", "BaseMatrixProfile", "StompMatrixProfile"] +__all__ = [ + "BaseSubsequenceSearch", + "BaseMatrixProfile", + "StompMatrixProfile", + "BruteForceMatrixProfile", +] +from aeon.similarity_search.subsequence_search._brute_force import ( + BruteForceMatrixProfile, +) from aeon.similarity_search.subsequence_search._stomp import StompMatrixProfile from aeon.similarity_search.subsequence_search.base import ( BaseMatrixProfile, diff --git a/aeon/similarity_search/subsequence_search/base.py b/aeon/similarity_search/subsequence_search/base.py index dcc3f17dc8..238b749b5f 100644 --- a/aeon/similarity_search/subsequence_search/base.py +++ b/aeon/similarity_search/subsequence_search/base.py @@ -10,7 +10,7 @@ from numba import get_num_threads, set_num_threads from numba.typed import List -from aeon.similarity_search.base import BaseSimilaritySearch +from aeon.similarity_search._base import 
BaseSimilaritySearch from aeon.similarity_search.subsequence_search._commons import ( _extract_top_k_from_dist_profile, _inverse_distance_profile_list, @@ -34,7 +34,6 @@ class BaseSubsequenceSearch(BaseSimilaritySearch): Number of parallel jobs to use. The default is 1. """ - @abstractmethod def __init__( self, length: int, @@ -383,7 +382,8 @@ def _find_motifs( X_index=X_index, ) # TODO check motif extraction logic, sure its not this one - MP_avg = np.array([np.mean(MP[i]) for i in range(len(MP))]) + MP_avg = np.array([[np.mean(MP[i]) for i in range(len(MP))]]) + # TODO: appening IP of identified motifs to return to get motifs matches in X_ return _extract_top_k_from_dist_profile( MP_avg, k, diff --git a/aeon/similarity_search/subsequence_search/tests/__init__.py b/aeon/similarity_search/subsequence_search/tests/__init__.py index 3feb8d4ca5..0287f2ee04 100644 --- a/aeon/similarity_search/subsequence_search/tests/__init__.py +++ b/aeon/similarity_search/subsequence_search/tests/__init__.py @@ -1 +1 @@ -"""Tests for series methods.""" +"""Tests for subsequence search methods.""" diff --git a/aeon/similarity_search/subsequence_search/tests/test_base.py b/aeon/similarity_search/subsequence_search/tests/test_base.py index a037879992..e1a314f38a 100644 --- a/aeon/similarity_search/subsequence_search/tests/test_base.py +++ b/aeon/similarity_search/subsequence_search/tests/test_base.py @@ -55,24 +55,32 @@ def test_input_shape_fit_neighbord_motifs(base): for _input in valid_inputs_neighboord_motifs_uni: estimator_uni.find_neighbors(_input[:, :L]) - estimator_uni.find_motifs(X=_input) + estimator_uni.find_motifs(_input) with pytest.raises(ValueError): # Wrong number of channels estimator_multi.find_neighbors(_input) - estimator_multi.find_motifs(X=_input) - # X length not of size L - estimator_uni.find_neighbors(X=_input[:, : L + 2]) + with pytest.raises(ValueError): + estimator_multi.find_motifs(_input) + # X length not of size L + with pytest.raises(ValueError): + 
estimator_uni.find_neighbors(_input[:, : L + 2]) for _input in invalid_inputs_neighboord_motifs_uni: with pytest.raises(TypeError): estimator_uni.find_neighbors(_input) - estimator_uni.find_motifs(X=_input) + with pytest.raises(TypeError): + estimator_uni.find_motifs(_input) + with pytest.raises(TypeError): estimator_multi.find_neighbors(_input) - estimator_multi.find_motifs(X=_input) + with pytest.raises(TypeError): + estimator_multi.find_motifs(_input) for _input in invalid_inputs_neighboord_motifs_multi: with pytest.raises(TypeError): estimator_uni.find_neighbors(_input) - estimator_uni.find_motifs(X=_input) + with pytest.raises(TypeError): + estimator_uni.find_motifs(_input) + with pytest.raises(TypeError): estimator_multi.find_neighbors(_input) - estimator_multi.find_motifs(X=_input) + with pytest.raises(TypeError): + estimator_multi.find_motifs(_input) diff --git a/aeon/similarity_search/subsequence_search/tests/test_brute_force.py b/aeon/similarity_search/subsequence_search/tests/test_brute_force.py index 3c92138b3d..cc87b34a9f 100644 --- a/aeon/similarity_search/subsequence_search/tests/test_brute_force.py +++ b/aeon/similarity_search/subsequence_search/tests/test_brute_force.py @@ -27,7 +27,7 @@ make_example_3d_numpy, make_example_3d_numpy_list, ) -from aeon.utils.numba.general import sliding_mean_std_one_series +from aeon.utils.numba.general import sliding_mean_std_one_series, z_normalise_series_2d K_VALUES = [1, 3, 5] NN_MATCHES = [True, False] @@ -60,17 +60,22 @@ def test__compute_dist_profile(): assert_almost_equal(dist_profile[i_t], np.sum((X[:, i_t : i_t + L] - Q) ** 2)) +@pytest.mark.parametrize("normalise", NORMALISE) def test__naive_squared_distance_profile(normalise): """Test Euclidean distance profile calculation.""" L = 3 X = make_example_3d_numpy(n_cases=3, n_channels=1, n_timepoints=10, return_y=False) Q = make_example_2d_numpy_series(n_channels=1, n_timepoints=L) - dist_profiles = _naive_squared_distance_profile(X, Q) + dist_profiles = 
_naive_squared_distance_profile(X, Q, normalise=normalise) + + if normalise: + Q = z_normalise_series_2d(Q) for i_x in range(len(X)): for i_t in range(X[i_x].shape[1] - L + 1): - assert_almost_equal( - dist_profiles[i_x][i_t], np.sum((X[i_x, :, i_t : i_t + L] - Q) ** 2) - ) + _x = X[i_x, :, i_t : i_t + L] + if normalise: + _x = z_normalise_series_2d(_x) + assert_almost_equal(dist_profiles[i_x][i_t], np.sum((_x - Q) ** 2)) # test unequal length and multivariate X = List( @@ -84,22 +89,21 @@ def test__naive_squared_distance_profile(normalise): ) Q = make_example_2d_numpy_series(n_channels=2, n_timepoints=L) - dist_profiles = _naive_squared_distance_profile(X, Q) + dist_profiles = _naive_squared_distance_profile(X, Q, normalise=normalise) + if normalise: + Q = z_normalise_series_2d(Q) for i_x in range(len(X)): for i_t in range(X[i_x].shape[1] - L + 1): - assert_almost_equal( - dist_profiles[i_x][i_t], np.sum((X[i_x][:, i_t : i_t + L] - Q) ** 2) - ) - - -@pytest.mark.parametrize( - [ - ("k", K_VALUES), - ("allow_neighboring_matches", NN_MATCHES), - ("inverse_distance", INVERSE), - ("normalise", NORMALISE), - ] -) + _x = X[i_x][:, i_t : i_t + L] + if normalise: + _x = z_normalise_series_2d(_x) + assert_almost_equal(dist_profiles[i_x][i_t], np.sum((_x - Q) ** 2)) + + +@pytest.mark.parametrize("k", K_VALUES) +@pytest.mark.parametrize("allow_neighboring_matches", NN_MATCHES) +@pytest.mark.parametrize("inverse_distance", INVERSE) +@pytest.mark.parametrize("normalise", NORMALISE) def test__naive_squared_matrix_profile( k, allow_neighboring_matches, inverse_distance, normalise ): diff --git a/aeon/similarity_search/subsequence_search/tests/test_commons.py b/aeon/similarity_search/subsequence_search/tests/test_commons.py index 70e443a78c..50c5cfad31 100644 --- a/aeon/similarity_search/subsequence_search/tests/test_commons.py +++ b/aeon/similarity_search/subsequence_search/tests/test_commons.py @@ -17,6 +17,10 @@ make_example_2d_numpy_series, ) +K_VALUES = [1, 3, 5] +THRESHOLDS 
= [np.inf, 0.7] +NN_MATCHES = [False, True] + def test_fft_sliding_dot_product(): """Test the fft_sliding_dot_product function.""" @@ -60,14 +64,9 @@ def test__inverse_distance_profile_list(): assert_array_almost_equal(1 / (X[1] + 1e-8), T[1]) -K_VALUES = [1, 3, 5] -THRESHOLDS = [np.inf, 0.7] -NN_MATCHES = [False, True] - - -@pytest.mark.parametrize( - [("k", K_VALUES), ("threshold", THRESHOLDS), ("allow_nn_matches", NN_MATCHES)] -) +@pytest.mark.parametrize("k", K_VALUES) +@pytest.mark.parametrize("threshold", THRESHOLDS) +@pytest.mark.parametrize("allow_nn_matches", NN_MATCHES) def test__extract_top_k_from_dist_profile(k, threshold, allow_nn_matches): """Test method to esxtract the top k candidates from a list of distance profiles.""" X = make_example_2d_numpy_list( diff --git a/aeon/similarity_search/subsequence_search/tests/test_stomp.py b/aeon/similarity_search/subsequence_search/tests/test_stomp.py index 7a655035a5..12d7738eaf 100644 --- a/aeon/similarity_search/subsequence_search/tests/test_stomp.py +++ b/aeon/similarity_search/subsequence_search/tests/test_stomp.py @@ -206,13 +206,9 @@ def test__normalised_squared_distance_profile(): ) -@pytest.mark.parametrize( - [ - ("k", K_VALUES), - ("allow_neighboring_matches", NN_MATCHES), - ("inverse_distance", INVERSE), - ] -) +@pytest.mark.parametrize("k", K_VALUES) +@pytest.mark.parametrize("allow_neighboring_matches", NN_MATCHES) +@pytest.mark.parametrize("inverse_distance", INVERSE) def test__stomp(k, allow_neighboring_matches, inverse_distance): """Test STOMP method.""" L = 3 diff --git a/aeon/testing/mock_estimators/_mock_similarity_searchers.py b/aeon/testing/mock_estimators/_mock_similarity_searchers.py index 89c6121c38..824a627d7a 100644 --- a/aeon/testing/mock_estimators/_mock_similarity_searchers.py +++ b/aeon/testing/mock_estimators/_mock_similarity_searchers.py @@ -32,7 +32,7 @@ def compute_matrix_profile( ): """Compute matrix profiles between X_ and X or between all series in X_.""" return 
np.zeros((X.shape[1] - self.length + 1, k)), np.zeros( - (X.shape[1] - self.length + 1, k, 2) + (X.shape[1] - self.length + 1, k, 2), dtype=np.int64 ) def compute_distance_profile(self, X): @@ -50,9 +50,9 @@ def __init__(self, length=3): def _find_motifs( self, + X, k=1, threshold=np.inf, - X=None, X_index=None, inverse_distance=False, allow_neighboring_matches=False, diff --git a/aeon/testing/utils/estimator_checks.py b/aeon/testing/utils/estimator_checks.py index b2e0973dbf..d556ff0249 100644 --- a/aeon/testing/utils/estimator_checks.py +++ b/aeon/testing/utils/estimator_checks.py @@ -7,7 +7,7 @@ import numpy as np -from aeon.similarity_search.base import BaseSimilaritySearch +from aeon.similarity_search import BaseSimilaritySearch from aeon.testing.testing_data import FULL_TEST_DATA_DICT from aeon.utils.validation import get_n_cases diff --git a/aeon/utils/base/_register.py b/aeon/utils/base/_register.py index 5bf4045ad5..9c576c350a 100644 --- a/aeon/utils/base/_register.py +++ b/aeon/utils/base/_register.py @@ -24,15 +24,13 @@ from aeon.forecasting.base import BaseForecaster from aeon.regression.base import BaseRegressor from aeon.segmentation.base import BaseSegmenter -from aeon.similarity_search.base import BaseSimilaritySearch -from aeon.similarity_search.subsequence_search.base import ( - BaseMatrixProfile, - BaseSubsequenceSearch, -) +from aeon.similarity_search.subsequence_search.base import BaseSubsequenceSearch from aeon.transformations.base import BaseTransformer from aeon.transformations.collection import BaseCollectionTransformer from aeon.transformations.series import BaseSeriesTransformer +# from aeon.similarity_search.series_search.base import BaseSeriesSearch + # all base classes BASE_CLASS_REGISTER = { # abstract - no estimator directly inherits from these @@ -48,11 +46,10 @@ "early_classifier": BaseEarlyClassifier, "regressor": BaseRegressor, "segmenter": BaseSegmenter, - "similarity_searcher": BaseSimilaritySearch, - "subsequence_searcher": 
BaseSubsequenceSearch, - "matrixprofile_searcher": BaseMatrixProfile, "series-transformer": BaseSeriesTransformer, "forecaster": BaseForecaster, + "subsequence_searcher": BaseSubsequenceSearch, + # "series_searcher": BaseSeriesSearch, } # base classes which are valid for estimator to directly inherit from @@ -64,6 +61,5 @@ "collection-estimator", "series-estimator", "transformer", - "similarity_searcher", } } diff --git a/aeon/utils/discovery.py b/aeon/utils/discovery.py index 8fd4a05efe..d6e5ce61fc 100644 --- a/aeon/utils/discovery.py +++ b/aeon/utils/discovery.py @@ -92,6 +92,7 @@ def all_estimators( # ignore test modules and base classes "base", "tests", + "similarity_search" # ignore these submodules "benchmarking", "datasets", From c372969a92257da14ea40a8f238d9f6160b331e4 Mon Sep 17 00:00:00 2001 From: baraline Date: Thu, 2 Jan 2025 13:56:20 +0100 Subject: [PATCH 07/18] debug brute force mp --- .../subsequence_search/_brute_force.py | 2 +- .../subsequence_search/_commons.py | 2 +- .../tests/test_brute_force.py | 41 ++++++++++++------- 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/aeon/similarity_search/subsequence_search/_brute_force.py b/aeon/similarity_search/subsequence_search/_brute_force.py index 6c26925a32..a7227499b6 100644 --- a/aeon/similarity_search/subsequence_search/_brute_force.py +++ b/aeon/similarity_search/subsequence_search/_brute_force.py @@ -281,7 +281,7 @@ def _naive_squared_matrix_profile( X_subs.append(i_subs) for i_q in range(n_queries): - Q = T[:, i : i + L] + Q = T[:, i_q : i_q + L] if normalise: Q = z_normalise_series_2d(Q) for i_x in prange(len(X)): diff --git a/aeon/similarity_search/subsequence_search/_commons.py b/aeon/similarity_search/subsequence_search/_commons.py index 03e0ee1ac3..c13a7381bc 100644 --- a/aeon/similarity_search/subsequence_search/_commons.py +++ b/aeon/similarity_search/subsequence_search/_commons.py @@ -141,7 +141,7 @@ def _extract_top_k_from_dist_profile( _top_k_distances = 
dist_profiles[i_profile][_top_k_indexes] # Extract top-k with neighboring matches else: - _top_k_indexes = np.argpartition(dist_profiles[i_profile], k)[:k] + _top_k_indexes = np.argsort(dist_profiles[i_profile])[:k] _top_k_distances = dist_profiles[i_profile][_top_k_indexes] # Select overall top k by using the buffer array of size 2*k diff --git a/aeon/similarity_search/subsequence_search/tests/test_brute_force.py b/aeon/similarity_search/subsequence_search/tests/test_brute_force.py index cc87b34a9f..9ef0eb44e8 100644 --- a/aeon/similarity_search/subsequence_search/tests/test_brute_force.py +++ b/aeon/similarity_search/subsequence_search/tests/test_brute_force.py @@ -11,7 +11,11 @@ import numpy as np import pytest from numba.typed import List -from numpy.testing import assert_almost_equal, assert_array_almost_equal +from numpy.testing import ( + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, +) from aeon.similarity_search.subsequence_search._brute_force import ( _compute_dist_profile, @@ -27,7 +31,11 @@ make_example_3d_numpy, make_example_3d_numpy_list, ) -from aeon.utils.numba.general import sliding_mean_std_one_series, z_normalise_series_2d +from aeon.utils.numba.general import ( + get_all_subsequences, + sliding_mean_std_one_series, + z_normalise_series_2d, +) K_VALUES = [1, 3, 5] NN_MATCHES = [True, False] @@ -51,18 +59,18 @@ def _get_mean_sdts_inputs(X, Q, L): def test__compute_dist_profile(): - """Test Euclidean distance.""" + """Test Euclidean distance with brute force.""" L = 3 X = make_example_2d_numpy_series(n_channels=1, n_timepoints=10) Q = make_example_2d_numpy_series(n_channels=1, n_timepoints=L) - dist_profile = _compute_dist_profile(X, Q) + dist_profile = _compute_dist_profile(get_all_subsequences(X, L, 1), Q) for i_t in range(X.shape[1] - L + 1): assert_almost_equal(dist_profile[i_t], np.sum((X[:, i_t : i_t + L] - Q) ** 2)) @pytest.mark.parametrize("normalise", NORMALISE) def test__naive_squared_distance_profile(normalise): 
- """Test Euclidean distance profile calculation.""" + """Test Euclidean distance profile calculation with brute force.""" L = 3 X = make_example_3d_numpy(n_cases=3, n_channels=1, n_timepoints=10, return_y=False) Q = make_example_2d_numpy_series(n_channels=1, n_timepoints=L) @@ -107,18 +115,20 @@ def test__naive_squared_distance_profile(normalise): def test__naive_squared_matrix_profile( k, allow_neighboring_matches, inverse_distance, normalise ): - """Test STOMP method.""" + """Test brute force matrix profile method.""" L = 3 - - X = make_example_3d_numpy_list( - n_cases=3, - n_channels=2, - min_n_timepoints=6, - max_n_timepoints=8, - return_y=False, + X = List( + make_example_3d_numpy_list( + n_cases=3, + n_channels=2, + min_n_timepoints=6, + max_n_timepoints=8, + return_y=False, + ) ) + X_copy = X.copy() T = make_example_2d_numpy_series(n_channels=2, n_timepoints=5) - + T_copy = T.copy() T_index = None threshold = np.inf exclusion_size = L @@ -136,6 +146,9 @@ def test__naive_squared_matrix_profile( inverse_distance, normalise=normalise, ) + assert_array_equal(T, T_copy) + for i in range(len(X)): + assert_array_equal(X[i], X_copy[i]) # For each query of size L in T for i in range(T.shape[1] - L + 1): dist_profiles = _naive_squared_distance_profile( From d7da68bf4f7806b78b2532baba5c77f21d80f447 Mon Sep 17 00:00:00 2001 From: baraline Date: Thu, 2 Jan 2025 14:50:20 +0100 Subject: [PATCH 08/18] more debug of subsequence tests --- aeon/similarity_search/_base.py | 89 +++++++- .../similarity_search/series_search/_r_lsh.py | 2 +- aeon/similarity_search/series_search/base.py | 209 +++++++++++++++++- .../subsequence_search/_brute_force.py | 8 + .../subsequence_search/_stomp.py | 9 + .../subsequence_search/base.py | 1 + .../subsequence_search/tests/test_commons.py | 5 +- .../subsequence_search/tests/test_stomp.py | 10 +- .../_mock_similarity_searchers.py | 18 +- aeon/utils/numba/general.py | 89 ++++++-- 10 files changed, 396 insertions(+), 44 deletions(-) diff --git 
a/aeon/similarity_search/_base.py b/aeon/similarity_search/_base.py index d486491fa4..476dc664c2 100644 --- a/aeon/similarity_search/_base.py +++ b/aeon/similarity_search/_base.py @@ -90,7 +90,6 @@ def find_motifs( X: np.ndarray, k: int, threshold: float, - allow_overlap: Optional[bool] = True, ): """ Find the top-k motifs in the training data. @@ -109,9 +108,6 @@ def find_motifs( threshold : int, optional A threshold on the similarity measure to determine which candidates will be part of a motif set. - allow_overlap: bool, optional - Wheter a candidate can be part of multiple motif sets (True), or if motif - sets should be mutually exclusive (False). Returns ------- @@ -194,3 +190,88 @@ def _check_find_neighbors_motif_format(self, X): @abstractmethod def _fit(self, X, y=None): ... + + def _check_X_index_int(self, X_index: int): + """ + Check wheter the X_index parameter is correctly formated and is admissible. + + This check is made for motif search functions. + + Parameters + ---------- + X_index : int + Index of a series in X_. + + Returns + ------- + X_index : int + Index of a series in X_ + + """ + if X_index is not None: + if not isinstance(X_index, int): + raise TypeError("Expected an integer for X_index but got {X_index}") + + if X_index >= self.n_cases_ or X_index < 0: + raise ValueError( + "The value of X_index cannot exced the number " + "of series in the collection given during fit. Expected a value " + f"between [0, {self.n_cases_ - 1}] but got {X_index}" + ) + return X_index + + def _check_X_index_array(self, X_index: np.ndarray): + """ + Check wheter the X_index parameter is correctly formated and is admissible. + + This check is made for neighbour search functions. + + Parameters + ---------- + X_index : np.ndarray, 1D array of shape (2) + Array of integer containing the sample and timestamp identifiers of the + starting point of a subsequence in X_. 
+ + Returns + ------- + X_index : np.ndarray, 1D array of shape (2) + Array of integer containing the sample and timestamp identifiers of the + starting point of a subsequence in X_. + + """ + if X_index is not None: + if ( + isinstance(X_index, list) + and len(X_index) == 2 + and isinstance(X_index[0], int) + and isinstance(X_index[1], int) + ): + X_index = np.asarray(X_index, dtype=int) + elif len(X_index) != 2: + raise TypeError( + "Expected a numpy array or list of integers with 2 elements " + f"for X_index but got {X_index}" + ) + elif ( + not (isinstance(X_index[0], int) or not isinstance(X_index[1], int)) + or X_index.dtype != int + ): + raise TypeError( + "Expected a numpy array or list of integers for X_index but got " + f"{X_index}" + ) + + if X_index[0] >= self.n_cases_ or X_index[0] < 0: + raise ValueError( + "The sample ID (first element) of X_index cannot exced the number " + "of series in the collection given during fit. Expected a value " + f"between [0, {self.n_cases_ - 1}] but got {X_index[0]}" + ) + _max_timestamp = self.X_[X_index[0]].shape[1] - self.length + 1 + if X_index[1] >= _max_timestamp: + raise ValueError( + "The timestamp ID (second element) of X_index cannot exced the " + "number of timestamps minus the length parameter plus one. 
class BaseSeriesSearch(BaseSimilaritySearch):
    """
    Base class for similarity search on whole time series.

    Parameters
    ----------
    normalise : bool, optional
        Whether the inputs should be z-normalised. The default is False.
    n_jobs : int, optional
        Number of parallel jobs to use. The default is 1.
    """

    @final
    def find_motifs(
        self,
        X: np.ndarray,
        k: Optional[int] = 1,
        threshold: Optional[float] = np.inf,
        X_index: Optional[int] = None,
        inverse_distance: Optional[bool] = False,
    ):
        """
        Find the top-k motifs in the training data.

        Given ``k`` and ``threshold`` parameters, this method returns the top-k motif
        sets. We define a motif set as a set of candidates which all are at a distance
        of at most ``threshold`` from each other. The top-k motif sets are the
        motif sets with the most candidates.

        Parameters
        ----------
        X : np.ndarray, 2D array of shape (n_channels, n_timestamps)
            A series in which we want to identify motifs.
        k : int, optional
            Number of motifs to return.
        threshold : float, optional
            A threshold on the similarity measure to determine which candidates will
            be part of a motif set.
        X_index : Optional[int], optional
            If ``X`` is a series of the database given in fit, specify its index in
            ``X_``. If specified, this series won't be able to match with itself.
        inverse_distance : bool, optional
            Whether to inverse the computed distance, meaning that the method will
            return the anomalies instead of motifs.

        Returns
        -------
        ndarray, shape=(k,)
            A numpy array of at most ``k`` elements containing the indexes of the
            motifs in X.
        ndarray, shape=(k,)
            A numpy array of at most ``k`` elements containing the distances of the
            motifs matches to the motif in X.

        """
        self._check_is_fitted()
        if X is not None:
            self._check_find_neighbors_motif_format(X)
        X_index = self._check_X_index_int(X_index)

        prev_threads = get_num_threads()
        # Apply the requested number of jobs, as find_neighbors does, and always
        # restore the caller's numba thread count, even if the search raises.
        set_num_threads(self._n_jobs)
        try:
            motifs_indexes, distances = self._find_motifs(
                X,
                k=k,
                threshold=threshold,
                inverse_distance=inverse_distance,
                X_index=X_index,
            )
        finally:
            set_num_threads(prev_threads)
        return motifs_indexes, distances

    @final
    def find_neighbors(
        self,
        X: np.ndarray,
        k: Optional[int] = 1,
        threshold: Optional[float] = np.inf,
        X_index: Optional[int] = None,
        inverse_distance: Optional[bool] = False,
    ):
        """
        Find the top-k neighbors of X in the database.

        Given ``k`` and ``threshold`` parameters, this method returns the top-k
        neighbors of X, such as each of the ``k`` neighbors has a distance inferior or
        equal to ``threshold``. By default, ``threshold`` is set to infinity. It is
        possible for this method to return less than ``k`` neighbors, either if there
        is less than ``k`` admissible candidates in the database, or if in the top-k
        candidates, some do not meet the ``threshold`` condition.

        Parameters
        ----------
        X : np.ndarray, 2D array of shape (n_channels, length)
            The series for which we want to identify nearest neighbors in the
            database.
        k : int, optional
            Number of neighbors to return.
        threshold : float, optional
            A threshold on the distance to determine which candidates will be
            returned.
        X_index : Optional[int], optional
            If ``X`` is a series of the database given in fit, specify its index in
            ``X_``. If specified, this series won't be able to match with itself.
        inverse_distance : bool, optional
            Whether to inverse the computed distance, meaning that the method will
            return the k most dissimilar neighbors instead of the k most similar.

        Returns
        -------
        ndarray, shape=(k,)
            A numpy array of at most ``k`` elements containing the indexes of the
            neighbors.
        ndarray, shape=(k,)
            A numpy array of at most ``k`` elements containing the distances of the
            neighbors to X.

        Raises
        ------
        ValueError
            If the number of timepoints of ``X`` differs from ``self.length``.

        """
        self._check_is_fitted()

        self._check_find_neighbors_motif_format(X)
        if self.length != X.shape[1]:
            raise ValueError(
                f"Expected X to be of shape {(self.n_channels_, self.length)} but"
                f" got {X.shape} in find_neighbors."
            )

        X_index = self._check_X_index_int(X_index)
        prev_threads = get_num_threads()
        set_num_threads(self._n_jobs)
        try:
            neighbors, distances = self._find_neighbors(
                X,
                k=k,
                threshold=threshold,
                inverse_distance=inverse_distance,
                X_index=X_index,
            )
        finally:
            # Restore the caller's numba thread count even if the search raises.
            set_num_threads(prev_threads)
        if len(neighbors) < k:
            warnings.warn(
                f"The number of admissible neighbors found is {len(neighbors)}, instead"
                f" of {k}",
                stacklevel=2,
            )
        return neighbors, distances

    def _compute_mean_std_from_collection(self, X: np.ndarray):
        """
        Compute the mean and std of each channel for all series in X.

        Parameters
        ----------
        X : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints)
            Collection of series from which we extract mean and stds. If it is an
            unequal length collection, it should be a list of 2d numpy arrays.

        Returns
        -------
        Tuple(np.ndarray, np.ndarray)
            Both arrays are of shape (n_cases, n_channels), the first contains the
            means and the second the stds for each series in X.

        """
        means, stds = compute_mean_stds_collection_parallel(X)
        return means, stds

    def _fit(self, X, y=None):
        """
        Validate ``length`` against the data and store the collection.

        Parameters
        ----------
        X : np.ndarray or list of np.ndarray
            The collection of series to store in ``X_``.
        y : ignored
            Unused by this method.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``length`` is lower than 1 or not lower than ``min_timepoints_``.
        """
        if self.length >= self.min_timepoints_ or self.length < 1:
            raise ValueError(
                "The length of the query should be inferior or equal to the length of "
                "data (X_) provided during fit, but got {} for X and {} for X_".format(
                    self.length, self.min_timepoints_
                )
            )

        if self.normalise:
            self.X_means_, self.X_stds_ = self._compute_mean_std_from_collection(X)
        self.X_ = X
        return self

    @abstractmethod
    def _find_motifs(
        self,
        X: np.ndarray,
        k: Optional[int] = 1,
        threshold: Optional[float] = np.inf,
        X_index: Optional[int] = None,
        inverse_distance: Optional[bool] = False,
        allow_neighboring_matches: Optional[bool] = False,
        exclusion_factor: Optional[float] = 2.0,
    ):
        """Estimator-specific motif search, called by ``find_motifs``."""
        ...

    @abstractmethod
    def _find_neighbors(
        self,
        X: np.ndarray,
        k: Optional[int] = 1,
        threshold: Optional[float] = np.inf,
        inverse_distance: Optional[bool] = False,
        X_index=None,
        allow_neighboring_matches: Optional[bool] = False,
        exclusion_factor: Optional[float] = 2.0,
    ):
        """Estimator-specific neighbor search, called by ``find_neighbors``."""
        ...
def test_fft_sliding_dot_product():
    """Check fft_sliding_dot_product against a naive sliding dot product."""
    query_length = 4
    series = make_example_2d_numpy_series(n_channels=1, n_timepoints=10)
    query = make_example_2d_numpy_series(n_channels=1, n_timepoints=query_length)

    n_windows = series.shape[1] - query_length + 1
    expected = [
        np.dot(query[0], series[0, start : start + query_length])
        for start in range(n_windows)
    ]

    actual = fft_sliding_dot_product(series, query)
    # The inputs are univariate, so only the first channel is compared.
    assert_array_almost_equal(actual[0], expected)
inverse_distance): """Test STOMP normalised method.""" L = 3 diff --git a/aeon/testing/mock_estimators/_mock_similarity_searchers.py b/aeon/testing/mock_estimators/_mock_similarity_searchers.py index 824a627d7a..5362137ba9 100644 --- a/aeon/testing/mock_estimators/_mock_similarity_searchers.py +++ b/aeon/testing/mock_estimators/_mock_similarity_searchers.py @@ -17,8 +17,13 @@ class MockMatrixProfile(BaseMatrixProfile): """Mock estimator for BaseMatrixProfile.""" - def __init__(self, length=3): - super().__init__(length=length) + def __init__( + self, + length=3, + normalise=False, + n_jobs=1, + ): + super().__init__(length=length, n_jobs=n_jobs, normalise=normalise) def compute_matrix_profile( self, @@ -45,8 +50,13 @@ def compute_distance_profile(self, X): class MockSubsequenceSearch(BaseSubsequenceSearch): """Mock estimator for BaseSubsequenceSearch.""" - def __init__(self, length=3): - super().__init__(length=length) + def __init__( + self, + length=3, + normalise=False, + n_jobs=1, + ): + super().__init__(length=length, n_jobs=n_jobs, normalise=normalise) def _find_motifs( self, diff --git a/aeon/utils/numba/general.py b/aeon/utils/numba/general.py index b398a8414b..958c584459 100644 --- a/aeon/utils/numba/general.py +++ b/aeon/utils/numba/general.py @@ -276,7 +276,7 @@ def z_normalise_series_2d_with_mean_std( Parameters ---------- - X : array, shape = (n_channels, n_timestamps) + X : array, shape = (n_channels, n_timepoints) Input array to normalise. mean : array, shape = (n_channels) Mean of each channel of X. @@ -285,7 +285,7 @@ def z_normalise_series_2d_with_mean_std( Returns ------- - arr : array, shape = (n_channels, n_timestamps) + arr : array, shape = (n_channels, n_timepoints) The normalised array """ arr = np.zeros(X.shape) @@ -379,10 +379,10 @@ def get_subsequence( Parameters ---------- - X : array, shape (n_channels, n_timestamps) + X : array, shape (n_channels, n_timepoints) Input time series. 
@njit(cache=True, fastmath=True, parallel=True)
def compute_mean_stds_collection_parallel(X):
    """
    Return the mean and standard deviation for each channel of all series in X.

    Parameters
    ----------
    X : array, shape (n_cases, n_channels, n_timepoints)
        A time series collection. The number of channels is read from the first
        series, while the number of timepoints is read for each series.

    Returns
    -------
    means : array, shape (n_cases, n_channels)
        The mean of each channel of each time series in X.
    stds : array, shape (n_cases, n_channels)
        The std of each channel of each time series in X. The value is left at 0
        when the variance does not exceed ``AEON_NUMBA_STD_THRESHOLD``.

    """
    n_channels = X[0].shape[0]
    n_cases = len(X)
    means = np.zeros((n_cases, n_channels))
    stds = np.zeros((n_cases, n_channels))
    for i_x in prange(n_cases):
        n_timepoints = X[i_x].shape[1]
        # Per-channel running sum and sum of squares for the current series.
        _s = np.zeros(n_channels)
        _s2 = np.zeros(n_channels)
        for i_t in range(n_timepoints):
            for i_c in range(n_channels):
                _s[i_c] += X[i_x][i_c, i_t]
                _s2[i_c] += X[i_x][i_c, i_t] ** 2

        for i_c in range(n_channels):
            means[i_x, i_c] = _s[i_c] / n_timepoints
            # Variance computed as E[x^2] - E[x]^2.
            _var = _s2[i_c] / n_timepoints - means[i_x, i_c] ** 2
            if _var > AEON_NUMBA_STD_THRESHOLD:
                stds[i_x, i_c] = _var**0.5

    return means, stds
""" - n_channels, n_timestamps = X.shape - n_subs = n_timestamps - (length - 1) * dilation + n_channels, n_timepoints = X.shape + n_subs = n_timepoints - (length - 1) * dilation if n_subs <= 0: raise ValueError( "Invalid input parameter for sliding mean and std computations" @@ -496,7 +537,7 @@ def sliding_mean_std_one_series( _sum2 = np.zeros(n_channels) # Initialize first subsequence if it is valid - if np.all(_idx_sub < n_timestamps): + if np.all(_idx_sub < n_timepoints): for i_length in prange(length): _idx_sub[i_length] = (i_length * dilation) + i_mod_dil for i_channel in prange(n_channels): @@ -513,7 +554,7 @@ def sliding_mean_std_one_series( _idx_sub += dilation # As long as subsequences further subsequences are valid - while np.all(_idx_sub < n_timestamps): + while np.all(_idx_sub < n_timepoints): # Update sums and mean stds arrays for i_channel in prange(n_channels): _v_new = X[i_channel, _idx_sub[-1]] @@ -537,17 +578,17 @@ def normalise_subsequences(X_subs: np.ndarray, X_means: np.ndarray, X_stds: np.n Parameters ---------- - X_subs : array, shape (n_timestamps-(length-1)*dilation, n_channels, length) - The subsequences of an input time series of size n_timestamps given the + X_subs : array, shape (n_timepoints-(length-1)*dilation, n_channels, length) + The subsequences of an input time series of size n_timepoints given the length and dilation parameter. - X_means : array, shape (n_channels, n_timestamps-(length-1)*dilation) + X_means : array, shape (n_channels, n_timepoints-(length-1)*dilation) Mean of the subsequences to normalise. - X_stds : array, shape (n_channels, n_timestamps-(length-1)*dilation) + X_stds : array, shape (n_channels, n_timepoints-(length-1)*dilation) Stds of the subsequences to normalise. Returns ------- - array, shape = (n_timestamps-(length-1)*dilation, n_channels, length) + array, shape = (n_timepoints-(length-1)*dilation, n_channels, length) Z-normalised subsequences. 
""" n_subsequences, n_channels, length = X_subs.shape @@ -758,8 +799,8 @@ def get_all_subsequences(X: np.ndarray, length: int, dilation: int) -> np.ndarra Parameters ---------- - X : array, shape = (n_channels, n_timestamps) - An input time series as (n_channels, n_timestamps). + X : array, shape = (n_channels, n_timepoints) + An input time series as (n_channels, n_timepoints). length : int Length of the subsequences to generate. dilation : int @@ -767,11 +808,11 @@ def get_all_subsequences(X: np.ndarray, length: int, dilation: int) -> np.ndarra Returns ------- - array, shape = (n_timestamps-(length-1)*dilation, n_channels, length) + array, shape = (n_timepoints-(length-1)*dilation, n_channels, length) The view of the subsequences of the input time series. """ - n_features, n_timestamps = X.shape + n_features, n_timepoints = X.shape s0, s1 = X.strides - out_shape = (n_timestamps - (length - 1) * dilation, n_features, np.int64(length)) + out_shape = (n_timepoints - (length - 1) * dilation, n_features, np.int64(length)) strides = (s1, s0, s1 * dilation) return np.lib.stride_tricks.as_strided(X, shape=out_shape, strides=strides) From da2758c4fbfb1c9405b4aff1fa12fa14b94522ba Mon Sep 17 00:00:00 2001 From: baraline Date: Thu, 2 Jan 2025 16:12:53 +0100 Subject: [PATCH 09/18] more debug of subsequence tests --- .../subsequence_search/base.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/aeon/similarity_search/subsequence_search/base.py b/aeon/similarity_search/subsequence_search/base.py index 4c26a6a06a..9df76d233b 100644 --- a/aeon/similarity_search/subsequence_search/base.py +++ b/aeon/similarity_search/subsequence_search/base.py @@ -357,6 +357,29 @@ def _find_neighbors( exclusion_factor: Optional[float] = 2.0, ): ... + @classmethod + def _get_test_params(cls, parameter_set: str = "default") -> dict: + """Return testing parameter settings for the estimator. 
+ + Parameters + ---------- + parameter_set : str, default="default" + Name of the set of test parameters to return, for use in tests. If no + special parameters are defined for a value, will return `"default"` set. + For classifiers, a "default" set of parameters should be provided for + general testing, and a "results_comparison" set for comparing against + previously recorded results if the general set does not produce suitable + probabilities to compare against. + + Returns + ------- + params : dict or list of dict, default={} + Parameters to create testing instances of the class. + Each dict are parameters to construct an "interesting" test instance, i.e., + `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. + """ + return {"length": 3} + class BaseMatrixProfile(BaseSubsequenceSearch): """Base class for Matrix Profile methods using a length parameter.""" From 2191ac27c259ab09d9e2d21d267af1a1628b3783 Mon Sep 17 00:00:00 2001 From: baraline Date: Fri, 10 Jan 2025 12:03:53 +0100 Subject: [PATCH 10/18] Add functional LSH neighbors --- aeon/similarity_search/_base.py | 1 - .../series_search/__init__.py | 2 +- .../series_search/{base.py => _base.py} | 85 ++++-- .../similarity_search/series_search/_r_lsh.py | 239 ---------------- .../series_search/_rp_lsh.py | 263 ++++++++++++++++++ .../subsequence_search/__init__.py | 8 +- .../subsequence_search/{base.py => _base.py} | 4 + .../subsequence_search/_brute_force.py | 2 +- .../subsequence_search/_stomp.py | 2 +- .../_mock_similarity_searchers.py | 2 +- aeon/utils/base/_register.py | 7 +- 11 files changed, 339 insertions(+), 276 deletions(-) rename aeon/similarity_search/series_search/{base.py => _base.py} (79%) delete mode 100644 aeon/similarity_search/series_search/_r_lsh.py create mode 100644 aeon/similarity_search/series_search/_rp_lsh.py rename aeon/similarity_search/subsequence_search/{base.py => _base.py} (99%) diff --git a/aeon/similarity_search/_base.py b/aeon/similarity_search/_base.py 
index 476dc664c2..68e35c6cbb 100644 --- a/aeon/similarity_search/_base.py +++ b/aeon/similarity_search/_base.py @@ -26,7 +26,6 @@ class BaseSimilaritySearch(BaseCollectionEstimator): _tags = { "capability:multivariate": True, - "capability:unequal_length": True, "capability:multithreading": True, "fit_is_empty": False, "X_inner_type": ["np-list", "numpy3D"], diff --git a/aeon/similarity_search/series_search/__init__.py b/aeon/similarity_search/series_search/__init__.py index 2f69dab51a..9a618540db 100644 --- a/aeon/similarity_search/series_search/__init__.py +++ b/aeon/similarity_search/series_search/__init__.py @@ -2,4 +2,4 @@ __all__ = ["BaseSeriesSearch", "BaseIndexSearch"] -from aeon.similarity_search.series_search.base import BaseIndexSearch, BaseSeriesSearch +from aeon.similarity_search.series_search._base import BaseIndexSearch, BaseSeriesSearch diff --git a/aeon/similarity_search/series_search/base.py b/aeon/similarity_search/series_search/_base.py similarity index 79% rename from aeon/similarity_search/series_search/base.py rename to aeon/similarity_search/series_search/_base.py index 9f0e4f4ca6..f9c6ed8097 100644 --- a/aeon/similarity_search/series_search/base.py +++ b/aeon/similarity_search/series_search/_base.py @@ -132,11 +132,6 @@ def find_neighbors( self._check_is_fitted() self._check_find_neighbors_motif_format(X) - if self.length != X.shape[1]: - raise ValueError( - f"Expected X to be of shape {(self.n_channels_, self.length)} but" - f" got {X.shape} in find_neighbors." 
# TODO : Add an update method to add series to the index
class BaseIndexSearch(BaseSeriesSearch):
    """
    Base class for similarity search on whole time series using indexes.

    Parameters
    ----------
    normalise : bool, optional
        Whether the inputs should be z-normalised. The default is False.
    n_jobs : int, optional
        Number of parallel jobs to use. The default is 1.
    """

    def _fit(self, X, y=None):
        """Store the collection through the parent ``_fit``, then build the index."""
        super()._fit(X)
        self._build_index()
        return self

    @abstractmethod
    def _build_index(self):
        """Build the index from the data stored during fit."""
        ...

    @abstractmethod
    def _query_index(
        self,
        X,
        k=1,
        inverse_distance=False,
        threshold=np.inf,
    ):
        """Return the indexes and distances of the matches of ``X`` in the index."""
        ...

    @abstractmethod
    def _get_bucket_sizes(self):
        """Return a dict mapping each bucket key to the size of the bucket."""
        ...

    @abstractmethod
    def _get_bucket_content(self, key):
        """Return the content of the bucket identified by ``key``."""
        ...

    def _find_motifs(
        self,
        X: np.ndarray,
        k: Optional[int] = 1,
        threshold: Optional[float] = np.inf,
        X_index: Optional[int] = None,
        inverse_distance: Optional[bool] = False,
    ):
        """
        Return the content of the ``k`` largest buckets of the index.

        ``X``, ``threshold``, ``X_index`` and ``inverse_distance`` are not used by
        this implementation.

        Returns
        -------
        list
            The content of at most ``k`` buckets, from the largest to the smallest.
        list
            A placeholder distance of 0 for each returned bucket.
        """
        bucket_sizes = self._get_bucket_sizes()
        bucket_keys = list(bucket_sizes.keys())
        sizes = np.asarray([bucket_sizes[key] for key in bucket_keys])
        # Positions of the k largest buckets, largest first. The positions index
        # ``bucket_keys``; the bucket content must be fetched with the key itself.
        largest = np.argsort(sizes)[::-1][:k]
        # TODO : review distance return on motif for whole series and buckets
        return [self._get_bucket_content(bucket_keys[i]) for i in largest], [
            0 for _ in largest
        ]

    def _find_neighbors(
        self,
        X: np.ndarray,
        k: Optional[int] = 1,
        threshold: Optional[float] = np.inf,
        inverse_distance: Optional[bool] = False,
        X_index=None,
    ):
        """
        Query the index for the neighbors of ``X``.

        ``X_index`` is not forwarded to ``_query_index``.
        """
        top_k, top_k_dist = self._query_index(
            X, k=k, inverse_distance=inverse_distance, threshold=threshold
        )
        return top_k, top_k_dist
start_points[j] + length], hash_funcs[j]) >= 0 - ) - return res - - -@njit(cache=True, fastmath=True, parallel=True) -def _collection_to_bool(X, hash_funcs, start_points, length): - n_hash_funcs = hash_funcs.shape[0] - n_samples = X.shape[0] - res = np.empty((n_samples, n_hash_funcs), dtype=np.bool_) - for i in prange(n_samples): - for j in prange(n_hash_funcs): - res[i, j] = ( - np.dot(X[i, start_points[j] : start_points[j] + length], hash_funcs[j]) - >= 0 - ) - return res - - -class LSH: - """ - . - - Parameters - ---------- - n_vectors : TYPE - DESCRIPTION. - custom_table : TYPE, optional - DESCRIPTION. The default is None. - - Returns - ------- - None. - - """ - - def __init__(self, n_hash_funcs=128, window_length=1.0, seed=None): - self.n_hash_funcs = n_hash_funcs - self.window_length = window_length - self.seed = seed - - def fit(self, X): - """ - . - - Parameters - ---------- - X : TYPE - DESCRIPTION. - - Returns - ------- - TYPE - DESCRIPTION. - - """ - self.rng_ = np.random.default_rng(self.seed) - self.X_ = np.array( - [X[i].flatten() for i in range(len(X))] - ) # n_samples, n_channels * n_timepoints - - self.window_length_ = max(1, int(self.X_.shape[1] * self.window_length)) - # Can replace with choice [-1, 1] - self.hash_funcs_ = self.rng_.uniform( - low=-1, high=1.0, size=(self.n_hash_funcs, self.window_length_) - ) - self.start_points_ = self.rng_.choice( - self.X_.shape[1] - self.window_length_ + 1, - size=self.n_hash_funcs, - replace=True, - ) - - bool_hashes = _collection_to_bool( - self.X_, self.hash_funcs_, self.start_points_, self.window_length_ - ) - # could yield this - str_hashes = [hash(bool_hashes[i].tobytes()) for i in range(len(bool_hashes))] - self.dict_X_index = {} - self.dict_bool_hashes = {} - for i in range(len(str_hashes)): - if str_hashes[i] in self.dict_X_index: - self.dict_X_index[str_hashes[i]].append(i) - else: - self.dict_X_index[str_hashes[i]] = [i] - self.dict_bool_hashes[str_hashes[i]] = bool_hashes[i] - - 
self.bool_hashes_value_list = np.asarray(list(self.dict_bool_hashes.values())) - self.bool_hashes_key_list = np.asarray(list(self.dict_bool_hashes.keys())) - - return self - - def update(self, X): - """ - . - - Parameters - ---------- - X : TYPE - DESCRIPTION. - - Returns - ------- - TYPE - DESCRIPTION. - - """ - X_ = np.array( - [X[i].flatten() for i in range(len(X))] - ) # n_samples, n_channels * n_timepoints - bool_hashes = _collection_to_bool( - X_, self.hash_funcs_, self.start_points_, self.window_length_ - ) - - str_hashes = [hash(bool_hashes[i].tobytes()) for i in range(len(bool_hashes))] - base_index = self.X_.shape[0] - for i in range(len(str_hashes)): - if str_hashes[i] in self.dict_X_index: - self.dict_X_index[str_hashes[i]].append(i + base_index) - else: - self.dict_X_index[str_hashes[i]] = [i + base_index] - self.dict_bool_hashes[str_hashes[i]] = bool_hashes[i] - self.X_ = np.concatenate((self.X_, X_)) - - self.bool_hashes_value_list = np.asarray(list(self.dict_bool_hashes.values())) - self.bool_hashes_key_list = np.asarray(list(self.dict_bool_hashes.keys())) - return self - - def get_bucket_collection_indexes(self, X): - """ - . - - Parameters - ---------- - X : TYPE - DESCRIPTION. - - Returns - ------- - TYPE - DESCRIPTION. - - """ - bool_hash = _series_to_bool( - X.flatten(), self.hash_funcs_, self.start_points_, self.window_length_ - ) - str_hash = hash(bool_hash.tobytes()) - if str_hash in self.dict_X_index: - return self.dict_X_index[str_hash] - else: - return [] - - def predict(self, X, k=1): - """ - . - - Parameters - ---------- - X : TYPE - DESCRIPTION. - k : TYPE, optional - DESCRIPTION. The default is 1. - - Returns - ------- - top_k : TYPE - DESCRIPTION. 
- - """ - X_ = np.array([X[i].flatten() for i in range(len(X))]) - bool_hashes = _collection_to_bool( - X_, self.hash_funcs_, self.start_points_, self.window_length_ - ) - top_k = np.zeros((len(X), k), dtype=int) - dists = _hamming_dist_matrix(self.bool_hashes_value_list, bool_hashes) - self.h_dists = dists - # Deal with equality by merging bucket contents ? - for i_x in range(len(X)): - ids = np.argsort(dists[i_x]) - _i = 0 - c = k - while c > 0: - candidates = self.dict_X_index[self.bool_hashes_key_list[ids[_i]]] - # Can do exact search by computing distances here - if len(candidates) > c: - candidates = candidates[:c] - top_k[i_x, k - c : k - c + len(candidates)] = candidates - c -= len(candidates) - _i += 1 - return top_k - - def find_motif(Index, X=None): - """ - . - - Parameters - ---------- - Index : TYPE - DESCRIPTION. - X : TYPE, optional - DESCRIPTION. The default is None. - - Returns - ------- - None. - - """ - pass diff --git a/aeon/similarity_search/series_search/_rp_lsh.py b/aeon/similarity_search/series_search/_rp_lsh.py new file mode 100644 index 0000000000..8d1701793e --- /dev/null +++ b/aeon/similarity_search/series_search/_rp_lsh.py @@ -0,0 +1,263 @@ +"""Random projection LSH index.""" + +import numpy as np +from numba import njit, prange + +from aeon.similarity_search.series_search._base import BaseIndexSearch + +# TPB = 16 + + +@njit(cache=True) +def _hamming_dist(X, Y): + d = 0 + for i in prange(X.shape[0]): + d += X[i] ^ Y[i] + return d + + +@njit(cache=True, parallel=True) +def _hamming_dist_matrix(bool_hashes_value_list, bool_hashes): + n_hash_funcs = bool_hashes.shape[0] + res = np.zeros((n_hash_funcs, bool_hashes_value_list.shape[0]), dtype=np.int64) + for i in prange(n_hash_funcs): + for j in range(bool_hashes_value_list.shape[0]): + res[i, j] = _hamming_dist(bool_hashes_value_list[j], bool_hashes[i]) + return res + + +@njit(cache=True, fastmath=True, parallel=True) +def _series_to_bool(X, hash_funcs, start_points, length): + 
n_hash_funcs = hash_funcs.shape[0] + res = np.empty(n_hash_funcs, dtype=np.bool_) + for j in prange(n_hash_funcs): + res[j] = _nb_flat_dot( + X[start_points[j] : start_points[j] + length], hash_funcs[j] + ) + return res + + +@njit(cache=True, fastmath=True) +def _nb_flat_dot(X, Y): + n_channels, n_timepoints = X.shape + out = 0 + for i in prange(n_channels): + for j in prange(n_timepoints): + out += X[i, j] * Y[i, j] + return out >= 0 + + +@njit(cache=True, parallel=True) +def _collection_to_bool(X, hash_funcs, start_points, length): + n_hash_funcs = hash_funcs.shape[0] + n_samples = X.shape[0] + res = np.empty((n_samples, n_hash_funcs), dtype=np.bool_) + for j in prange(n_hash_funcs): + for i in range(n_samples): + res[i, j] = _nb_flat_dot( + X[i, :, start_points[j] : start_points[j] + length], hash_funcs[j] + ) + + return res + + +class RP_LSH_Cosine(BaseIndexSearch): + """ + Random Projection Locality Sensitive Hashing index for cosine similarity. + + In this method based on SimHash, we define a hash function as a boolean operation + such as, given a random vector ``V`` of shape ``(n_channels, L)`` and a time series + ``X`` of shape ``(n_channels, n_timeponts)`` (with ``L<=n_timepoints``), we compute + ``X.V > 0`` to obtain the boolean result. 
+ In the case where ``L>> from aeon.datasets import load_classification + >>> from aeon.similarity_search.series_search import RP_LSH_Cosine + >>> index = RP_LSH_Cosine() + >>> X, y = load_classification("ArrowHead") + >>> index.fit(X[:200]) + >>> r = index.find_neighbors(X[201]) + """ + + _tags = { + "capability:unequal_length": False, + } + + def __init__( + self, + n_hash_funcs=128, + hash_func_coverage=0.25, + use_discrete_vectors=True, + random_state=None, + normalise=False, + n_jobs=1, + ): + self.n_hash_funcs = n_hash_funcs + self.hash_func_coverage = hash_func_coverage + self.use_discrete_vectors = use_discrete_vectors + self.random_state = random_state + super().__init__(normalise=normalise, n_jobs=n_jobs) + + def _build_index(self): + """ + Build the index based on the data stored in X_. + + Returns + ------- + self + + """ + rng = np.random.default_rng(self.random_state) + n_timepoints = self.X_.shape[2] + self.window_length_ = max(1, int(n_timepoints * self.hash_func_coverage)) + + if self.use_discrete_vectors: + self.hash_funcs_ = rng.choice( + [-1, 1], size=(self.n_hash_funcs, self.n_channels_, self.window_length_) + ) + else: + self.hash_funcs_ = rng.uniform( + low=-1, + high=1.0, + size=(self.n_hash_funcs, self.n_channels_, self.window_length_), + ) + self.start_points_ = rng.choice( + n_timepoints - self.window_length_ + 1, + size=self.n_hash_funcs, + replace=True, + ) + + bool_hashes = _collection_to_bool( + self.X_, self.hash_funcs_, self.start_points_, self.window_length_ + ) + + str_hashes = [hash(bool_hashes[i].tobytes()) for i in range(len(bool_hashes))] + self.dict_X_index_ = {} + self.dict_bool_hashes_ = {} + for i in range(len(str_hashes)): + if str_hashes[i] in self.dict_X_index_: + self.dict_X_index_[str_hashes[i]].append(i) + else: + self.dict_X_index_[str_hashes[i]] = [i] + self.dict_bool_hashes_[str_hashes[i]] = bool_hashes[i] + + self.bool_hashes_value_list_ = np.asarray(list(self.dict_bool_hashes_.values())) + 
self.bool_hashes_key_list_ = np.asarray(list(self.dict_bool_hashes_.keys())) + + return self + + def _get_bucket_content(self, key): + return self.dict_X_index_[key] + + def _get_bucket_sizes(self): + return {key: len(self.dict_X_index_[key]) for key in self.dict_X_index_} + + def _get_series_bucket(self, X): + """ + Get the matching bucket of a single series X if it exists in the index. + + Parameters + ---------- + X : TYPE + DESCRIPTION. + + Returns + ------- + TYPE + DESCRIPTION. + + """ + bool_hash = _series_to_bool( + X, self.hash_funcs_, self.start_points_, self.window_length_ + ) + str_hash = hash(bool_hash.tobytes()) + if str_hash in self.dict_X_index_: + return str_hash + else: + return None + + def _query_index( + self, + X, + k=1, + threshold=np.inf, + inverse_distance=False, + ): + """ + Find approximate nearest neighbors of a collection in the index. + + Parameters + ---------- + X : np.ndarray, shape = (n_channels, n_tiempoints) + Series for which we want to find neighbors. + k : int, optional + Number of neighbors to return for each series. The default is 1. + threshold : int, optional + A threshold on the distance to determine which candidates will be returned. + inverse_distance : bool, optional + Wheter to inverse the computed distance, meaning that the method will return + the k most dissimilar neighbors instead of the k most similar. + + Returns + ------- + top_k : np.ndarray, shape = (n_cases, k) + Indexes of k series in X_ (the index) that are close to each series in X. + top_k_dist : np.ndarray, shape = (n_cases, k) + Distance of k series in X_ (the index) to each series in X. The distance + is the hamming distance between the result of each hash function. 
+ """ + bool_hashes = _series_to_bool( + X, self.hash_funcs_, self.start_points_, self.window_length_ + ) + top_k = np.zeros(k, dtype=int) + top_k_dist = np.zeros(k, dtype=float) + dists = _hamming_dist_matrix( + self.bool_hashes_value_list_, bool_hashes[np.newaxis, :] + )[0] + if inverse_distance: + dists = 1 / (dists + 1e-8) + # Get top k buckets + ids = np.argpartition(dists, kth=k)[:k] + # and reoder them + ids = ids[np.argsort(dists[ids])] + + _i_bucket = 0 + current_k = 0 + while current_k < k: + if dists[ids[_i_bucket]] <= threshold: + candidates = self.dict_X_index_[ + self.bool_hashes_key_list_[ids[_i_bucket]] + ] + # Can do exact search by computing distances here + if len(candidates) > k - current_k: + candidates = candidates[: k - current_k] + top_k[current_k : current_k + len(candidates)] = candidates + top_k_dist[current_k : current_k + len(candidates)] = dists[ + ids[_i_bucket] + ] + current_k += len(candidates) + else: + break + _i_bucket += 1 + return top_k[:current_k], top_k_dist[:current_k] diff --git a/aeon/similarity_search/subsequence_search/__init__.py b/aeon/similarity_search/subsequence_search/__init__.py index 5d64f901bd..eb062c46b8 100644 --- a/aeon/similarity_search/subsequence_search/__init__.py +++ b/aeon/similarity_search/subsequence_search/__init__.py @@ -7,11 +7,11 @@ "BruteForceMatrixProfile", ] +from aeon.similarity_search.subsequence_search._base import ( + BaseMatrixProfile, + BaseSubsequenceSearch, +) from aeon.similarity_search.subsequence_search._brute_force import ( BruteForceMatrixProfile, ) from aeon.similarity_search.subsequence_search._stomp import StompMatrixProfile -from aeon.similarity_search.subsequence_search.base import ( - BaseMatrixProfile, - BaseSubsequenceSearch, -) diff --git a/aeon/similarity_search/subsequence_search/base.py b/aeon/similarity_search/subsequence_search/_base.py similarity index 99% rename from aeon/similarity_search/subsequence_search/base.py rename to 
aeon/similarity_search/subsequence_search/_base.py index 9df76d233b..0c55881ad0 100644 --- a/aeon/similarity_search/subsequence_search/base.py +++ b/aeon/similarity_search/subsequence_search/_base.py @@ -44,6 +44,10 @@ def __init__( self.length = length super().__init__(n_jobs=n_jobs, normalise=normalise) + _tags = { + "capability:unequal_length": True, + } + @final def find_motifs( self, diff --git a/aeon/similarity_search/subsequence_search/_brute_force.py b/aeon/similarity_search/subsequence_search/_brute_force.py index 2225fbd0ad..269cdc369b 100644 --- a/aeon/similarity_search/subsequence_search/_brute_force.py +++ b/aeon/similarity_search/subsequence_search/_brute_force.py @@ -9,11 +9,11 @@ from numba import njit, prange from numba.typed import List +from aeon.similarity_search.subsequence_search._base import BaseMatrixProfile from aeon.similarity_search.subsequence_search._commons import ( _extract_top_k_from_dist_profile, _inverse_distance_profile_list, ) -from aeon.similarity_search.subsequence_search.base import BaseMatrixProfile from aeon.utils.numba.general import ( get_all_subsequences, z_normalise_series_2d, diff --git a/aeon/similarity_search/subsequence_search/_stomp.py b/aeon/similarity_search/subsequence_search/_stomp.py index a1d6ac8730..66d5270872 100644 --- a/aeon/similarity_search/subsequence_search/_stomp.py +++ b/aeon/similarity_search/subsequence_search/_stomp.py @@ -7,13 +7,13 @@ from numba import njit, prange from numba.typed import List +from aeon.similarity_search.subsequence_search._base import BaseMatrixProfile from aeon.similarity_search.subsequence_search._commons import ( _extract_top_k_from_dist_profile, _inverse_distance_profile_list, fft_sliding_dot_product, get_ith_products, ) -from aeon.similarity_search.subsequence_search.base import BaseMatrixProfile from aeon.utils.numba.general import ( AEON_NUMBA_STD_THRESHOLD, sliding_mean_std_one_series, diff --git a/aeon/testing/mock_estimators/_mock_similarity_searchers.py 
b/aeon/testing/mock_estimators/_mock_similarity_searchers.py index 5362137ba9..1d3161514a 100644 --- a/aeon/testing/mock_estimators/_mock_similarity_searchers.py +++ b/aeon/testing/mock_estimators/_mock_similarity_searchers.py @@ -8,7 +8,7 @@ import numpy as np -from aeon.similarity_search.subsequence_search.base import ( +from aeon.similarity_search.subsequence_search._base import ( BaseMatrixProfile, BaseSubsequenceSearch, ) diff --git a/aeon/utils/base/_register.py b/aeon/utils/base/_register.py index 9c576c350a..024ad447ee 100644 --- a/aeon/utils/base/_register.py +++ b/aeon/utils/base/_register.py @@ -24,13 +24,12 @@ from aeon.forecasting.base import BaseForecaster from aeon.regression.base import BaseRegressor from aeon.segmentation.base import BaseSegmenter -from aeon.similarity_search.subsequence_search.base import BaseSubsequenceSearch +from aeon.similarity_search.series_search._base import BaseSeriesSearch +from aeon.similarity_search.subsequence_search._base import BaseSubsequenceSearch from aeon.transformations.base import BaseTransformer from aeon.transformations.collection import BaseCollectionTransformer from aeon.transformations.series import BaseSeriesTransformer -# from aeon.similarity_search.series_search.base import BaseSeriesSearch - # all base classes BASE_CLASS_REGISTER = { # abstract - no estimator directly inherits from these @@ -49,7 +48,7 @@ "series-transformer": BaseSeriesTransformer, "forecaster": BaseForecaster, "subsequence_searcher": BaseSubsequenceSearch, - # "series_searcher": BaseSeriesSearch, + "series_searcher": BaseSeriesSearch, } # base classes which are valid for estimator to directly inherit from From cd33d0a02ea98e9364c7e766f1a5c3520795e388 Mon Sep 17 00:00:00 2001 From: baraline Date: Mon, 13 Jan 2025 10:43:09 +0100 Subject: [PATCH 11/18] add notebook for sim search tasks --- .../subsequence_search/_base.py | 2 + .../similarity_search_tasks.ipynb | 136 ++++++++++++++++++ 2 files changed, 138 insertions(+) create mode 
100644 examples/similarity_search/similarity_search_tasks.ipynb diff --git a/aeon/similarity_search/subsequence_search/_base.py b/aeon/similarity_search/subsequence_search/_base.py index 0c55881ad0..a8d78b029d 100644 --- a/aeon/similarity_search/subsequence_search/_base.py +++ b/aeon/similarity_search/subsequence_search/_base.py @@ -19,6 +19,8 @@ # We can define a BaseVariableLengthSubsequenceSearch later for VALMOD and the likes. +# BaseSubSeries 'replace sub by series' + class BaseSubsequenceSearch(BaseSimilaritySearch): """ diff --git a/examples/similarity_search/similarity_search_tasks.ipynb b/examples/similarity_search/similarity_search_tasks.ipynb new file mode 100644 index 0000000000..a42339c611 --- /dev/null +++ b/examples/similarity_search/similarity_search_tasks.ipynb @@ -0,0 +1,136 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2347de94-27a7-486e-a900-e80db5c7f427", + "metadata": {}, + "source": [ + "# Similarity search tasks\n", + "\n", + "To discuss : the term subsequences appear more often than subseries in similarity search papers, so maybe stick to subsequences ?\n", + "\n", + "## Notations\n", + "- A single time point $x \\in \\mathbb{R}^{d}$ representing a vector of size $d$, with $d$ the number of channels\n", + "- A single time series $X \\in \\mathbb{R}^{d,m}$ of $d$ channels and $m$ timepoints\n", + "- A collection ${\\cal X} \\in \\mathbb{R}^{n,d,m}$ of $n$ time series \n", + "- $l$ a length parameter for subseries extracted using a sliding window on a time series $X$ over its timepoints\n", + "- $W_{i,j} \\in \\mathbb{R}^{d,l}$ a subseries extracted from a collection ${\\cal X}$, with $i$ the sample id and $j$ the starting timepoint, such as $W_{i,j} = X_{i,[j:j+l[}$. Denoted $W_{j}$ if used outside of the context of a collection. 
${\\cal W}$ denotes the set of all admissible subseries.\n", + " \n", + "## Series tasks\n", + "Given a single series $X$, we want to be able to do the following tasks :\n", + "\n", + "#### Subseries Neighbor search:\n", + "$K$-nn based and/or range ($r$) based search (radius only for now, extent necessary for [k-Motiflefts](https://www.vldb.org/pvldb/vol16/p725-schafer.pdf) ?). Given a series $X$ and a subseries $W_i$, find the other subseries in $X$ that are the most similar to $W_i$. In terms of parameterization, we want to be able to toggle on/off :\n", + "- ignore neighboring matches. Given $W_j$ a neighboring subseries of $W_i$, the subseries $W_{j-l//\\epsilon}, ..., W_{j+l//\\epsilon}$ cannot be in the returned set.\n", + "- inverse distance. Return the worst matches instead of the best ones.\n", + "- normalize. Wheter subseries should be normalized prior to distance computations\n", + "\n", + "#### Subseries Motif search :\n", + "Extract $k$-motifs or range $r$-motifs.\n", + "\n", + "The $k^{th}$ motif is the $k^{th}$ most similar pair of subseries in $X$. Given $\\forall a,b,i,j$ the pair ${W_i, W_j}$ is the motif if $dist(W_i, W_j) ≤ dist(W_a, W_b), i \\neq j$ and $a \\neq b$\n", + "\n", + "For the $r$-motif,: $S$ is a maximal set of subseries with range $r$ if $\\forall\\ W_i,W_j \\in S,\\ dist(W_i, W_j) \\leq 2r$ and $\\forall\\ W_a \\in {\\cal W}-S,\\ dist(W_a, W_i) > 2r$\n", + "\n", + "\n", + "#### Compute self distance profile\n", + "Given a subseries $W_i$, compute the self distance profile to $X$. Returns a vector of size $m-l+1$ containing the distance to all subseries. \n", + "\n", + "In terms of parameterization, we want to be able to toggle on/off :\n", + "- ignore neighboring matches. Given $W_j$ a neighboring subseries of $W_i$, the subseries $W_{j-l//\\epsilon}, ..., W_{j+l//\\epsilon}$ cannot be in the returned set.\n", + "- inverse distance. Return the worst matches instead of the best ones.\n", + "- normalize. 
Wheter subseries should be normalized prior to distance computations\n", + "\n", + "\n", + "#### Compute self matrix profile\n", + "Given a series $X$ and a length parameter $l$, compute its self matrix profile. Returns a vector of size $m-l+1$ containing the distances to the best matches of each subseries $W_i$, and another vector of size $m-l+1$ containg the timestamp of the best matches in $X$ for each subseries. Implement it as A/B matrix profile with B=A.\n", + "\n", + "In terms of parameterization, we want to be able to toggle on/off :\n", + "- $k$ : number of best matches to return for each subseries in $X$\n", + "- $r$ : maximal distance of the best matches to be in the returned set for each subseries in $X$\n", + "- ignore neighboring matches. Given $W_j$ a neighboring subseries of $W_i$, the subseries $W_{j-l//\\epsilon}, ..., W_{j+l//\\epsilon}$ cannot be in the returned set.\n", + "- inverse distance. Return the worst matches instead of the best ones.\n", + "- normalize. Wheter subseries should be normalized prior to distance computations\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "7313599d-66e2-4d03-959e-bd0abe05baed", + "metadata": {}, + "source": [ + "\n", + "## Collection tasks\n", + "Given a time series collection $\\cal X$, we want to be able to do the following tasks :\n", + "(we consider all subseries $W_{i,j}$ part its of $\\cal X$ due to notation but doesn't have to be when given as inputs for example in neighbor search).\n", + "\n", + "#### Subseries Neighbor search :\n", + "$K$-nn based and/or range ($r$) based search (radius only for now, extent necessary for [k-Motiflefts](https://www.vldb.org/pvldb/vol16/p725-schafer.pdf) ?). Given a subseries $W_{i,j}$, find the other subseries in $\\cal X$ that are the most similar to $W_{i,j}$. In terms of parameterization, we want to be able to toggle on/off :\n", + "- ignore neighboring matches. 
Given ${W_a,b}$ a neighboring subseries of $W_{i,j}$, the subseries $W_{a, b-l//\\epsilon}, ..., W_{a,b+l//\\epsilon}$ cannot be in the returned set.\n", + "- inverse distance. Return the worst matches instead of the best ones.\n", + "- normalize. Wheter subseries should be normalized prior to distance computations\n", + "\n", + "#### Series Neighbor search :\n", + "$K$-nn based and/or range ($r$) based search. Given a series $X_i$, find the other series in $\\cal X$ that are the most similar to $X_i$. In terms of parameterization, we want to be able to toggle on/off :\n", + "- inverse distance. Return the worst matches instead of the best ones.\n", + "- normalize. Wheter subseries should be normalized prior to distance computations\n", + "\n", + "#### Subseries Motif search :\n", + "Extract $k$-motifs or range $r$-motifs.\n", + "\n", + "The $k^{th}$ motif is the $k^{th}$ most similar pair of subseries in $X$. Given $\\forall a,b,a^\\prime,b^\\prime,i,j,i^\\prime,j^\\prime$ the pair $(W_{i,j}, W_{i^\\prime,j^\\prime})$ is the motif if $dist(W_{i,j}, W_{i^\\prime,j^\\prime}) ≤ dist(W_{a,b}, W_{a^\\prime,b^\\prime}), i \\neq i^\\prime, j \\neq j^\\prime, a \\neq a^\\prime, b \\neq b^\\prime$.\n", + "\n", + "For the $r$-motif,: $S$ is a maximal set of subseries with range $r$ if $\\forall\\ (W_{i,j},W_{i^\\prime,j^\\prime}) \\in S,\\ dist(W_{i,j}, W_{i^\\prime,j^\\prime}) \\leq 2r$ and $\\forall\\ W_{a,b} \\in {\\cal W}-S,\\ dist(W_{i,j}, W_{a,b}) > 2r$\n", + "\n", + "\n", + "#### Compute distance profiles :\n", + "Given a subseries $W_{i,j}$, compute the distance profiles to all series in $\\cal X$. Returns a vector of size $n, m-l+1$ containing the distance to all subseries. \n", + "\n", + "In terms of parameterization, we want to be able to toggle on/off :\n", + "- ignore neighboring matches. Given $W_{i,b}$ a neighboring subseries of $W_{i,j}$ the subseries $W_{i,b-l//\\epsilon}, ..., W_{i,b+l//\\epsilon}$ cannot be in the returned set.\n", + "- inverse distance. 
Return the worst matches instead of the best ones.\n", + "- normalize. Wheter subseries should be normalized prior to distance computations.\n", + "\n", + "\n", + "#### Compute matrix profiles :\n", + "Given a series $X_i \\in {\\cal X}$ and a length parameter $l$, compute its matrix profile over the collection. Returns a vector of size $m-l+1$ containing the distances to the best matches of each subseries $W_{i,j}$, and another vector of size $m-l+1$ containg the timestamp of the best matches in ${\\cal X}$ for each subseries.\n", + "\n", + "In terms of parameterization, we want to be able to toggle on/off :\n", + "- $k$ : number of best matches to return for each subseries in $X$\n", + "- $r$ : maximal distance of the best matches to be in the returned set for each subseries in $X$\n", + "- ignore neighboring matches. Given $W_{a,b}$ a neighbor of subseries $W_{i,j}$ the subseries $W_{a,b-l//\\epsilon}, ..., W_{a,b+l//\\epsilon}$ cannot be in the returned set.\n", + "- inverse distance. Return the worst matches instead of the best ones.\n", + "- normalize. 
Wheter subseries should be normalized prior to distance computations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be1430f0-dce0-4de4-b702-11ee5e33f462", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (Spyder)", + "language": "python3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From b841b79796e4f7214fd0bb2dc4c381c42cfe554b Mon Sep 17 00:00:00 2001 From: baraline Date: Thu, 16 Jan 2025 09:55:29 +0100 Subject: [PATCH 12/18] Updated series similarity search --- aeon/similarity_search/_base.py | 252 ++----- aeon/similarity_search/collection/__init__.py | 1 + aeon/similarity_search/collection/_base.py | 96 +++ .../collection/motifs/__init__.py | 1 + .../collection/neighbors/__init__.py | 1 + .../neighbors/_rp_cosine_lsh.py} | 0 aeon/similarity_search/series/__init__.py | 7 + aeon/similarity_search/series/_base.py | 115 ++++ aeon/similarity_search/series/_commons.py | 210 ++++++ .../series/motifs/__init__.py | 7 + .../similarity_search/series/motifs/_stomp.py | 457 +++++++++++++ .../series/motifs/tests/__init__.py | 1 + .../series/motifs/tests/test_stomp.py | 149 +++++ .../series/neighbors/__init__.py | 9 + .../series/neighbors/_dummy.py | 159 +++++ .../series/neighbors/_mass.py | 247 +++++++ .../series/neighbors/tests/__init__.py | 1 + .../series/neighbors/tests/test_dummy.py | 40 ++ .../series/neighbors/tests/test_mass.py | 44 ++ .../series/tests/__init__.py | 1 + .../tests/test_base.py | 31 +- .../series/tests/test_commons.py | 174 +++++ .../series_search/__init__.py | 5 - aeon/similarity_search/series_search/_base.py | 264 -------- .../subsequence_search/__init__.py | 17 - 
.../subsequence_search/_base.py | 509 -------------- .../subsequence_search/_brute_force.py | 319 --------- .../subsequence_search/_commons.py | 170 ----- .../subsequence_search/_stomp.py | 619 ------------------ .../subsequence_search/tests/__init__.py | 1 - .../tests/test_brute_force.py | 172 ----- .../subsequence_search/tests/test_commons.py | 97 --- .../subsequence_search/tests/test_stomp.py | 332 ---------- .../_mock_similarity_searchers.py | 81 +-- aeon/transformations/collection/base.py | 4 +- .../similarity_search_tasks.ipynb | 2 +- 36 files changed, 1793 insertions(+), 2802 deletions(-) create mode 100644 aeon/similarity_search/collection/__init__.py create mode 100644 aeon/similarity_search/collection/_base.py create mode 100644 aeon/similarity_search/collection/motifs/__init__.py create mode 100644 aeon/similarity_search/collection/neighbors/__init__.py rename aeon/similarity_search/{series_search/_rp_lsh.py => collection/neighbors/_rp_cosine_lsh.py} (100%) create mode 100644 aeon/similarity_search/series/__init__.py create mode 100644 aeon/similarity_search/series/_base.py create mode 100644 aeon/similarity_search/series/_commons.py create mode 100644 aeon/similarity_search/series/motifs/__init__.py create mode 100644 aeon/similarity_search/series/motifs/_stomp.py create mode 100644 aeon/similarity_search/series/motifs/tests/__init__.py create mode 100644 aeon/similarity_search/series/motifs/tests/test_stomp.py create mode 100644 aeon/similarity_search/series/neighbors/__init__.py create mode 100644 aeon/similarity_search/series/neighbors/_dummy.py create mode 100644 aeon/similarity_search/series/neighbors/_mass.py create mode 100644 aeon/similarity_search/series/neighbors/tests/__init__.py create mode 100644 aeon/similarity_search/series/neighbors/tests/test_dummy.py create mode 100644 aeon/similarity_search/series/neighbors/tests/test_mass.py create mode 100644 aeon/similarity_search/series/tests/__init__.py rename 
aeon/similarity_search/{subsequence_search => series}/tests/test_base.py (76%) create mode 100644 aeon/similarity_search/series/tests/test_commons.py delete mode 100644 aeon/similarity_search/series_search/__init__.py delete mode 100644 aeon/similarity_search/series_search/_base.py delete mode 100644 aeon/similarity_search/subsequence_search/__init__.py delete mode 100644 aeon/similarity_search/subsequence_search/_base.py delete mode 100644 aeon/similarity_search/subsequence_search/_brute_force.py delete mode 100644 aeon/similarity_search/subsequence_search/_commons.py delete mode 100644 aeon/similarity_search/subsequence_search/_stomp.py delete mode 100644 aeon/similarity_search/subsequence_search/tests/__init__.py delete mode 100644 aeon/similarity_search/subsequence_search/tests/test_brute_force.py delete mode 100644 aeon/similarity_search/subsequence_search/tests/test_commons.py delete mode 100644 aeon/similarity_search/subsequence_search/tests/test_stomp.py diff --git a/aeon/similarity_search/_base.py b/aeon/similarity_search/_base.py index 68e35c6cbb..fc58838eb4 100644 --- a/aeon/similarity_search/_base.py +++ b/aeon/similarity_search/_base.py @@ -1,176 +1,87 @@ """Base class for similarity search.""" __maintainer__ = ["baraline"] +__all__ = [ + "BaseSimilaritySearch", +] + from abc import abstractmethod -from typing import Optional, Union, final +from typing import Union import numpy as np -from numba import get_num_threads, set_num_threads from numba.typed import List -from aeon.base import BaseCollectionEstimator - +from aeon.base import BaseAeonEstimator -class BaseSimilaritySearch(BaseCollectionEstimator): - """ - Base class for similarity search applications. - Parameters - ---------- - normalise : bool, optional - Whether the inputs should be z-normalised. The default is False. - n_jobs : int, optional - Number of parallel jobs to use. The default is 1. 
- """ +class BaseSimilaritySearch(BaseAeonEstimator): + """Base class for similarity search applications.""" _tags = { + "requires_y": False, "capability:multivariate": True, - "capability:multithreading": True, "fit_is_empty": False, - "X_inner_type": ["np-list", "numpy3D"], } @abstractmethod - def __init__( - self, - normalise: Optional[bool] = False, - n_jobs: Optional[int] = 1, - ): - self.n_jobs = n_jobs - self.normalise = normalise + def __init__(self): super().__init__() - @final + @abstractmethod def fit( self, X: Union[np.ndarray, List], y=None, ): """ - Fit method: data preprocessing and storage. + Fit estimator to X. + + State change: + Changes state to "fitted". + + Writes to self: + _is_fitted : flag is set to True. Parameters ---------- - X : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) - Input array to be used as database for the similarity search. If it is an - unequal length collection, it should be a list of 2d numpy arrays. - y : optional - Not used. - - Raises - ------ - TypeError - If the input X array is not 3D raise an error. + X : Series or Collection, any supported type + Data to fit transform to, of python type as follows: + Series: 2D np.ndarray shape (n_channels, n_timepoints) + Collection: 3D np.ndarray shape (n_cases, n_channels, n_timepoints) + or list of 2D np.ndarray, case i has shape (n_channels, n_timepoints_i) + y: ignored, exists for API consistency reasons. 
Returns ------- - self + self : a fitted instance of the estimator """ - self.reset() - prev_threads = get_num_threads() - self._check_fit_format(X) - X = self._preprocess_collection(X) - # Store minimum number of n_timepoints for unequal length collections - self.min_timepoints_ = min([X[i].shape[-1] for i in range(len(X))]) - self.n_channels_ = X[0].shape[0] - self.n_cases_ = len(X) - if self.metadata_["unequal_length"]: - X = List(X) - set_num_threads(self._n_jobs) - self._fit(X, y) - set_num_threads(prev_threads) - self.is_fitted = True - return self + ... @abstractmethod - def find_motifs( + def predict( self, - X: np.ndarray, - k: int, - threshold: float, + X: Union[np.ndarray, None] = None, ): """ - Find the top-k motifs in the training data. - - Given ``k`` and ``threshold`` parameters, this methods returns the top-k motif - sets. We define a motif set as a set of candidates which all are at a distance - of at most ``threshold`` from each other. The top-k motifs sets are the - motif sets with the most candidates. + Predict method. Parameters ---------- - X : np.ndarray, - A series in which we want to indentify motifs. - k : int, optional - Number of motifs to return - threshold : int, optional - A threshold on the similarity measure to determine which candidates will be - part of a motif set. - - Returns - ------- - ndarray, shape=(k,) - A numpy array of at most ``k`` elements containing the indexes of the - motifs. - ndarray, shape=(k,) - A numpy array of at most ``k`` elements containing the distances of the - motifs to . - + X : 2D np.array of shape ``(n_cases, n_timepoints)`` + Optional data to use for predict. """ ... - @abstractmethod - def find_neighbors( - self, - X: np.ndarray, - k: Optional[int] = 1, - threshold: Optional[float] = np.inf, - ): + def _check_predict_series_format(self, X): """ - Find the top-k neighbors of X in the database. 
- - Given ``k`` and ``threshold`` parameters, this methods returns the top-k - neighbors of X, such as each of the ``k`` neighbors as a distance inferior or - equal to ``threshold``. By default, ``threshold`` is set to infinity. It is - possible for this method to return less than ``k`` neighbors, either if there - is less than ``k`` admissible candidate in the database, or if in the top-k - candidates, some do not meet the ``threshold`` condition. + Check wheter a series X in predict is correctly formated. Parameters ---------- - X: np.ndarray - The query for which we want to identify nearest neighbors in the database. - k : int, optional - Number of neighbors to return. - threshold : int, optional - A threshold on the distance to determine which candidates will be returned. - - Returns - ------- - ndarray, shape=(k,) - A numpy array of at most ``k`` elements containing the indexes of the - candidates in each motif. - + X : np.ndarray, shape = (n_channels, n_timepoints) + A series to be used in predict. """ - ... - - def _check_fit_format(self, X): - if isinstance(X, np.ndarray): # "numpy3D" or numpy2D - if X.ndim != 3: - raise TypeError( - f"A np.ndarray given in fit must be 3D but found {X.ndim}D" - ) - elif isinstance(X, list): # np-list or df-list - if isinstance(X[0], np.ndarray): # if one a numpy they must all be 2D numpy - for a in X: - if not (isinstance(a, np.ndarray) and a.ndim == 2): - raise TypeError( - "A np-list given in fit must contain 2D np.ndarray but" - f" found {a.ndim}D" - ) - - def _check_find_neighbors_motif_format(self, X): if isinstance(X, np.ndarray): if X.ndim != 2: raise TypeError( @@ -186,91 +97,8 @@ def _check_find_neighbors_motif_format(self, X): f"Expected X to have {self.n_channels_} channels but" f" got {X.shape[0]} channels." ) - - @abstractmethod - def _fit(self, X, y=None): ... - - def _check_X_index_int(self, X_index: int): - """ - Check wheter the X_index parameter is correctly formated and is admissible. 
- - This check is made for motif search functions. - - Parameters - ---------- - X_index : int - Index of a series in X_. - - Returns - ------- - X_index : int - Index of a series in X_ - - """ - if X_index is not None: - if not isinstance(X_index, int): - raise TypeError("Expected an integer for X_index but got {X_index}") - - if X_index >= self.n_cases_ or X_index < 0: - raise ValueError( - "The value of X_index cannot exced the number " - "of series in the collection given during fit. Expected a value " - f"between [0, {self.n_cases_ - 1}] but got {X_index}" - ) - return X_index - - def _check_X_index_array(self, X_index: np.ndarray): - """ - Check wheter the X_index parameter is correctly formated and is admissible. - - This check is made for neighbour search functions. - - Parameters - ---------- - X_index : np.ndarray, 1D array of shape (2) - Array of integer containing the sample and timestamp identifiers of the - starting point of a subsequence in X_. - - Returns - ------- - X_index : np.ndarray, 1D array of shape (2) - Array of integer containing the sample and timestamp identifiers of the - starting point of a subsequence in X_. - - """ - if X_index is not None: - if ( - isinstance(X_index, list) - and len(X_index) == 2 - and isinstance(X_index[0], int) - and isinstance(X_index[1], int) - ): - X_index = np.asarray(X_index, dtype=int) - elif len(X_index) != 2: - raise TypeError( - "Expected a numpy array or list of integers with 2 elements " - f"for X_index but got {X_index}" - ) - elif ( - not (isinstance(X_index[0], int) or not isinstance(X_index[1], int)) - or X_index.dtype != int - ): - raise TypeError( - "Expected a numpy array or list of integers for X_index but got " - f"{X_index}" - ) - - if X_index[0] >= self.n_cases_ or X_index[0] < 0: - raise ValueError( - "The sample ID (first element) of X_index cannot exced the number " - "of series in the collection given during fit. 
Expected a value " - f"between [0, {self.n_cases_ - 1}] but got {X_index[0]}" - ) - _max_timestamp = self.X_[X_index[0]].shape[1] - self.length + 1 - if X_index[1] >= _max_timestamp: - raise ValueError( - "The timestamp ID (second element) of X_index cannot exced the " - "number of timestamps minus the length parameter plus one. Expected" - f" a value between [0, {_max_timestamp - 1}] but got {X_index[1]}" - ) - return X_index + if hasattr(self, "length") and X.shape[1] != self.length: + raise ValueError( + f"Expected X to have {self.length} timepoints but" + f" got {X.shape[1]} timepoints." + ) diff --git a/aeon/similarity_search/collection/__init__.py b/aeon/similarity_search/collection/__init__.py new file mode 100644 index 0000000000..0aef46ef49 --- /dev/null +++ b/aeon/similarity_search/collection/__init__.py @@ -0,0 +1 @@ +"""Similarity search for time series collection.""" diff --git a/aeon/similarity_search/collection/_base.py b/aeon/similarity_search/collection/_base.py new file mode 100644 index 0000000000..402b7342a2 --- /dev/null +++ b/aeon/similarity_search/collection/_base.py @@ -0,0 +1,96 @@ +"""Base similiarity search for collections.""" + +__maintainer__ = ["baraline"] +__all__ = [ + "BaseCollectionSimilaritySearch", +] + +from abc import abstractmethod +from typing import Union, final + +import numpy as np +from numba import get_num_threads, set_num_threads + +from aeon.base import BaseCollectionEstimator +from aeon.similarity_search._base import BaseSimilaritySearch + + +class BaseCollectionSimilaritySearch(BaseCollectionEstimator, BaseSimilaritySearch): + """Similarity search base class for collections.""" + + # tag values specific to CollectionTransformers + _tags = { + "input_data_type": "Collection", + } + + @abstractmethod + def __init__(self, n_jobs=1): + self.n_jobs = n_jobs + super().__init__() + + @final + def fit( + self, + X: np.ndarray, + y=None, + ): + """ + Fit method: data preprocessing and storage. 
+ + Parameters + ---------- + X : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) + Input array to be used as database for the similarity search. If it is an + unequal length collection, it should be a list of 2d numpy arrays. + y : optional + Not used. + + Raises + ------ + TypeError + If the input X array is not 3D raise an error. + + Returns + ------- + self + """ + self.reset() + X = self._preprocess_collection(X) + # Store minimum number of n_timepoints for unequal length collections + self.n_channels_ = X[0].shape[1] + self.n_cases_ = len(X) + self.X_ = X + + prev_threads = get_num_threads() + set_num_threads(self._n_jobs) + self._fit(X, y=y) + set_num_threads(prev_threads) + + self.is_fitted = True + return self + + @abstractmethod + def _fit( + self, + X: np.ndarray, + y=None, + ): ... + + def _pre_predict( + self, + X: Union[np.ndarray, None] = None, + ): + """ + Predict method. + + Parameters + ---------- + X : Union[np.ndarray, None], optional + Optional data to use for predict.. The default is None. + + """ + self._check_is_fitted() + if X is not None: + # Could we call somehow _preprocess_series from a BaseCollectionEstimator ? 
+ self._check_predict_format(X) + return X diff --git a/aeon/similarity_search/collection/motifs/__init__.py b/aeon/similarity_search/collection/motifs/__init__.py new file mode 100644 index 0000000000..fc014bcced --- /dev/null +++ b/aeon/similarity_search/collection/motifs/__init__.py @@ -0,0 +1 @@ +"""Motif search for time series collection.""" diff --git a/aeon/similarity_search/collection/neighbors/__init__.py b/aeon/similarity_search/collection/neighbors/__init__.py new file mode 100644 index 0000000000..e9a5d49d49 --- /dev/null +++ b/aeon/similarity_search/collection/neighbors/__init__.py @@ -0,0 +1 @@ +"""Neighbors search for time series collection.""" diff --git a/aeon/similarity_search/series_search/_rp_lsh.py b/aeon/similarity_search/collection/neighbors/_rp_cosine_lsh.py similarity index 100% rename from aeon/similarity_search/series_search/_rp_lsh.py rename to aeon/similarity_search/collection/neighbors/_rp_cosine_lsh.py diff --git a/aeon/similarity_search/series/__init__.py b/aeon/similarity_search/series/__init__.py new file mode 100644 index 0000000000..23df7d1b53 --- /dev/null +++ b/aeon/similarity_search/series/__init__.py @@ -0,0 +1,7 @@ +"""Similarity search for series.""" + +__all__ = [ + "BaseSeriesSimilaritySearch", +] + +from aeon.similarity_search.series._base import BaseSeriesSimilaritySearch diff --git a/aeon/similarity_search/series/_base.py b/aeon/similarity_search/series/_base.py new file mode 100644 index 0000000000..b9cca5d5cb --- /dev/null +++ b/aeon/similarity_search/series/_base.py @@ -0,0 +1,115 @@ +"""Base similiarity search for series.""" + +from abc import abstractmethod +from typing import Union, final + +import numpy as np +from numba import get_num_threads, set_num_threads + +from aeon.base import BaseSeriesEstimator +from aeon.similarity_search._base import BaseSimilaritySearch +from aeon.utils.validation import check_n_jobs + + +class BaseSeriesSimilaritySearch(BaseSeriesEstimator, BaseSimilaritySearch): + """Base class 
for similarity search applications on single series.""" + + _tags = { + "input_data_type": "Series", + } + + @abstractmethod + def __init__(self, axis=1, n_jobs=1): + self.n_jobs = n_jobs + super().__init__(axis=axis) + + @final + def fit( + self, + X: np.ndarray, + y=None, + ): + """ + Fit method: data preprocessing and storage. + + Parameters + ---------- + X : np.ndarray, 2D array of shape (n_channels, n_timepoints) + Input series to be used for the similarity search operations. + y : optional + Not used. + + Raises + ------ + TypeError + If the input X array is not 2D raise an error. + + Returns + ------- + self + """ + self.reset() + self._n_jobs = check_n_jobs(self.n_jobs) + X = self._preprocess_series(X, self.axis, True) + # Store minimum number of n_timepoints for unequal length collections + self.n_channels_ = X.shape[0] + self.n_timepoints_ = X.shape[1] + self.X_ = X + + prev_threads = get_num_threads() + set_num_threads(self._n_jobs) + self._fit(X, y=y) + set_num_threads(prev_threads) + + self.is_fitted = True + return self + + @abstractmethod + def _fit( + self, + X: np.ndarray, + y=None, + ): ... + + def _pre_predict( + self, + X: Union[np.ndarray, None] = None, + ): + """ + Predict method. + + Parameters + ---------- + X : Union[np.ndarray, None], optional + Optional data to use for predict.. The default is None. + + """ + self._check_is_fitted() + if X is not None: + X = self._preprocess_series(X, self.axis, False) + self._check_predict_format(X) + return X + + def _check_X_index(self, X_index: int): + """ + Check wheter a X_index parameter is correctly formated and is admissible. + + Parameters + ---------- + X_index : int + Index of a timestamp in X_. 
+ + """ + if X_index is not None: + if not isinstance(X_index, int): + raise TypeError("Expected an integer for X_index but got {X_index}") + + max_timepoints = self.n_timepoints_ + if hasattr(self, "length"): + max_timepoints -= self.length + if X_index >= max_timepoints or X_index < 0: + raise ValueError( + "The value of X_index cannot exced the number " + "of timepoint in series given during fit. Expected a value " + f"between [0, {max_timepoints - 1}] but got {X_index}" + ) diff --git a/aeon/similarity_search/series/_commons.py b/aeon/similarity_search/series/_commons.py new file mode 100644 index 0000000000..d14281573f --- /dev/null +++ b/aeon/similarity_search/series/_commons.py @@ -0,0 +1,210 @@ +"""Helper and common function for similarity search series estimators.""" + +__maintainer__ = ["baraline"] + +import numpy as np +from numba import njit +from scipy.signal import convolve + + +def fft_sliding_dot_product(X, q): + """ + Use FFT convolution to calculate the sliding window dot product. + + This function applies the Fast Fourier Transform (FFT) to efficiently compute + the sliding dot product between the input time series `X` and the query `q`. + The dot product is computed for each channel individually. The sliding window + approach ensures that the dot product is calculated for every possible subsequence + of `X` that matches the length of `q` + + Parameters + ---------- + X : array, shape=(n_channels, n_timepoints) + Input time series + q : array, shape=(n_channels, query_length) + Input query + + Returns + ------- + out : np.ndarray, 2D array of shape (n_channels, n_timepoints - query_length + 1) + Sliding dot product between q and X. 
+ """ + n_channels, n_timepoints = X.shape + query_length = q.shape[1] + out = np.zeros((n_channels, n_timepoints - query_length + 1)) + for i in range(n_channels): + out[i, :] = convolve(np.flipud(q[i, :]), X[i, :], mode="valid").real + return out + + +def get_ith_products(X, T, L, ith): + """ + Compute dot products between X and the i-th subsequence of size L in T. + + Parameters + ---------- + X : array, shape = (n_channels, n_timepoints_X) + Input data. + T : array, shape = (n_channels, n_timepoints_T) + Data containing the query. + L : int + Overall query length. + ith : int + Query starting index in T. + + Returns + ------- + np.ndarray, 2D array of shape (n_channels, n_timepoints_X - L + 1) + Sliding dot product between the i-th subsequence of size L in T and X. + + """ + return fft_sliding_dot_product(X, T[:, ith : ith + L]) + + +def _inverse_distance_profile(dist_profile): + return 1 / (dist_profile + 1e-8) + + +@njit(cache=True) +def _extract_top_k_from_dist_profile( + dist_profile, + k, + threshold, + allow_trivial_matches, + exclusion_size, +): + """ + Given a distance profiles, extract the top k lower distances. + + Parameters + ---------- + dist_profile : np.ndarray, shape = (n_timepoints - length + 1) + A distance profile of length ``n_timepoints - length + 1``, with + ``length`` the size of the query used to compute the distance profiles. + k : int + Number of best matches to return + threshold : float + A threshold on the distances of the best matches. To be returned, a candidate + must have a distance bellow this threshold. This can reduce the number of + returned matches to be bellow ``k`` + allow_trivial_matches : bool + Wheter to allow returning matches that are in the same neighborhood. + exclusion_size : int + The size of the exlusion size to apply when ``allow_trivial_matches`` is + False. It is applied on both side of existing matches (+/- their indexes). 
+ + Returns + ------- + top_k_indexes : np.ndarray, shape = (k) + The indexes of the best matches in ``distance_profile``. + top_k_distances : np.ndarray, shape = (k) + The distances of the best matches. + + """ + if k == np.inf: + k = dist_profile.shape[0] + top_k_indexes = np.zeros((k), dtype=np.int64) - 1 + top_k_distances = np.full(k, np.inf, dtype=np.float64) + ub = np.full(k, np.inf) + lb = np.full(k, -1.0) + # Could be optimized by using argpartition + sorted_indexes = np.argsort(dist_profile) + _current_k = 0 + if not allow_trivial_matches: + _current_j = 0 + # Until we extract k value or explore all the array or until dist is > threshold + while _current_k < k and _current_j < len(sorted_indexes): + # if we didn't insert anything or there is a conflict in lb/ub + if _current_k > 0 and np.any( + (sorted_indexes[_current_j] >= lb[:_current_k]) + & (sorted_indexes[_current_j] <= ub[:_current_k]) + ): + pass + else: + _idx = sorted_indexes[_current_j] + if dist_profile[_idx] <= threshold: + top_k_indexes[_current_k] = _idx + top_k_distances[_current_k] = dist_profile[_idx] + ub[_current_k] = min( + top_k_indexes[_current_k] + exclusion_size, + len(dist_profile), + ) + lb[_current_k] = max(top_k_indexes[_current_k] - exclusion_size, 0) + _current_k += 1 + else: + break + _current_j += 1 + else: + _current_k += min(k, len(dist_profile)) + dist_profile = dist_profile[sorted_indexes[:_current_k]] + dist_profile = dist_profile[dist_profile <= threshold] + _current_k = len(dist_profile) + + top_k_indexes[:_current_k] = sorted_indexes[:_current_k] + top_k_distances[:_current_k] = dist_profile[:_current_k] + + return top_k_indexes[:_current_k], top_k_distances[:_current_k] + + +# Could add aggregation function as parameter instead of just max +def _extract_top_k_motifs(MP, IP, k): + criterion = np.zeros(len(MP)) + for i in range(len(MP)): + criterion[i] = max(MP[i]) + idx = np.argsort(criterion) + return [MP[i] for i in idx[:k]], [IP[i] for i in idx[:k]] + + +def 
_extract_top_r_motifs(MP, IP, k): + criterion = np.zeros(len(MP)) + for i in range(len(MP)): + criterion[i] = len(MP[i]) + idx = np.argsort(criterion)[::-1] + return [MP[i] for i in idx[:k]], [IP[i] for i in idx[:k]] + + +@njit(cache=True, fastmath=True) +def _update_dot_products( + X, + T, + XT_products, + L, + i_query, +): + """ + Update dot products of the i-th query of size L in T from the dot products of i-1. + + Parameters + ---------- + X: np.ndarray, 2D array of shape (n_channels, n_timepoints) + Input time series on which the sliding dot product is computed. + T: np.ndarray, 2D array of shape (n_channels, series_length) + The series used for similarity search. Note that series_length can be equal, + superior or inferior to n_timepoints, it doesn't matter. + L : int + The length of the subsequences considered during the search. This parameter + cannot be larger than n_timepoints and series_length. + i_query : int + Query starting index in T. + + Returns + ------- + XT_products : np.ndarray of shape (n_channels, n_timepoints - L + 1) + Sliding dot product between the i-th subsequence of size L in T and X. 
+ + """ + n_channels = T.shape[0] + Q = T[:, i_query : i_query + L] + n_candidates = X.shape[1] - L + 1 + + for i_ft in range(n_channels): + # first element of all 0 to n-1 candidates * first element of previous query + _a1 = X[i_ft, : n_candidates - 1] * T[i_ft, i_query - 1] + # last element of all 1 to n candidates * last element of current query + _a2 = X[i_ft, L : L - 1 + n_candidates] * T[i_ft, i_query + L - 1] + + XT_products[i_ft, 1:] = XT_products[i_ft, :-1] - _a1 + _a2 + + # Compute first dot product + XT_products[i_ft, 0] = np.sum(Q[i_ft] * X[i_ft, :L]) + return XT_products diff --git a/aeon/similarity_search/series/motifs/__init__.py b/aeon/similarity_search/series/motifs/__init__.py new file mode 100644 index 0000000000..d4853a68fe --- /dev/null +++ b/aeon/similarity_search/series/motifs/__init__.py @@ -0,0 +1,7 @@ +"""Subsequence Neighbor search for series.""" + +__all__ = [ + "StompMotif", +] + +from aeon.similarity_search.series.motifs._stomp import StompMotif diff --git a/aeon/similarity_search/series/motifs/_stomp.py b/aeon/similarity_search/series/motifs/_stomp.py new file mode 100644 index 0000000000..9287d80241 --- /dev/null +++ b/aeon/similarity_search/series/motifs/_stomp.py @@ -0,0 +1,457 @@ +"""Implementation of STOMP with squared euclidean distance.""" + +__maintainer__ = ["baraline"] +__all__ = ["StompMotif"] + +from typing import Optional + +import numpy as np +from numba import njit +from numba.typed import List + +from aeon.similarity_search.series._base import BaseSeriesSimilaritySearch +from aeon.similarity_search.series._commons import ( + _extract_top_k_from_dist_profile, + _extract_top_k_motifs, + _extract_top_r_motifs, + _inverse_distance_profile, + _update_dot_products, + get_ith_products, +) +from aeon.similarity_search.series.neighbors._mass import ( + _normalized_squared_distance_profile, + _squared_distance_profile, +) +from aeon.utils.numba.general import sliding_mean_std_one_series + + +class 
StompMotif(BaseSeriesSimilaritySearch): + """ + Estimator to extract top k motifs using STOMP, descibed in [1]_. + + This estimators allows to perform multiple type of motif search operations by using + different parameterization. We base oursleves on Figure 3 of [2]_ to establish the + following list, we do not yet support "Learning" and "Valmod" motifs : + + - for "Pair Motifs" : This is the default configuration + + - for "k-Motiflets" : { + "motif_size": k, + } + + - for "k-motifs" (naming is confusing here, it is a range based motif): { + "motif_size":np.inf, + "dist_threshold":r, + "motif_extraction_method":"r_motifs" + } + + Parameters + ---------- + length : int + The length of the motifs to extract. This is the length of the subsequence + that will be used in the computations. + normalize : bool + Wheter the computations between subsequences should use a z-normalied distance. + + Notes + ----- + This estimator only provide exact computation method, faster approximate methods + also exists in the litterature. We use a squared euclidean distance instead of the + euclidean distance, if you want euclidean distance results, you should square root + the obtained results. + + References + ---------- + .. [1] Yan Zhu, Zachary Zimmerman, Nader Shakibay Senobari, Chin-Chia Michael + Yeh, Gareth Funning, Abdullah Mueen, Philip Brisk, and Eamonn Keogh. 2016. + Matrix profile II: Exploiting a novel algorithm and GPUs to break the one hundred + million barrier for time series motifs and joins. In 2016 IEEE 16th international + conference on data mining (ICDM). IEEE, 739–748. + .. [2] Patrick Schäfer and Ulf Leser. 2022. Motiflets: Simple and Accurate Detection + of Motifs in Time Series. Proc. VLDB Endow. 16, 4 (December 2022), 725–737. 
+ https://doi.org/10.14778/3574245.3574257 + """ + + def __init__( + self, + length: int, + normalize: Optional[bool] = False, + ): + self.length = length + self.normalize = normalize + super().__init__() + + def _fit( + self, + X: np.ndarray, + y=None, + ): + if self.normalize: + self.X_means_, X_stds_ = sliding_mean_std_one_series(X, self.length, 1) + return self + + def predict( + self, + X: np.ndarray = None, + k: Optional[int] = 1, + motif_size: Optional[int] = 1, + dist_threshold: Optional[float] = np.inf, + allow_trivial_matches: Optional[bool] = False, + exclusion_factor: Optional[float] = 2, + inverse_distance: Optional[bool] = False, + motif_extraction_method: Optional[str] = "k_motifs", + ): + """ + Exctract the motifs of X_ relative to a series X using STOMP matrix prfoile. + + To compute self-motifs, X is set to None. + + Parameters + ---------- + X : np.ndarray, shape=(n_channels, n_timepoint) + Series to use to compute the matrix profile against X_. If None, will + compute the self matrix profile of X_. Motifs will then be extracted from + the matrix profile. + k : int + The number of motifs to return. The default is 1, meaning we return only + the motif set with the minimal sum of distances to its query. + motif_size : int + The number of subsequences in a motif. Default is 1, meaning we extract + motif pairs (the query and its best match) + dist_threshold : float + The maximum allowed distance of a candidate subsequence of X to a query + subsequence from X_ for the candidate to be considered as a neighbor. + allow_trivial_matches: bool, optional + Wheter a neighbors of a match to a query can be also considered as matches + (True), or if an exclusion zone is applied around each match to avoid + trivial matches with their direct neighbors (False). + exclusion_factor : float, default=1. + A factor of the query length used to define the exclusion zone when + ``allow_trivial_matches`` is set to False. 
For a given timestamp, + the exclusion zone starts from + :math:`id_timestamp - length//exclusion_factor` and end at + :math:`id_timestamp + length//exclusion_factor`. + inverse_distance : bool + If True, the matching will be made on the inverse of the distance, and thus, + the farther neighbors will be returned instead of the closest ones. + motif_extraction_method : str + A string indicating the methodology to use to extract the top motifs. + Available methods are "r_motifs" and "k_motifs". "r_motifs" means we rank + motif set by their cardinality, with higher is better. "k_motifs" means + we rank motif set by their maximum distance to their query + + Returns + ------- + np.ndarray, shape = (k, motif_size) + The indexes of the best matches in ``distance_profile``. + np.ndarray, shape = (k, motif_size) + The distances of the best matches. + + """ + X = self._pre_predict(X) + if motif_extraction_method not in ["k_motifs", "r_motifs"]: + raise ValueError( + "Expected motif_extraction_method to be either 'k_motifs' or 'r_motifs'" + f"but got {motif_extraction_method}" + ) + exclusion_size = self.length // exclusion_factor + MP, IP = self.compute_matrix_profile( + X, + motif_size=motif_size, + dist_threshold=dist_threshold, + allow_trivial_matches=allow_trivial_matches, + exclusion_size=exclusion_size, + inverse_distance=inverse_distance, + ) + if motif_extraction_method == "k_motifs": + return _extract_top_k_motifs(MP, IP, k) + elif motif_extraction_method == "r_motifs": + return _extract_top_r_motifs(MP, IP, k) + + def compute_matrix_profile( + self, + X: np.ndarray, + motif_size: Optional[int] = 1, + dist_threshold: Optional[float] = np.inf, + allow_trivial_matches: Optional[bool] = False, + exclusion_size: Optional[float] = 2, + inverse_distance: Optional[bool] = False, + ): + """ + Compute matrix profile. + + The matrix profile is computed on the series given in fit (X_). If X is + not given, computes the self matrix profile of X_. 
Otherwise, compute the matrix + profile of X_ relative to X. + + Parameters + ---------- + X : np.ndarray, shape = (n_channels, n_timepoints) + A 2D array time series on against which the matrix profile of X_ will be + computed. + motif_size : int + The number of subsequences in a motif. Default is 1, meaning we extract + motif pairs (the query and its best match). + dist_threshold : float + The maximum allowed distance of a candidate subsequence of X to a query + subsequence from X_ for the candidate to be considered as a neighbor. + inverse_distance : bool + If True, the matching will be made on the inverse of the distance, and thus, + the worst matches to the query will be returned instead of the best ones. + exclusion_size : int + The size of the exclusion zone used to prevent returning as top k candidates + the ones that are close to each other (for example i and i+1). + It is used to define a region between + :math:`id_timestamp - exclusion_size` and + :math:`id_timestamp + exclusion_size` which cannot be returned + as best match if :math:`id_timestamp` was already selected. By default, + the value None means that this is not used. + + Returns + ------- + MP : TypedList of np.ndarray (n_timepoints - L + 1) + Matrix profile distances for each query subsequence. n_timepoints is the + number of timepoint of X_. Each element of the list contains array of + variable size. + IP : TypedList of np.ndarray (n_timepoints - L + 1) + Indexes of the top matches for each query subsequence. n_timepoints is the + number of timepoint of X_. Each element of the list contains array of + variable size. 
+ """ + if X is None: + is_self_mp = True + X = self.X_ + if self.normalize: + X_means, X_stds = self.X_means_, self.X_stds_ + else: + is_self_mp = False + if self.normalize: + X_means, X_stds = sliding_mean_std_one_series(X, self.length, 1) + + X_dotX = get_ith_products(X, self.X_, self.length, 0) + + if self.normalize: + MP, IP = _stomp_normalized( + self.X_, + X, + X_dotX, + self.X_means_, + self.X_stds_, + X_means, + X_stds, + self.length, + motif_size, + dist_threshold, + allow_trivial_matches, + exclusion_size, + inverse_distance, + is_self_mp, + ) + else: + MP, IP = _stomp( + self.X_, + X, + X_dotX, + self.length, + motif_size, + dist_threshold, + allow_trivial_matches, + exclusion_size, + inverse_distance, + is_self_mp, + ) + return MP, IP + + +@njit(cache=True, fastmath=True) +def _stomp_normalized( + X_A, + X_B, + AdotB, + X_A_means, + X_A_stds, + X_B_means, + X_B_stds, + L, + motif_size, + dist_threshold, + allow_trivial_matches, + exclusion_size, + inverse_distance, + is_self_mp, +): + """ + Compute the Matrix Profile using the STOMP algorithm with normalized distances. + + X_A : np.ndarray, 2D array of shape (n_channels, n_timepoints) + The series from which the queries will be extracted. + X_B : np.ndarray, 2D array of shape (n_channels, series_length) + The time series on which the distance profile of each query will be computed. + AdotB : np.ndarray, 2D array of shape (n_channels, series_length - L + 1) + Precomputed dot products between the first query of size L of X_A and X_B. + X_A_means : np.ndarray, 2D array of shape (n_channels, n_timepoints - L + 1) + Means of each subsequences of X_A of size L. + X_A_stds : np.ndarray, 2D array of shape (n_channels, n_timepoints - L + 1) + Stds of each subsequences of X of size L. + X_B_means : np.ndarray, 2D array of shape (n_channels, series_length - L + 1) + Means of each subsequences of X_B of size L. 
+ X_B_stds : np.ndarray, 2D array of shape (n_channels, series_length - L + 1) + Stds of each subsequences of X_B of size L. + L : int + Length of the subsequences used for the distance computation. + motif_size : int + The number of subsequences to extract from each distance profile. + dist_threshold : float + The maximum allowed distance of a candidate subsequence of X to a query + subsequence from X_ for the candidate to be considered as a neighbor. + allow_trivial_matches : bool + Wheter the top-k candidates can be neighboring subsequences. + exclusion_size : int + The size of the exclusion zone used to prevent returning as top k candidates + the ones that are close to each other (for example i and i+1). + It is used to define a region between + :math:`id_timestamp - exclusion_size` and + :math:`id_timestamp + exclusion_size` which cannot be returned + as best match if :math:`id_timestamp` was already selected. By default, + the value None means that this is not used. + inverse_distance : bool + If True, the matching will be made on the inverse of the distance, and thus, the + worst matches to the query will be returned instead of the best ones. + is_self_mp : bool + Wheter X_A == X_B. + + Returns + ------- + MP : TypedList of np.ndarray (n_timepoints - L + 1) + Matrix profile distances for each query subsequence. n_timepoints is the + number of timepoint of X_. Each element of the list contains array of + variable size. + IP : TypedList of np.ndarray (n_timepoints - L + 1) + Indexes of the top matches for each query subsequence. n_timepoints is the + number of timepoint of X_. Each element of the list contains array of + variable size. 
+ """ + n_queries = X_A.shape[1] - L + 1 + _max_timestamp = X_B.shape[1] - L + MP = List() + IP = List() + + for i_q in range(n_queries): + # size T.shape[1] - L + 1 + dist_profile = _normalized_squared_distance_profile( + AdotB, X_B_means, X_B_stds, X_A_means[:, i_q], X_A_stds[:, i_q], L + ) + + if i_q + 1 < n_queries: + AdotB = _update_dot_products(X_B, X_A, AdotB, L, i_q + 1) + + if inverse_distance: + dist_profile = _inverse_distance_profile(dist_profile) + + if is_self_mp: + ub = min(i_q + exclusion_size, _max_timestamp) + lb = max(0, i_q - exclusion_size) + dist_profile[lb:ub] = np.inf + + top_indexes, top_dists = _extract_top_k_from_dist_profile( + dist_profile, + motif_size, + dist_threshold, + allow_trivial_matches, + exclusion_size, + ) + + MP.append(top_dists) + IP.append(top_indexes) + + return MP, IP + + +@njit(cache=True, fastmath=True) +def _stomp( + X_A, + X_B, + AdotB, + L, + motif_size, + dist_threshold, + allow_trivial_matches, + exclusion_size, + inverse_distance, + is_self_mp, +): + """ + Compute the Matrix Profile using the STOMP algorithm with non-normalized distances. + + X_A : np.ndarray, 2D array of shape (n_channels, n_timepoints) + The series from which the queries will be extracted. + X_B : np.ndarray, 2D array of shape (n_channels, series_length) + The time series on which the distance profile of each query will be computed. + AdotB : np.ndarray, 2D array of shape (n_channels, series_length - L + 1) + Precomputed dot products between the first query of size L of X_A and X_B. + L : int + Length of the subsequences used for the distance computation. + motif_size : int + The number of subsequences to extract from each distance profile. + dist_threshold : float + The maximum allowed distance of a candidate subsequence of X to a query + subsequence from X_ for the candidate to be considered as a neighbor. + allow_trivial_matches : bool + Wheter the top-k candidates can be neighboring subsequences. 
+ exclusion_size : int + The size of the exclusion zone used to prevent returning as top k candidates + the ones that are close to each other (for example i and i+1). + It is used to define a region between + :math:`id_timestamp - exclusion_size` and + :math:`id_timestamp + exclusion_size` which cannot be returned + as best match if :math:`id_timestamp` was already selected. By default, + the value None means that this is not used. + inverse_distance : bool + If True, the matching will be made on the inverse of the distance, and thus, the + worst matches to the query will be returned instead of the best ones. + is_self_mp : bool + Wheter X_A == X_B. + + Returns + ------- + MP : TypedList of np.ndarray (n_timepoints - L + 1) + Matrix profile distances for each query subsequence. n_timepoints is the + number of timepoint of X_. Each element of the list contains array of + variable size. + IP : TypedList of np.ndarray (n_timepoints - L + 1) + Indexes of the top matches for each query subsequence. n_timepoints is the + number of timepoint of X_. Each element of the list contains array of + variable size. 
+ """ + n_queries = X_A.shape[1] - L + 1 + _max_timestamp = X_B.shape[1] - L + MP = List() + IP = List() + + # For each query of size L in X_A + for i_q in range(n_queries): + Q = X_A[:, i_q : i_q + L] + dist_profile = _squared_distance_profile(AdotB, X_B, Q) + if i_q + 1 < n_queries: + AdotB = _update_dot_products(X_B, X_A, AdotB, L, i_q + 1) + + if inverse_distance: + dist_profile = _inverse_distance_profile(dist_profile) + + if is_self_mp: + ub = min(i_q + exclusion_size, _max_timestamp) + lb = max(0, i_q - exclusion_size) + dist_profile[lb:ub] = np.inf + + top_indexes, top_dists = _extract_top_k_from_dist_profile( + dist_profile, + motif_size, + dist_threshold, + allow_trivial_matches, + exclusion_size, + ) + + MP.append(top_dists) + IP.append(top_indexes) + + return MP, IP diff --git a/aeon/similarity_search/series/motifs/tests/__init__.py b/aeon/similarity_search/series/motifs/tests/__init__.py new file mode 100644 index 0000000000..d0d8f2c42c --- /dev/null +++ b/aeon/similarity_search/series/motifs/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for series motif search methods.""" diff --git a/aeon/similarity_search/series/motifs/tests/test_stomp.py b/aeon/similarity_search/series/motifs/tests/test_stomp.py new file mode 100644 index 0000000000..67ff930de1 --- /dev/null +++ b/aeon/similarity_search/series/motifs/tests/test_stomp.py @@ -0,0 +1,149 @@ +""" +Tests for stomp algorithm. + +We do not test equality for returned indexes due to the unstable nature of argsort +and the fact that the "kind=stable" parameter is not yet supported in numba. We instead +test that the returned index match the expected distance value. 
+""" + +__maintainer__ = ["baraline"] + +import numpy as np +import pytest +from numpy.testing import assert_almost_equal, assert_array_almost_equal + +from aeon.similarity_search.series._commons import ( + _extract_top_k_from_dist_profile, + _inverse_distance_profile, + get_ith_products, +) +from aeon.similarity_search.series.motifs._stomp import _stomp, _stomp_normalized +from aeon.similarity_search.series.neighbors._dummy import ( + _naive_squared_distance_profile, +) +from aeon.testing.data_generation import make_example_2d_numpy_series +from aeon.utils.numba.general import ( + get_all_subsequences, + sliding_mean_std_one_series, + z_normalise_series_3d, +) + +MOTIFS_SIZE_VALUES = [1, 3] +THRESHOLD = [np.inf, 0.75] +THRESHOLD_NORM = [np.inf, 4.5] +NN_MATCHES = [True, False] +INVERSE = [True, False] + + +@pytest.mark.parametrize("motif_size", MOTIFS_SIZE_VALUES) +@pytest.mark.parametrize("threshold", THRESHOLD) +@pytest.mark.parametrize("allow_trivial_matches", NN_MATCHES) +@pytest.mark.parametrize("inverse_distance", INVERSE) +def test__stomp(motif_size, threshold, allow_trivial_matches, inverse_distance): + """Test STOMP method.""" + L = 3 + + X_A = make_example_2d_numpy_series( + n_channels=2, + n_timepoints=10, + ) + X_B = make_example_2d_numpy_series(n_channels=2, n_timepoints=10) + AdotB = get_ith_products(X_B, X_A, L, 0) + + exclusion_size = L + # MP : distances to best matches for each query + # IP : Indexes of best matches for each query + MP, IP = _stomp( + X_A, + X_B, + AdotB, + L, + motif_size, + threshold, + allow_trivial_matches, + exclusion_size, + inverse_distance, + False, + ) + # For each query of size L in T + X_B_subs = get_all_subsequences(X_B, L, 1) + X_A_subs = get_all_subsequences(X_A, L, 1) + for i in range(X_A.shape[1] - L + 1): + dist_profile = _naive_squared_distance_profile(X_B_subs, X_A_subs[i]) + # Check that the top matches extracted have the same value that the + # top matches in the distance profile + if inverse_distance: + 
dist_profile = _inverse_distance_profile(dist_profile) + + top_k_indexes, top_k_distances = _extract_top_k_from_dist_profile( + dist_profile, motif_size, threshold, allow_trivial_matches, exclusion_size + ) + # Check that the top matches extracted have the same value that the + # top matches in the distance profile + assert_array_almost_equal(MP[i], top_k_distances) + + # Check that the index in IP correspond to a distance profile point + # with value equal to the corresponding MP point. + for j, index in enumerate(top_k_indexes): + assert_almost_equal(MP[i][j], dist_profile[index]) + + +@pytest.mark.parametrize("motif_size", MOTIFS_SIZE_VALUES) +@pytest.mark.parametrize("threshold", THRESHOLD_NORM) +@pytest.mark.parametrize("allow_trivial_matches", NN_MATCHES) +@pytest.mark.parametrize("inverse_distance", INVERSE) +def test__stomp_normalised( + motif_size, threshold, allow_trivial_matches, inverse_distance +): + """Test STOMP normalised method.""" + L = 3 + + X_A = make_example_2d_numpy_series( + n_channels=2, + n_timepoints=10, + ) + X_B = make_example_2d_numpy_series(n_channels=2, n_timepoints=10) + X_A_means, X_A_stds = sliding_mean_std_one_series(X_A, L, 1) + X_B_means, X_B_stds = sliding_mean_std_one_series(X_B, L, 1) + AdotB = get_ith_products(X_B, X_A, L, 0) + + exclusion_size = L + # MP : distances to best matches for each query + # IP : Indexes of best matches for each query + MP, IP = _stomp_normalized( + X_A, + X_B, + AdotB, + X_A_means, + X_A_stds, + X_B_means, + X_B_stds, + L, + motif_size, + threshold, + allow_trivial_matches, + exclusion_size, + inverse_distance, + False, + ) + # For each query of size L in T + X_B_subs = z_normalise_series_3d(get_all_subsequences(X_B, L, 1)) + X_A_subs = z_normalise_series_3d(get_all_subsequences(X_A, L, 1)) + for i in range(X_A.shape[1] - L + 1): + dist_profile = _naive_squared_distance_profile(X_B_subs, X_A_subs[i]) + # Check that the top matches extracted have the same value that the + # top matches in the 
distance profile + if inverse_distance: + dist_profile = _inverse_distance_profile(dist_profile) + top_k_indexes, top_k_distances = _extract_top_k_from_dist_profile( + dist_profile, motif_size, threshold, allow_trivial_matches, exclusion_size + ) + + # Check that the top matches extracted have the same value that the + # top matches in the distance profile + assert_array_almost_equal(MP[i], top_k_distances) + + # Check that the index in IP correspond to a distance profile point + # with value equal to the corresponding MP point. + for j, index in enumerate(top_k_indexes): + assert_almost_equal(MP[i][j], dist_profile[index]) diff --git a/aeon/similarity_search/series/neighbors/__init__.py b/aeon/similarity_search/series/neighbors/__init__.py new file mode 100644 index 0000000000..047bfbe9c4 --- /dev/null +++ b/aeon/similarity_search/series/neighbors/__init__.py @@ -0,0 +1,9 @@ +"""Subsequence Neighbor search for series.""" + +__all__ = [ + "DummySNN", + "MassSNN", +] + +from aeon.similarity_search.series.neighbors._dummy import DummySNN +from aeon.similarity_search.series.neighbors._mass import MassSNN diff --git a/aeon/similarity_search/series/neighbors/_dummy.py b/aeon/similarity_search/series/neighbors/_dummy.py new file mode 100644 index 0000000000..bbea714eda --- /dev/null +++ b/aeon/similarity_search/series/neighbors/_dummy.py @@ -0,0 +1,159 @@ +"""Implementation of NN with brute force.""" + +from typing import Optional + +__maintainer__ = ["baraline"] +__all__ = ["DummySNN"] + +import numpy as np +from numba import njit, prange + +from aeon.similarity_search.series._base import BaseSeriesSimilaritySearch +from aeon.similarity_search.series._commons import ( + _extract_top_k_from_dist_profile, + _inverse_distance_profile, +) +from aeon.utils.numba.general import ( + get_all_subsequences, + z_normalise_series_2d, + z_normalise_series_3d, +) + + +class DummySNN(BaseSeriesSimilaritySearch): + """Estimator to compute the on profile and distance profile using brute 
force.""" + + def __init__( + self, + length: int, + normalize: Optional[bool] = False, + n_jobs: Optional[int] = 1, + ): + self.length = length + self.normalize = normalize + super().__init__(n_jobs=n_jobs) + + def _fit( + self, + X: np.ndarray, + y=None, + ): + self.X_subs = get_all_subsequences(self.X_, self.length, 1) + if self.normalize: + self.X_subs = z_normalise_series_3d(self.X_subs) + return self + + def predict( + self, + X: np.ndarray, + k: Optional[int] = 1, + threshold: Optional[float] = np.inf, + exclusion_factor: Optional[float] = 2, + inverse_distance: Optional[bool] = False, + allow_neighboring_matches: Optional[bool] = False, + X_index: Optional[int] = None, + ): + """ + Compute nearest neighbors to X in subsequences of X_. + + Parameters + ---------- + X : np.ndarray, shape=(n_channels, length) + Subsequence we want to find neighbors for. + k : int + The number of neighbors to return. + threshold : float + The maximum distance of neighbors to X. + inverse_distance : bool + If True, the matching will be made on the inverse of the distance, and thus, + the farther neighbors will be returned instead of the closest ones. + exclusion_factor : float, default=1. + A factor of the query length used to define the exclusion zone when + ``allow_neighboring_matches`` is set to False. For a given timestamp, + the exclusion zone starts from + :math:`id_timestamp - length//exclusion_factor` and end at + :math:`id_timestamp + length//exclusion_factor`. + X_index : Optional[int], optional + If ``X`` is a subsequence of X_, specify its starting timestamp in ``X_``. + If specified, neighboring subsequences of X won't be able to match as + neighbors. + + Returns + ------- + np.ndarray, shape = (k) + The indexes of the best matches in ``distance_profile``. + np.ndarray, shape = (k) + The distances of the best matches. 
+ + """ + X = self._pre_predict(X) + X_index = self._check_X_index(X_index) + dist_profile = self.compute_distance_profile(X) + if inverse_distance: + dist_profile = _inverse_distance_profile(dist_profile) + + if X_index is not None: + exclusion_size = self.length // exclusion_factor + _max_timestamp = self.n_timepoints_ - self.length + ub = min(X_index + exclusion_size, _max_timestamp) + lb = max(0, X_index - exclusion_size) + dist_profile[lb:ub] = np.inf + + return _extract_top_k_from_dist_profile( + dist_profile, + k, + threshold, + allow_neighboring_matches, + exclusion_size, + ) + + def compute_distance_profile(self, X: np.ndarray): + """ + Compute the distance profile of X to all samples in X_. + + Parameters + ---------- + X : np.ndarray, 2D array of shape (n_channels, length) + The query to use to compute the distance profiles. + + Returns + ------- + distance_profile : np.ndarray, 1D array of shape (n_candidates) + The distance profile of X to X_. The ``n_candidates`` value + is equal to ``n_timepoins - length + 1``, with ``n_timepoints`` the + length of X_. + + """ + if self.normalize: + X = z_normalise_series_2d(X) + return _naive_squared_distance_profile(self.X_subs, X) + + +@njit(cache=True, fastmath=True, parallel=True) +def _naive_squared_distance_profile( + X_subs, + Q, +): + """ + Compute a squared euclidean distance profile. + + Parameters + ---------- + X_subs : array, shape=(n_subsequences, n_channels, length) + Subsequences of size length of the input time series to search in. + Q : array, shape=(n_channels, query_length) + Query used during the search. + + Returns + ------- + out : np.ndarray, 1D array of shape (n_samples, n_timepoints_t - query_length + 1) + The distance between the query and all candidates in X. 
+ + """ + n_subs, n_channels, length = X_subs.shape + dist_profile = np.zeros(n_subs) + for i in prange(n_subs): + for j in range(n_channels): + for k in range(length): + dist_profile[i] += (X_subs[i, j, k] - Q[j, k]) ** 2 + return dist_profile diff --git a/aeon/similarity_search/series/neighbors/_mass.py b/aeon/similarity_search/series/neighbors/_mass.py new file mode 100644 index 0000000000..bb56815f4e --- /dev/null +++ b/aeon/similarity_search/series/neighbors/_mass.py @@ -0,0 +1,247 @@ +"""Implementation of NN with MASS.""" + +from typing import Optional + +__maintainer__ = ["baraline"] +__all__ = ["MassSNN"] + +import numpy as np +from numba import njit, prange + +from aeon.similarity_search.series._base import BaseSeriesSimilaritySearch +from aeon.similarity_search.series._commons import ( + _extract_top_k_from_dist_profile, + _inverse_distance_profile, + fft_sliding_dot_product, +) +from aeon.utils.numba.general import ( + AEON_NUMBA_STD_THRESHOLD, + sliding_mean_std_one_series, +) + + +class MassSNN(BaseSeriesSimilaritySearch): + """Estimator to compute the on profile and distance profile using MASS.""" + + def __init__( + self, + length: int, + normalize: Optional[bool] = False, + ): + self.length = length + self.normalize = normalize + super().__init__() + + def _fit( + self, + X: np.ndarray, + y=None, + ): + if self.normalize: + self.X_means_, X_stds_ = sliding_mean_std_one_series(X, self.length, 1) + return self + + def predict( + self, + X: np.ndarray, + k: Optional[int] = 1, + dist_threshold: Optional[float] = np.inf, + allow_trivial_matches: Optional[bool] = False, + exclusion_factor: Optional[float] = 2, + inverse_distance: Optional[bool] = False, + X_index: Optional[int] = None, + ): + """ + Compute nearest neighbors to X in subsequences of X_. + + Parameters + ---------- + X : np.ndarray, shape=(n_channels, length) + Subsequence we want to find neighbors for. + k : int + The number of neighbors to return. 
+ dist_threshold : float + The maximum allowed distance of a candidate subsequence of X_ to X + for the candidate to be considered as a neighbor. + allow_trivial_matches: bool, optional + Wheter a neighbors of a match to a query can be also considered as matches + (True), or if an exclusion zone is applied around each match to avoid + trivial matches with their direct neighbors (False). + inverse_distance : bool + If True, the matching will be made on the inverse of the distance, and thus, + the farther neighbors will be returned instead of the closest ones. + exclusion_factor : float, default=1. + A factor of the query length used to define the exclusion zone when + ``allow_trivial_matches`` is set to False. For a given timestamp, + the exclusion zone starts from + :math:`id_timestamp - length//exclusion_factor` and end at + :math:`id_timestamp + length//exclusion_factor`. + X_index : Optional[int], optional + If ``X`` is a subsequence of X_, specify its starting timestamp in ``X_``. + If specified, neighboring subsequences of X won't be able to match as + neighbors. + + Returns + ------- + np.ndarray, shape = (k) + The indexes of the best matches in ``distance_profile``. + np.ndarray, shape = (k) + The distances of the best matches. + + """ + X = self._pre_predict(X) + X_index = self._check_X_index(X_index) + dist_profile = self.compute_distance_profile(X) + if inverse_distance: + dist_profile = _inverse_distance_profile(dist_profile) + + if X_index is not None: + exclusion_size = self.length // exclusion_factor + _max_timestamp = self.n_timepoints_ - self.length + ub = min(X_index + exclusion_size, _max_timestamp) + lb = max(0, X_index - exclusion_size) + dist_profile[lb:ub] = np.inf + + return _extract_top_k_from_dist_profile( + dist_profile, + k, + dist_threshold, + allow_trivial_matches, + exclusion_size, + ) + + def compute_distance_profile(self, X: np.ndarray): + """ + Compute the distance profile of X to all samples in X_. 
+ + Parameters + ---------- + X : np.ndarray, 2D array of shape (n_channels, length) + The query to use to compute the distance profiles. + + Returns + ------- + distance_profiles : np.ndarray, 2D array of shape (n_cases, n_candidates) + The distance profile of X to all samples in X_. The ``n_candidates`` value + is equal to ``n_timepoins - length + 1``. If X_ is an unequal length + collection, returns a numba typed list instead of an ndarray. + + """ + QT = fft_sliding_dot_product(self.X_, X) + + if self.normalize: + distance_profile = _normalized_squared_distance_profile( + QT, + self.X_means_, + self.X_stds_, + X.mean(axis=1), + X.std(axis=1), + self.length, + ) + else: + distance_profile = _squared_distance_profile( + QT, + self.X_, # T + X, # Q + ) + + return distance_profile + + +@njit(cache=True, fastmath=True) +def _squared_distance_profile(QT, T, Q): + """ + Compute squared distance profile between query subsequence and a single time series. + + This function calculates the squared distance profile for a single time series by + leveraging the dot product of the query and time series as well as precomputed sums + of squares to efficiently compute the squared distances. + + Parameters + ---------- + QT : np.ndarray, 2D array of shape (n_channels, n_timepoints - query_length + 1) + The dot product between the query and the time series. + T : np.ndarray, 2D array of shape (n_channels, series_length) + The series used for similarity search. Note that series_length can be equal, + superior or inferior to n_timepoints, it doesn't matter. + Q : np.ndarray + 2D array of shape (n_channels, query_length) representing query subsequence. + + Returns + ------- + distance_profile : np.ndarray + 2D array of shape (n_channels, n_timepoints - query_length + 1) + The squared distance profile between the query and the input time series. 
+ """ + n_channels, profile_length = QT.shape + query_length = Q.shape[1] + _QT = -2 * QT + distance_profile = np.zeros(profile_length) + for k in prange(n_channels): + _sum = 0 + _qsum = 0 + for j in prange(query_length): + _sum += T[k, j] ** 2 + _qsum += Q[k, j] ** 2 + + distance_profile += _qsum + _QT[k] + distance_profile[0] += _sum + for i in prange(1, profile_length): + _sum += T[k, i + (query_length - 1)] ** 2 - T[k, i - 1] ** 2 + distance_profile[i] += _sum + return distance_profile + + +@njit(cache=True, fastmath=True) +def _normalized_squared_distance_profile( + QT, T_means, T_stds, Q_means, Q_stds, query_length +): + """ + Compute the z-normalized squared Euclidean distance profile for one time series. + + Parameters + ---------- + QT : np.ndarray, 2D array of shape (n_channels, n_timepoints - query_length + 1) + The dot product between the query and the time series. + T_means : np.ndarray, 1D array of length n_channels + The mean values of the time series for each channel. + T_stds : np.ndarray, 2D array of shape (n_channels, profile_length) + The standard deviations of the time series for each channel and position. + Q_means : np.ndarray, 1D array of shape (n_channels) + Means of the query q + Q_stds : np.ndarray, 1D array of shape (n_channels) + Stds of the query q + query_length : int + The length of the query subsequence used for the distance profile computation. + + + Returns + ------- + np.ndarray + 2D array of shape (n_channels, n_timepoints - query_length + 1) containing the + z-normalized squared distance profile between the query subsequence and the time + series. Entries are computed based on the z-normalized values, with special + handling for constant values. 
+ """ + n_channels, profile_length = QT.shape + distance_profile = np.zeros(profile_length) + Q_is_constant = Q_stds <= AEON_NUMBA_STD_THRESHOLD + for i in prange(profile_length): + Sub_is_constant = T_stds[:, i] <= AEON_NUMBA_STD_THRESHOLD + for k in prange(n_channels): + # Two Constant case + if Q_is_constant[k] and Sub_is_constant[k]: + _val = 0 + # One Constant case + elif Q_is_constant[k] or Sub_is_constant[k]: + _val = query_length + else: + denom = query_length * Q_stds[k] * T_stds[k, i] + + p = (QT[k, i] - query_length * (Q_means[k] * T_means[k, i])) / denom + p = min(p, 1.0) + + _val = abs(2 * query_length * (1.0 - p)) + distance_profile[i] += _val + + return distance_profile diff --git a/aeon/similarity_search/series/neighbors/tests/__init__.py b/aeon/similarity_search/series/neighbors/tests/__init__.py new file mode 100644 index 0000000000..00ef2e73ec --- /dev/null +++ b/aeon/similarity_search/series/neighbors/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for series neighbors search methods.""" diff --git a/aeon/similarity_search/series/neighbors/tests/test_dummy.py b/aeon/similarity_search/series/neighbors/tests/test_dummy.py new file mode 100644 index 0000000000..df8ff72655 --- /dev/null +++ b/aeon/similarity_search/series/neighbors/tests/test_dummy.py @@ -0,0 +1,40 @@ +""" +Tests for stomp algorithm. + +We do not test equality for returned indexes due to the unstable nature of argsort +and the fact that the "kind=stable" parameter is not yet supported in numba. We instead +test that the returned index match the expected distance value. 
+""" + +__maintainer__ = ["baraline"] + +import numpy as np +import pytest +from numpy.testing import assert_almost_equal + +from aeon.similarity_search.series.neighbors._brute_force import ( + _naive_squared_distance_profile, +) +from aeon.testing.data_generation import make_example_2d_numpy_series +from aeon.utils.numba.general import get_all_subsequences, z_normalise_series_2d + +NORMALIZE = [True, False] + + +@pytest.mark.parametrize("normalize", NORMALIZE) +def test__naive_squared_distance_profile(normalize): + """Test Euclidean distance with brute force.""" + L = 3 + X = make_example_2d_numpy_series(n_channels=1, n_timepoints=10) + Q = make_example_2d_numpy_series(n_channels=1, n_timepoints=L) + dist_profile = _naive_squared_distance_profile( + get_all_subsequences(X, L, 1), Q, normalize=normalize + ) + + if normalize: + Q = z_normalise_series_2d(Q) + for i_t in range(X.shape[1] - L + 1): + S = X[:, i_t : i_t + L] + if normalize: + S = z_normalise_series_2d(X[:, i_t : i_t + L]) + assert_almost_equal(dist_profile[i_t], np.sum((S - Q) ** 2)) diff --git a/aeon/similarity_search/series/neighbors/tests/test_mass.py b/aeon/similarity_search/series/neighbors/tests/test_mass.py new file mode 100644 index 0000000000..b6bf1953ea --- /dev/null +++ b/aeon/similarity_search/series/neighbors/tests/test_mass.py @@ -0,0 +1,44 @@ +"""Tests for MASS algorithm.""" + +__maintainer__ = ["baraline"] + +import numpy as np +from numpy.testing import assert_almost_equal + +from aeon.similarity_search.series._commons import fft_sliding_dot_product +from aeon.similarity_search.series.neighbors._mass import ( + _normalized_squared_distance_profile, + _squared_distance_profile, +) +from aeon.testing.data_generation import make_example_2d_numpy_series +from aeon.utils.numba.general import sliding_mean_std_one_series, z_normalise_series_2d + + +def test__squared_distance_profile(): + """Test squared distance profile.""" + L = 3 + X = make_example_2d_numpy_series(n_channels=1, 
n_timepoints=10) + Q = make_example_2d_numpy_series(n_channels=1, n_timepoints=L) + QX = fft_sliding_dot_product(X, Q) + dist_profile = _squared_distance_profile(QX, X, Q) + for i_t in range(X.shape[1] - L + 1): + assert_almost_equal(dist_profile[i_t], np.sum((X[:, i_t : i_t + L] - Q) ** 2)) + + +def test__normalized_squared_distance_profile(): + """Test Euclidean distance.""" + L = 3 + X = make_example_2d_numpy_series(n_channels=1, n_timepoints=10) + Q = make_example_2d_numpy_series(n_channels=1, n_timepoints=L) + QX = fft_sliding_dot_product(X, Q) + X_mean, X_std = sliding_mean_std_one_series(X, L, 1) + Q_mean = Q.mean(axis=1) + Q_std = Q.std(axis=1) + + dist_profile = _normalized_squared_distance_profile( + QX, X_mean, X_std, Q_mean, Q_std, L + ) + Q = z_normalise_series_2d(Q) + for i_t in range(X.shape[1] - L + 1): + S = z_normalise_series_2d(X[:, i_t : i_t + L]) + assert_almost_equal(dist_profile[i_t], np.sum((S - Q) ** 2)) diff --git a/aeon/similarity_search/series/tests/__init__.py b/aeon/similarity_search/series/tests/__init__.py new file mode 100644 index 0000000000..4762fe16ce --- /dev/null +++ b/aeon/similarity_search/series/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for base class and commons functions.""" diff --git a/aeon/similarity_search/subsequence_search/tests/test_base.py b/aeon/similarity_search/series/tests/test_base.py similarity index 76% rename from aeon/similarity_search/subsequence_search/tests/test_base.py rename to aeon/similarity_search/series/tests/test_base.py index e1a314f38a..d3dc953c6a 100644 --- a/aeon/similarity_search/subsequence_search/tests/test_base.py +++ b/aeon/similarity_search/series/tests/test_base.py @@ -1,10 +1,11 @@ """Test for subsequence search base class.""" +__maintainer__ = ["baraline"] + import pytest from aeon.testing.mock_estimators._mock_similarity_searchers import ( - MockMatrixProfile, - MockSubsequenceSearch, + MockSeriesSimilaritySearch, ) from aeon.testing.testing_data import ( make_example_1d_numpy, @@ 
-13,11 +14,11 @@ make_example_3d_numpy_list, ) -BASES = [MockMatrixProfile, MockSubsequenceSearch] +BASES = [MockSeriesSimilaritySearch] @pytest.mark.parametrize("base", BASES) -def test_input_shape_fit_neighbord_motifs(base): +def test_input_shape_fit_predict(base): """Test input shapes.""" estimator = base() # dummy data to pass to fit when testing predict/predict_proba @@ -29,31 +30,31 @@ def test_input_shape_fit_neighbord_motifs(base): X_2D_multi = make_example_2d_numpy_series(n_channels=2) X_1D = make_example_1d_numpy() - valid_inputs_fit = [X_3D_uni, X_3D_multi, X_3D_uni_list, X_3D_multi_list] + valid_inputs_fit = [X_2D_uni, X_2D_multi] # Valid inputs for _input in valid_inputs_fit: estimator.fit(_input) - invalid_inputs_fit = [X_2D_uni, X_2D_multi, X_1D] + invalid_inputs_fit = [X_1D, X_3D_multi_list, X_3D_uni_list, X_3D_multi, X_3D_uni] for _input in invalid_inputs_fit: with pytest.raises(TypeError): estimator.fit(_input) - valid_inputs_neighboord_motifs_uni = [X_2D_uni] - invalid_inputs_neighboord_motifs_uni = [ + valid_inputs_predict = [X_2D_uni, X_2D_multi] + invalid_inputs_predict_uni = [ X_1D, X_3D_uni, X_3D_uni_list, ] - invalid_inputs_neighboord_motifs_multi = [ + invalid_inputs_predict_multi = [ X_3D_multi, X_3D_multi_list, ] - L = 5 - estimator_multi = base(length=L).fit(X_3D_multi) - estimator_uni = base(length=L).fit(X_3D_uni) + L = 3 + estimator_multi = base(length=L).fit(X_2D_multi) + estimator_uni = base(length=L).fit(X_2D_uni) - for _input in valid_inputs_neighboord_motifs_uni: + for _input in valid_inputs_predict: estimator_uni.find_neighbors(_input[:, :L]) estimator_uni.find_motifs(_input) with pytest.raises(ValueError): @@ -65,7 +66,7 @@ def test_input_shape_fit_neighbord_motifs(base): with pytest.raises(ValueError): estimator_uni.find_neighbors(_input[:, : L + 2]) - for _input in invalid_inputs_neighboord_motifs_uni: + for _input in invalid_inputs_predict_uni: with pytest.raises(TypeError): estimator_uni.find_neighbors(_input) with 
pytest.raises(TypeError): @@ -75,7 +76,7 @@ def test_input_shape_fit_neighbord_motifs(base): with pytest.raises(TypeError): estimator_multi.find_motifs(_input) - for _input in invalid_inputs_neighboord_motifs_multi: + for _input in invalid_inputs_predict_multi: with pytest.raises(TypeError): estimator_uni.find_neighbors(_input) with pytest.raises(TypeError): diff --git a/aeon/similarity_search/series/tests/test_commons.py b/aeon/similarity_search/series/tests/test_commons.py new file mode 100644 index 0000000000..774eee8dee --- /dev/null +++ b/aeon/similarity_search/series/tests/test_commons.py @@ -0,0 +1,174 @@ +"""Test _commons.py functions.""" + +__maintainer__ = ["baraline"] +import numpy as np +import pytest +from numba.typed import List +from numpy.testing import assert_, assert_array_almost_equal + +from aeon.similarity_search.series._commons import ( + _extract_top_k_from_dist_profile, + _extract_top_k_motifs, + _extract_top_r_motifs, + _inverse_distance_profile, + _update_dot_products, + fft_sliding_dot_product, + get_ith_products, +) +from aeon.testing.data_generation import ( + make_example_1d_numpy, + make_example_2d_numpy_series, +) + +K_VALUES = [1, 3, np.inf] +THRESHOLDS = [np.inf, 0.7] +NN_MATCHES = [False, True] +EXCLUSION_SIZE = [3, 5] + + +def test_fft_sliding_dot_product(): + """Test the fft_sliding_dot_product function.""" + L = 4 + X = make_example_2d_numpy_series(n_channels=1, n_timepoints=10) + Q = make_example_2d_numpy_series(n_channels=1, n_timepoints=L) + + values = fft_sliding_dot_product(X, Q) + # Compare values[0] only as input is univariate + assert_array_almost_equal( + values[0], + [np.dot(Q[0], X[0, i : i + L]) for i in range(X.shape[1] - L + 1)], + ) + + +def test__update_dot_products(): + """Test the _update_dot_product function.""" + X = make_example_2d_numpy_series(n_channels=1, n_timepoints=20) + T = make_example_2d_numpy_series(n_channels=1, n_timepoints=10) + L = 7 + current_product = get_ith_products(X, T, L, 0) + for 
i_query in range(1, T.shape[1] - L + 1): + new_product = get_ith_products( + X, + T, + L, + i_query, + ) + current_product = _update_dot_products( + X, + T, + current_product, + L, + i_query, + ) + assert_array_almost_equal(new_product, current_product) + + +def test_get_ith_products(): + """Test i-th dot product of a subsequence of size L.""" + X = make_example_2d_numpy_series(n_channels=1, n_timepoints=10) + Q = make_example_2d_numpy_series(n_channels=1, n_timepoints=10) + L = 5 + + values = get_ith_products(X, Q, L, 0) + # Compare values[0] only as input is univariate + assert_array_almost_equal( + values[0], + [np.dot(Q[0, 0:L], X[0, i : i + L]) for i in range(X.shape[1] - L + 1)], + ) + + values = get_ith_products(X, Q, L, 4) + # Compare values[0] only as input is univariate + assert_array_almost_equal( + values[0], + [np.dot(Q[0, 4 : 4 + L], X[0, i : i + L]) for i in range(X.shape[1] - L + 1)], + ) + + +def test__inverse_distance_profile(): + """Test method to inverse a TypedList of distance profiles.""" + X = make_example_1d_numpy() + X_inv = _inverse_distance_profile(X) + assert_array_almost_equal(1 / (X + 1e-8), X_inv) + + +def test__extract_top_k_motifs(): + """Test motif extraction based on max distance.""" + MP = List( + [ + [1.0, 2.0], + [1.0, 4.0], + [0.5, 0.9], + [0.6, 0.7], + ] + ) + IP = List( + [ + [1, 2], + [1, 4], + [0, 3], + [0, 7], + ] + ) + MP_k, IP_k = _extract_top_k_motifs(MP, IP, 2) + assert_(len(MP_k) == 2) + assert_(MP_k[0] == [0.6, 0.7]) + assert_(IP_k[0] == [0, 7]) + assert_(MP_k[1] == [0.5, 0.9]) + assert_(IP_k[1] == [0, 3]) + + +def test__extract_top_r_motifs(): + """Test motif extraction based on motif set cardinality.""" + MP = List( + [ + [1.0, 1.5, 2.0, 1.5], + [1.0, 4.0], + [0.5, 0.9, 1.0], + [0.6, 0.7], + ] + ) + IP = List( + [ + [1, 2, 3, 4], + [1, 4], + [0, 3, 6], + [0, 7], + ] + ) + MP_k, IP_k = _extract_top_r_motifs(MP, IP, 2) + assert_(len(MP_k) == 2) + assert_(MP_k[0] == [1.0, 1.5, 2.0, 1.5]) + assert_(IP_k[0] == [1, 2, 
3, 4]) + assert_(MP_k[1] == [0.5, 0.9, 1.0]) + assert_(IP_k[1] == [0, 3, 6]) + + +@pytest.mark.parametrize("k", K_VALUES) +@pytest.mark.parametrize("threshold", THRESHOLDS) +@pytest.mark.parametrize("allow_nn_matches", NN_MATCHES) +@pytest.mark.parametrize("exclusion_size", EXCLUSION_SIZE) +def test__extract_top_k_from_dist_profile( + k, threshold, allow_nn_matches, exclusion_size +): + """Test method to esxtract the top k candidates from a list of distance profiles.""" + X = make_example_1d_numpy(n_timepoints=30) + X_sort = np.argsort(X) + exclusion_size = 3 + top_k_indexes, top_k_distances = _extract_top_k_from_dist_profile( + X, k, threshold, allow_nn_matches, exclusion_size + ) + + if len(top_k_indexes) == 0 or len(top_k_distances) == 0: + raise AssertionError("_extract_top_k_from_dist_profile returned empty list") + for i, index in enumerate(top_k_indexes): + assert_(X[index] == top_k_distances[i]) + + assert_(np.all(top_k_distances <= threshold)) + + if allow_nn_matches: + assert_(np.all(top_k_distances <= X_sort[len(top_k_indexes) - 1])) + + if not allow_nn_matches: + same_X = np.sort(top_k_indexes) + if len(same_X) > 1: + assert_(np.all(np.diff(same_X) >= exclusion_size)) diff --git a/aeon/similarity_search/series_search/__init__.py b/aeon/similarity_search/series_search/__init__.py deleted file mode 100644 index 9a618540db..0000000000 --- a/aeon/similarity_search/series_search/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Series search module.""" - -__all__ = ["BaseSeriesSearch", "BaseIndexSearch"] - -from aeon.similarity_search.series_search._base import BaseIndexSearch, BaseSeriesSearch diff --git a/aeon/similarity_search/series_search/_base.py b/aeon/similarity_search/series_search/_base.py deleted file mode 100644 index f9c6ed8097..0000000000 --- a/aeon/similarity_search/series_search/_base.py +++ /dev/null @@ -1,264 +0,0 @@ -"""Base class for whole series search.""" - -__maintainer__ = ["baraline"] - -import warnings -from abc import abstractmethod 
-from typing import Optional, final - -import numpy as np -from numba import get_num_threads, set_num_threads - -from aeon.similarity_search._base import BaseSimilaritySearch -from aeon.utils.numba.general import compute_mean_stds_collection_parallel - - -class BaseSeriesSearch(BaseSimilaritySearch): - """ - Base class for similarity search on whole time series. - - Parameters - ---------- - normalise : bool, optional - Whether the inputs should be z-normalised. The default is False. - n_jobs : int, optional - Number of parallel jobs to use. The default is 1. - """ - - @final - def find_motifs( - self, - X: np.ndarray, - k: Optional[int] = 1, - threshold: Optional[float] = np.inf, - X_index: Optional[int] = None, - inverse_distance: Optional[bool] = False, - ): - """ - Find the top-k motifs in the training data. - - Given ``k`` and ``threshold`` parameters, this methods returns the top-k motif - sets. We define a motif set as a set of candidates which all are at a distance - of at most ``threshold`` from each other. The top-k motifs sets are the - motif sets with the most candidates. - - Parameters - ---------- - X : np.ndarray, 2D array of shape (n_channels, n_timestamps) - A series in which we want to indentify motifs. - k : int, optional - Number of motifs to return - threshold : int, optional - A threshold on the similarity measure to determine which candidates will be - part of a motif set. - X_index : Optional[int], optional - If ``X`` is a series of the database given in fit, specify its index in - ``X_``. If specified, this series won't be able to match with itself. - inverse_distance : bool, optional - Wheter to inverse the computed distance, meaning that the method will return - the anomalies instead of motifs. - - Returns - ------- - ndarray, shape=(k,) - A numpy array of at most ``k`` elements containing the indexes of the - motifs in X. 
- ndarray, shape=(k,) - A numpy array of at most ``k`` elements containing the distances of the - motifs macthes to the motif in X. - - """ - self._check_is_fitted() - if X is not None: - self._check_find_neighbors_motif_format(X) - prev_threads = get_num_threads() - X_index = self._check_X_index_int(X_index) - motifs_indexes, distances = self._find_motifs( - X, - k=k, - threshold=threshold, - inverse_distance=inverse_distance, - X_index=X_index, - ) - set_num_threads(prev_threads) - return motifs_indexes, distances - - @final - def find_neighbors( - self, - X: np.ndarray, - k: Optional[int] = 1, - threshold: Optional[float] = np.inf, - X_index: Optional[int] = None, - inverse_distance: Optional[bool] = False, - ): - """ - Find the top-k neighbors of X in the database. - - Given ``k`` and ``threshold`` parameters, this methods returns the top-k - neighbors of X, such as each of the ``k`` neighbors as a distance inferior or - equal to ``threshold``. By default, ``threshold`` is set to infinity. It is - possible for this method to return less than ``k`` neighbors, either if there - is less than ``k`` admissible candidate in the database, or if in the top-k - candidates, some do not meet the ``threshold`` condition. - - Parameters - ---------- - X : np.ndarray, 2D array of shape (n_channels, length) - The subsequence for which we want to identify nearest neighbors in the - database. - k : int, optional - Number of neighbors to return. - threshold : int, optional - A threshold on the distance to determine which candidates will be returned. - X_index : Optional[int], optional - If ``X`` is a series of the database given in fit, specify its index in - ``X_``. If specified, this series won't be able to match with itself. - inverse_distance : bool, optional - Wheter to inverse the computed distance, meaning that the method will return - the k most dissimilar neighbors instead of the k most similar. 
- - - Returns - ------- - ndarray, shape=(k,) - A numpy array of at most ``k`` elements containing the indexes of the - neighbors. - ndarray, shape=(k,) - A numpy array of at most ``k`` elements containing the distances of the - neighbors to X. - - """ - self._check_is_fitted() - - self._check_find_neighbors_motif_format(X) - - X_index = self._check_X_index_int(X_index) - prev_threads = get_num_threads() - set_num_threads(self._n_jobs) - neighbors, distances = self._find_neighbors( - X, - k=k, - threshold=threshold, - inverse_distance=inverse_distance, - X_index=X_index, - ) - set_num_threads(prev_threads) - if len(neighbors) < k: - warnings.warn( - f"The number of admissible neighbors found is {len(neighbors)}, instead" - f" of {k}", - stacklevel=2, - ) - return neighbors, distances - - def _compute_mean_std_from_collection(self, X: np.ndarray): - """ - Compute the mean and std of each channel for all series in X. - - Parameters - ---------- - X : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) - Collection of series from which we extract mean and stds. If it is an - unequal length collection, it should be a list of 2d numpy arrays. - - Returns - ------- - Tuple(np.ndarray, np.ndarray) - Both array are of shape (n_cases, n_channels), the first contains the means - and the second the stds for each series in X. - - """ - means, stds = compute_mean_stds_collection_parallel(X) - return means, stds - - def _fit(self, X, y=None): - if self.normalise: - self.X_means_, self.X_stds_ = self._compute_mean_std_from_collection(X) - self.X_ = X - return self - - @abstractmethod - def _find_motifs( - self, - X: np.ndarray, - k: Optional[int] = 1, - threshold: Optional[float] = np.inf, - X_index: Optional[int] = None, - inverse_distance: Optional[bool] = False, - ): ... 
- - @abstractmethod - def _find_neighbors( - self, - X: np.ndarray, - k: Optional[int] = 1, - threshold: Optional[float] = np.inf, - inverse_distance: Optional[bool] = False, - X_index=None, - ): ... - - -# TODO : Add an update method to add series to the index -class BaseIndexSearch(BaseSeriesSearch): - """ - Base class for similarity search on whole time series using indexes. - - Parameters - ---------- - normalise : bool, optional - Whether the inputs should be z-normalised. The default is False. - n_jobs : int, optional - Number of parallel jobs to use. The default is 1. - """ - - def _fit(self, X, y=None): - super()._fit(X) - self._build_index() - return self - - @abstractmethod - def _build_index(self): ... - - @abstractmethod - def _query_index( - self, - X, - k=1, - inverse_distance=False, - threshold=np.inf, - ): ... - - @abstractmethod - def _get_bucket_sizes(self): ... - - @abstractmethod - def _get_bucket_content(self, key): ... - - def _find_motifs( - self, - X: np.ndarray, - k: Optional[int] = 1, - threshold: Optional[float] = np.inf, - X_index: Optional[int] = None, - inverse_distance: Optional[bool] = False, - ): - bucket_sizes = self._get_bucket_sizes() - idx_motifs = np.argsort(list(bucket_sizes.values()))[::-1][:, k] - # TODO : review distance return on motif for whole series and buckets - return [self._get_bucket_content(idx_motif) for idx_motif in idx_motifs], [ - 0 for _ in idx_motifs - ] - - def _find_neighbors( - self, - X: np.ndarray, - k: Optional[int] = 1, - threshold: Optional[float] = np.inf, - inverse_distance: Optional[bool] = False, - X_index=None, - ): - top_k, top_k_dist = self._query_index( - X, k=k, inverse_distance=inverse_distance, threshold=threshold - ) - return top_k, top_k_dist diff --git a/aeon/similarity_search/subsequence_search/__init__.py b/aeon/similarity_search/subsequence_search/__init__.py deleted file mode 100644 index eb062c46b8..0000000000 --- a/aeon/similarity_search/subsequence_search/__init__.py +++ /dev/null 
@@ -1,17 +0,0 @@ -"""Subsequence search module.""" - -__all__ = [ - "BaseSubsequenceSearch", - "BaseMatrixProfile", - "StompMatrixProfile", - "BruteForceMatrixProfile", -] - -from aeon.similarity_search.subsequence_search._base import ( - BaseMatrixProfile, - BaseSubsequenceSearch, -) -from aeon.similarity_search.subsequence_search._brute_force import ( - BruteForceMatrixProfile, -) -from aeon.similarity_search.subsequence_search._stomp import StompMatrixProfile diff --git a/aeon/similarity_search/subsequence_search/_base.py b/aeon/similarity_search/subsequence_search/_base.py deleted file mode 100644 index a8d78b029d..0000000000 --- a/aeon/similarity_search/subsequence_search/_base.py +++ /dev/null @@ -1,509 +0,0 @@ -"""Base class for subsequence search.""" - -__maintainer__ = ["baraline"] - -import warnings -from abc import abstractmethod -from typing import Optional, final - -import numpy as np -from numba import get_num_threads, set_num_threads -from numba.typed import List - -from aeon.similarity_search._base import BaseSimilaritySearch -from aeon.similarity_search.subsequence_search._commons import ( - _extract_top_k_from_dist_profile, - _inverse_distance_profile_list, -) -from aeon.utils.numba.general import sliding_mean_std_one_series - -# We can define a BaseVariableLengthSubsequenceSearch later for VALMOD and the likes. - -# BaseSubSeries 'replace sub by series' - - -class BaseSubsequenceSearch(BaseSimilaritySearch): - """ - Base class for similarity search on time series subsequences. - - Parameters - ---------- - length : int - The length of the subsequence to be considered. - normalise : bool, optional - Whether the inputs should be z-normalised. The default is False. - n_jobs : int, optional - Number of parallel jobs to use. The default is 1. 
- """ - - @abstractmethod - def __init__( - self, - length: int, - normalise: Optional[bool] = False, - n_jobs: Optional[int] = 1, - ): - self.length = length - super().__init__(n_jobs=n_jobs, normalise=normalise) - - _tags = { - "capability:unequal_length": True, - } - - @final - def find_motifs( - self, - X: np.ndarray, - k: Optional[int] = 1, - threshold: Optional[float] = np.inf, - X_index: Optional[int] = None, - inverse_distance: Optional[bool] = False, - allow_neighboring_matches: Optional[bool] = False, - exclusion_factor: Optional[float] = 2.0, - ): - """ - Find the top-k motifs in the training data. - - Given ``k`` and ``threshold`` parameters, this methods returns the top-k motif - sets. We define a motif set as a set of candidates which all are at a distance - of at most ``threshold`` from each other. The top-k motifs sets are the - motif sets with the most candidates. - - Parameters - ---------- - X : np.ndarray, 2D array of shape (n_channels, n_timestamps) - A series in which we want to indentify motifs. - k : int, optional - Number of motifs to return - threshold : int, optional - A threshold on the similarity measure to determine which candidates will be - part of a motif set. - X_index : Optional[int], optional - If ``X`` is a series of the database given in fit, specify its index in - ``X_``. If specified, each query of this series won't be able to match with - its neighboring subsequences. - inverse_distance : bool, optional - Wheter to inverse the computed distance, meaning that the method will return - the anomalies instead of motifs. - allow_neighboring_matches: bool, optional - Wheter a candidate can be part of multiple motif sets (True), or if motif - sets should be mutually exclusive (False). - exclusion_factor : float, default=2. - A factor of the query length used to define the exclusion zone when - ``allow_neighboring_matches`` is set to False. 
For a given timestamp, - the exclusion zone starts from - :math:`id_timestamp - query_length//exclusion_factor` and end at - :math:`id_timestamp + query_length//exclusion_factor`. - - Returns - ------- - ndarray, shape=(k,) - A numpy array of at most ``k`` elements containing the indexes of the - motifs in X. - ndarray, shape=(k,) - A numpy array of at most ``k`` elements containing the distances of the - motifs macthes to the motif in X. - - """ - self._check_is_fitted() - if X is not None: - self._check_find_neighbors_motif_format(X) - prev_threads = get_num_threads() - X_index = self._check_X_index_int(X_index) - motifs_indexes, distances = self._find_motifs( - X, - k=k, - threshold=threshold, - exclusion_factor=exclusion_factor, - inverse_distance=inverse_distance, - allow_neighboring_matches=allow_neighboring_matches, - X_index=X_index, - ) - set_num_threads(prev_threads) - return motifs_indexes, distances - - @final - def find_neighbors( - self, - X: np.ndarray, - k: Optional[int] = 1, - threshold: Optional[float] = np.inf, - inverse_distance: Optional[bool] = False, - X_index: Optional[np.ndarray] = None, - allow_neighboring_matches: Optional[bool] = False, - exclusion_factor: Optional[float] = 2.0, - ): - """ - Find the top-k neighbors of X in the database. - - Given ``k`` and ``threshold`` parameters, this methods returns the top-k - neighbors of X, such as each of the ``k`` neighbors as a distance inferior or - equal to ``threshold``. By default, ``threshold`` is set to infinity. It is - possible for this method to return less than ``k`` neighbors, either if there - is less than ``k`` admissible candidate in the database, or if in the top-k - candidates, some do not meet the ``threshold`` condition. - - Parameters - ---------- - X : np.ndarray, 2D array of shape (n_channels, length) - The subsequence for which we want to identify nearest neighbors in the - database. - k : int, optional - Number of neighbors to return. 
- threshold : int, optional - A threshold on the distance to determine which candidates will be returned. - inverse_distance : bool, optional - Wheter to inverse the computed distance, meaning that the method will return - the k most dissimilar neighbors instead of the k most similar. - X_index : np.ndarray, shape=(2,), optional - If ``X`` is a subsequence of the database given in fit, specify its starting - index as (i_case, i_timestamp). If specified, this subsequence and the - neighboring ones (according to ``exclusion_factor``) won't be considered as - admissible candidates. - allow_neighboring_matches: bool, optional - Wheter the top-k candidates can be neighboring subsequences. - exclusion_factor : float, default=2. - A factor of the query length used to define the exclusion zone when - ``allow_neighboring_matches`` is set to False. For a given timestamp, - the exclusion zone starts from - :math:`id_timestamp - query_length//exclusion_factor` and end at - :math:`id_timestamp + query_length//exclusion_factor`. - - Returns - ------- - ndarray, shape=(k,) - A numpy array of at most ``k`` elements containing the indexes of the - neighbors. - ndarray, shape=(k,) - A numpy array of at most ``k`` elements containing the distances of the - neighbors to X. - - """ - self._check_is_fitted() - - self._check_find_neighbors_motif_format(X) - if self.length != X.shape[1]: - raise ValueError( - f"Expected X to be of shape {(self.n_channels_, self.length)} but" - f" got {X.shape} in find_neighbors." 
- ) - - X_index = self._check_X_index_array(X_index) - prev_threads = get_num_threads() - set_num_threads(self._n_jobs) - neighbors, distances = self._find_neighbors( - X, - k=k, - threshold=threshold, - inverse_distance=inverse_distance, - X_index=X_index, - allow_neighboring_matches=allow_neighboring_matches, - exclusion_factor=exclusion_factor, - ) - set_num_threads(prev_threads) - if len(neighbors) < k: - warnings.warn( - f"The number of admissible neighbors found is {len(neighbors)}, instead" - f" of {k}", - stacklevel=2, - ) - return neighbors, distances - - def _check_X_index_int(self, X_index: int): - """ - Check wheter the X_index parameter is correctly formated and is admissible. - - This check is made for motif search functions. - - Parameters - ---------- - X_index : int - Index of a series in X_. - - Returns - ------- - X_index : int - Index of a series in X_ - - """ - if X_index is not None: - if not isinstance(X_index, int): - raise TypeError("Expected an integer for X_index but got {X_index}") - - if X_index >= self.n_cases_ or X_index < 0: - raise ValueError( - "The value of X_index cannot exced the number " - "of series in the collection given during fit. Expected a value " - f"between [0, {self.n_cases_ - 1}] but got {X_index}" - ) - return X_index - - def _check_X_index_array(self, X_index: np.ndarray): - """ - Check wheter the X_index parameter is correctly formated and is admissible. - - This check is made for neighbour search functions. - - Parameters - ---------- - X_index : np.ndarray, 1D array of shape (2) - Array of integer containing the sample and timestamp identifiers of the - starting point of a subsequence in X_. - - Returns - ------- - X_index : np.ndarray, 1D array of shape (2) - Array of integer containing the sample and timestamp identifiers of the - starting point of a subsequence in X_. 
- - """ - if X_index is not None: - if ( - isinstance(X_index, list) - and len(X_index) == 2 - and isinstance(X_index[0], int) - and isinstance(X_index[1], int) - ): - X_index = np.asarray(X_index, dtype=int) - elif len(X_index) != 2: - raise TypeError( - "Expected a numpy array or list of integers with 2 elements " - f"for X_index but got {X_index}" - ) - elif ( - not (isinstance(X_index[0], int) or not isinstance(X_index[1], int)) - or X_index.dtype != int - ): - raise TypeError( - "Expected a numpy array or list of integers for X_index but got " - f"{X_index}" - ) - - if X_index[0] >= self.n_cases_ or X_index[0] < 0: - raise ValueError( - "The sample ID (first element) of X_index cannot exced the number " - "of series in the collection given during fit. Expected a value " - f"between [0, {self.n_cases_ - 1}] but got {X_index[0]}" - ) - _max_timestamp = self.X_[X_index[0]].shape[1] - self.length + 1 - if X_index[1] >= _max_timestamp: - raise ValueError( - "The timestamp ID (second element) of X_index cannot exced the " - "number of timestamps minus the length parameter plus one. Expected" - f" a value between [0, {_max_timestamp - 1}] but got {X_index[1]}" - ) - return X_index - - def _compute_mean_std_from_collection(self, X: np.ndarray): - """ - Compute the mean and std of each subsequence of size ``length`` in X. - - Parameters - ---------- - X : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) - Collection of series from which we extract mean and stds. If it is an - unequal length collection, it should be a list of 2d numpy arrays. - - Returns - ------- - Tuple(np.ndarray, np.ndarray) - Both array are of shape (n_cases, n_timepoints-length+1, n_channels), - the first contains the means and the second the stds for each subsequence - of size ``length`` in X. 
- - """ - means = [] - stds = [] - - for i_x in range(len(X)): - _mean, _std = sliding_mean_std_one_series(X[i_x], self.length, 1) - stds.append(_std) - means.append(_mean) - - if self.metadata_["unequal_length"]: - return List(means), List(stds) - else: - return np.asarray(means), np.asarray(stds) - - def _fit(self, X, y=None): - if self.length >= self.min_timepoints_ or self.length < 1: - raise ValueError( - "The length of the query should be inferior or equal to the length of " - "data (X_) provided during fit, but got {} for X and {} for X_".format( - self.length, self.min_timepoints_ - ) - ) - - if self.normalise: - self.X_means_, self.X_stds_ = self._compute_mean_std_from_collection(X) - self.X_ = X - return self - - @abstractmethod - def _find_motifs( - self, - X: np.ndarray, - k: Optional[int] = 1, - threshold: Optional[float] = np.inf, - X_index: Optional[int] = None, - inverse_distance: Optional[bool] = False, - allow_neighboring_matches: Optional[bool] = False, - exclusion_factor: Optional[float] = 2.0, - ): ... - - @abstractmethod - def _find_neighbors( - self, - X: np.ndarray, - k: Optional[int] = 1, - threshold: Optional[float] = np.inf, - inverse_distance: Optional[bool] = False, - X_index=None, - allow_neighboring_matches: Optional[bool] = False, - exclusion_factor: Optional[float] = 2.0, - ): ... - - @classmethod - def _get_test_params(cls, parameter_set: str = "default") -> dict: - """Return testing parameter settings for the estimator. - - Parameters - ---------- - parameter_set : str, default="default" - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - For classifiers, a "default" set of parameters should be provided for - general testing, and a "results_comparison" set for comparing against - previously recorded results if the general set does not produce suitable - probabilities to compare against. 
- - Returns - ------- - params : dict or list of dict, default={} - Parameters to create testing instances of the class. - Each dict are parameters to construct an "interesting" test instance, i.e., - `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. - """ - return {"length": 3} - - -class BaseMatrixProfile(BaseSubsequenceSearch): - """Base class for Matrix Profile methods using a length parameter.""" - - def _find_motifs( - self, - X: np.ndarray, - k: Optional[int] = 1, - threshold: Optional[float] = np.inf, - X_index: Optional[int] = None, - inverse_distance: Optional[bool] = False, - allow_neighboring_matches: Optional[bool] = False, - exclusion_factor: Optional[float] = 2.0, - ): - exclusion_size = self.length // exclusion_factor - - MP, IP = self.compute_matrix_profile( - k, - threshold, - exclusion_size, - inverse_distance, - allow_neighboring_matches, - X=X, - X_index=X_index, - ) - # TODO check motif extraction logic, sure its not this one - MP_avg = np.array([[np.mean(MP[i]) for i in range(len(MP))]]) - # TODO: appening IP of identified motifs to return to get motifs matches in X_ - return _extract_top_k_from_dist_profile( - MP_avg, - k, - threshold, - allow_neighboring_matches, - exclusion_size, - ) - - def _find_neighbors( - self, - X: np.ndarray, - k: Optional[int] = 1, - threshold: Optional[float] = np.inf, - inverse_distance: Optional[bool] = False, - X_index=None, - allow_neighboring_matches: Optional[bool] = False, - exclusion_factor: Optional[float] = 2.0, - ): - """ - Find the top-k neighbors of X in the database. - - Given ``k`` and ``threshold`` parameters, this methods returns the top-k - neighbors of X, such as each of the ``k`` neighbors as a distance inferior or - equal to ``threshold``. By default, ``threshold`` is set to infinity. 
It is - possible for this method to return less than ``k`` neighbors, either if there - is less than ``k`` admissible candidate in the database, or if in the top-k - candidates, some do not meet the ``threshold`` condition. - - Parameters - ---------- - X : np.ndarray, 2D array of shape (n_channels, length) - The subsequence for which we want to identify nearest neighbors in the - database. - k : int, optional - Number of neighbors to return. - threshold : int, optional - A threshold on the distance to determine which candidates will be returned. - inverse_distance : bool, optional - Wheter to inverse the computed distance, meaning that the method will return - the k most dissimilar neighbors instead of the k most similar. - X_index : np.ndarray, shape=(2,), optional - If ``X`` is a subsequence of the database given in fit, specify its starting - index as (i_case, i_timestamp). If specified, this subsequence and the - neighboring ones (according to ``exclusion_factor``) won't be considered as - admissible candidates. - allow_neighboring_matches: bool, optional - Wheter the top-k candidates can be neighboring subsequences. - exclusion_factor : float, default=2. - A factor of the query length used to define the exclusion zone when - ``allow_neighboring_matches`` is set to False. For a given timestamp, the - exclusion zone starts from - :math:`id_timestamp - query_length//exclusion_factor` and end at - :math:`id_timestamp + query_length//exclusion_factor`. 
- """ - exclusion_size = self.length // exclusion_factor - dist_profiles = self.compute_distance_profile(X) - - if inverse_distance: - dist_profiles = _inverse_distance_profile_list(dist_profiles) - - # Deal with self-matches - if X_index is not None: - _max_timestamp = self.X_[X_index[0]].shape[1] - self.length - ub = min(X_index[1] + exclusion_size, _max_timestamp) - lb = max(0, X_index[1] - exclusion_size) - dist_profiles[X_index[0]][lb:ub] = np.inf - - return _extract_top_k_from_dist_profile( - dist_profiles, - k, - threshold, - allow_neighboring_matches, - exclusion_size, - ) - - @abstractmethod - def compute_matrix_profile( - self, - X: np.ndarray, - k: int, - threshold: float, - exclusion_size: int, - inverse_distance: bool, - allow_neighboring_matches: bool, - X_index: Optional[int] = None, - ): - """Compute matrix profiles between X_ and X or between all series in X_.""" - ... - - @abstractmethod - def compute_distance_profile(self, X: np.ndarray): - """Compute distrance profiles between X_ and X (a series of size length).""" - ... 
diff --git a/aeon/similarity_search/subsequence_search/_brute_force.py b/aeon/similarity_search/subsequence_search/_brute_force.py deleted file mode 100644 index 269cdc369b..0000000000 --- a/aeon/similarity_search/subsequence_search/_brute_force.py +++ /dev/null @@ -1,319 +0,0 @@ -"""Implementation of matrix profile with brute force.""" - -from typing import Optional - -__maintainer__ = ["baraline"] - - -import numpy as np -from numba import njit, prange -from numba.typed import List - -from aeon.similarity_search.subsequence_search._base import BaseMatrixProfile -from aeon.similarity_search.subsequence_search._commons import ( - _extract_top_k_from_dist_profile, - _inverse_distance_profile_list, -) -from aeon.utils.numba.general import ( - get_all_subsequences, - z_normalise_series_2d, - z_normalise_series_3d, -) - -# TODO : check function params and make docstrings -# TODO : make tests - - -class BruteForceMatrixProfile(BaseMatrixProfile): - """Estimator to compute matrix profile and distance profile using brute force.""" - - def __init__( - self, - length: int, - normalise: Optional[bool] = False, - n_jobs: Optional[int] = 1, - ): - super().__init__(length=length, n_jobs=n_jobs, normalise=normalise) - - def compute_matrix_profile( - self, - k, - threshold, - exclusion_size, - inverse_distance, - allow_neighboring_matches, - X: Optional[np.ndarray] = None, - X_index: Optional[int] = None, - ): - """ - Compute matrix profiles. - - The matrix profiles are computed on the collection given in fit. If ``X`` is - not given, computes the matrix profile of each series in the collection. If it - is given, only computes it for ``X``. - - Parameters - ---------- - k : int - The number of best matches to return during predict for each subsequence. - threshold : float - The number of best matches to return during predict for each subsequence. 
- inverse_distance : bool - If True, the matching will be made on the inverse of the distance, and thus, - the worst matches to the query will be returned instead of the best ones. - exclusion_size : int - The size of the exclusion zone used to prevent returning as top k candidates - the ones that are close to each other (for example i and i+1). - It is used to define a region between - :math:`id_timestomp - exclusion_size` and - :math:`id_timestomp + exclusion_size` which cannot be returned - as best match if :math:`id_timestomp` was already selected. By default, - the value None means that this is not used. - X : Optional[np.ndarray], optional - The time series on which the matrix profile will be compute. - The default is None, meaning that the series in the collection given in fit - will be used instead. - X_index : Optional[int], optional - If ``X`` is a series of the database given in fit, specify its index in - ``X_``. If specified, each query of this series won't be able to match with - its neighboring subsequences. - - Returns - ------- - MP : array of shape (series_length - L + 1,) - Matrix profile distances for each query subsequence. If X is none, this - will be a list of MP for each series in X_. - IP : array of shape (series_length - L + 1,) - Indexes of the top matches for each query subsequence. If X is none, this - will be a list of MP for each series in X_. 
- """ - # pairwise if none - if X is None: - MP = [] - IP = [] - for i in range(len(self.X_)): - _MP, _IP = self.compute_matrix_profile( - k, - threshold, - exclusion_size, - inverse_distance, - X=self.X_[i], - X_index=i, - ) - MP.append(_MP) - IP.append(_IP) - else: - MP, IP = _naive_squared_matrix_profile( - self.X_, - X, - self.length, - X_index, - k, - threshold, - allow_neighboring_matches, - exclusion_size, - inverse_distance, - normalise=self.normalise, - ) - - return MP, IP - - def compute_distance_profile(self, X: np.ndarray): - """ - Compute the distance profile of X to all samples in X_. - - Parameters - ---------- - X : np.ndarray, 2D array of shape (n_channels, length) - The query to use to compute the distance profiles. - - Returns - ------- - distance_profiles : np.ndarray, 2D array of shape (n_cases, n_candidates) - The distance profile of X to all samples in X_. The ``n_candidates`` value - is equal to ``n_timepoins - length + 1``. If X_ is an unequal length - collection, returns a numba typed list instead of an ndarray. - - """ - distance_profiles = _naive_squared_distance_profile( - self.X_, X, normalise=self.normalise - ) - - if not self.metadata_["unequal_length"]: - distance_profiles = np.asarray(distance_profiles) - return distance_profiles - - -@njit(cache=True, fastmath=True) -def _compute_dist_profile(X_subs, q): - """ - Compute the distance profile between subsequences and a query. - - Parameters - ---------- - X_subs : array, shape=(n_samples, n_channels, query_length) - Input subsequences extracted from a time series. - q : array, shape=(n_channels, query_length) - Query used for the distance computation - - Returns - ------- - dist_profile : np.ndarray, 1D array of shape (n_samples) - The distance between the query all subsequences. 
- - """ - n_candidates, n_channels, q_length = X_subs.shape - dist_profile = np.zeros(n_candidates) - for i in range(n_candidates): - for j in range(n_channels): - for k in range(q_length): - dist_profile[i] += (X_subs[i, j, k] - q[j, k]) ** 2 - return dist_profile - - -@njit(cache=True, fastmath=True, parallel=True) -def _naive_squared_distance_profile( - X, - Q, - normalise=False, -): - """ - Compute a squared euclidean distance profile. - - Parameters - ---------- - X : array, shape=(n_samples, n_channels, n_timepoints) - Input time series dataset to search in. - Q : array, shape=(n_channels, query_length) - Query used during the search. - normalise : bool - Wheter to use a z-normalised distance. - - Returns - ------- - out : np.ndarray, 1D array of shape (n_samples, n_timepoints_t - query_length + 1) - The distance between the query and all candidates in X. - - """ - query_length = Q.shape[1] - dist_profiles = List() - # Init distance profile array with unequal length support - for i in range(len(X)): - dist_profiles.append(np.zeros(X[i].shape[1] - query_length + 1)) - if normalise: - Q = z_normalise_series_2d(Q) - else: - Q = Q.astype(np.float64) - - for _i in prange(len(X)): - # cast uint64 due to parallel prange - i = np.int64(_i) - X_subs = get_all_subsequences(X[i], query_length, 1) - if normalise: - X_subs = z_normalise_series_3d(X_subs) - - dist_profile = _compute_dist_profile(X_subs, Q) - dist_profiles[i] = dist_profile - return dist_profiles - - -@njit(cache=True, fastmath=True, parallel=True) -def _naive_squared_matrix_profile( - X, - T, - L, - T_index, - k, - threshold, - allow_neighboring_matches, - exclusion_size, - inverse_distance, - normalise=False, -): - """ - Compute a squared euclidean matrix profile. - - Parameters - ---------- - X: np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) - The input samples. 
If X is an unquel length collection, expect a TypedList - of 2D arrays of shape (n_channels, n_timepoints) - T : np.ndarray, 2D array of shape (n_channels, series_length) - The series used for similarity search. Note that series_length can be equal, - superior or inferior to n_timepoints, it doesn't matter. - L : int - Length of the subsequences used for the distance computation. - T_index : int, - If ``T`` is a series of ``X``, specify its index - in ``X``. If specified, each query of this series won't be able to - match with its neighboring subsequences. - k : int - The number of best matches to return during predict for each subsequence. - threshold : float - The number of best matches to return during predict for each subsequence. - allow_neighboring_matches : bool - Wheter the top-k candidates can be neighboring subsequences. - exclusion_size : int - The size of the exclusion zone used to prevent returning as top k candidates - the ones that are close to each other (for example i and i+1). - It is used to define a region between - :math:`id_timestomp - exclusion_size` and - :math:`id_timestomp + exclusion_size` which cannot be returned - as best match if :math:`id_timestomp` was already selected. By default, - the value None means that this is not used. - inverse_distance : bool - If True, the matching will be made on the inverse of the distance, and thus, the - worst matches to the query will be returned instead of the best ones. - normalise : bool - Wheter to use a z-normalised distance. - - Returns - ------- - out : np.ndarray, 1D array of shape (n_timepoints_t - query_length + 1) - The minimum distance between each query in T and all candidates in X. 
- """ - n_queries = T.shape[1] - L + 1 - MP = List() - IP = List() - - # Init List to allow parallel, we'll re-use it for all dist profiles - dist_profiles = List() - for i_x in range(len(X)): - dist_profiles.append(np.zeros(X[i_x].shape[1] - L + 1)) - - X_subs = List() - for i in range(len(X)): - i_subs = get_all_subsequences(X[i], L, 1) - if normalise: - i_subs = z_normalise_series_3d(i_subs) - X_subs.append(i_subs) - - for i_q in range(n_queries): - Q = T[:, i_q : i_q + L] - if normalise: - Q = z_normalise_series_2d(Q) - for i_x in prange(len(X)): - dist_profiles[i_x][0 : X[i_x].shape[1] - L + 1] = _compute_dist_profile( - X_subs[i_x], Q - ) - - if T_index is not None: - _max_timestamp = X[T_index].shape[1] - L - ub = min(i_q + exclusion_size, _max_timestamp) - lb = max(0, i_q - exclusion_size) - dist_profiles[T_index][lb:ub] = np.inf - - if inverse_distance: - dist_profiles = _inverse_distance_profile_list(dist_profiles) - - top_indexes, top_dists = _extract_top_k_from_dist_profile( - dist_profiles, - k, - threshold, - allow_neighboring_matches, - exclusion_size, - ) - - MP.append(top_dists) - IP.append(top_indexes) - return MP, IP diff --git a/aeon/similarity_search/subsequence_search/_commons.py b/aeon/similarity_search/subsequence_search/_commons.py deleted file mode 100644 index c13a7381bc..0000000000 --- a/aeon/similarity_search/subsequence_search/_commons.py +++ /dev/null @@ -1,170 +0,0 @@ -"""Helper and common function for similarity search estimators and functions.""" - -__maintainer__ = ["baraline"] - -import numpy as np -from numba import njit, prange -from scipy.signal import convolve - - -def fft_sliding_dot_product(X, q): - """ - Use FFT convolution to calculate the sliding window dot product. - - This function applies the Fast Fourier Transform (FFT) to efficiently compute - the sliding dot product between the input time series `X` and the query `q`. - The dot product is computed for each channel individually. 
The sliding window - approach ensures that the dot product is calculated for every possible subsequence - of `X` that matches the length of `q` - - Parameters - ---------- - X : array, shape=(n_channels, n_timepoints) - Input time series - q : array, shape=(n_channels, query_length) - Input query - - Returns - ------- - out : np.ndarray, 2D array of shape (n_channels, n_timepoints - query_length + 1) - Sliding dot product between q and X. - """ - n_channels, n_timepoints = X.shape - query_length = q.shape[1] - out = np.zeros((n_channels, n_timepoints - query_length + 1)) - for i in range(n_channels): - out[i, :] = convolve(np.flipud(q[i, :]), X[i, :], mode="valid").real - return out - - -def get_ith_products(X, T, L, ith): - """ - Compute dot products between X and the i-th subsequence of size L in T. - - Parameters - ---------- - X : array, shape = (n_channels, n_timepoints_X) - Input data. - T : array, shape = (n_channels, n_timepoints_T) - Data containing the query. - L : int - Overall query length. - ith : int - Query starting index in T. - - Returns - ------- - np.ndarray, 2D array of shape (n_channels, n_timepoints_X - L + 1) - Sliding dot product between the i-th subsequence of size L in T and X. - - """ - return fft_sliding_dot_product(X, T[:, ith : ith + L]) - - -@njit(cache=True, fastmath=True, parallel=True) -def _inverse_distance_profile_list(dist_profiles): - for i in prange(len(dist_profiles)): - dist_profiles[i] = 1 / (dist_profiles[i] + 1e-8) - return dist_profiles - - -@njit(cache=True) -def _extract_top_k_from_dist_profile( - dist_profiles, - k, - threshold, - allow_neighboring_matches, - exclusion_size, -): - """ - Given an array (or list) of distance profiles, extract the top k lower distances. 
- - Parameters - ---------- - dist_profiles : np.ndarray, shape = (n_samples, n_timepoints - length + 1) - A collection of distance profiles computed from ``n_samples`` time series of - size ``n_timepoints``, giving distance profiles of length - ``n_timepoints - length + 1``, with ``length`` the size of the query used to - compute the distance profiles. - k : int - Number of best matches to return - threshold : float - A threshold on the distances of the best matches. To be returned, a candidate - must have a distance bellow this threshold. This can reduce the number of - returned matches to be bellow ``k`` - allow_neighboring_matches : bool - Wheter to allow returning matches that are in the same neighborhood. - exclusion_size : int - The size of the exlusion size to apply when ``allow_neighboring_matches`` is - False. It is applied on both side of existing matches (+/- their indexes). - - Returns - ------- - top_k_indexes : np.ndarray, shape = (k, 2) - The indexes of the best matches in ``distance_profiles``. - top_k_distances : np.ndarray, shape = (k) - The distances of the best matches. 
- - """ - top_k_indexes = np.zeros((2 * k, 2), dtype=np.int64) - 1 - top_k_distances = np.full(2 * k, np.inf) - for i_profile in range(len(dist_profiles)): - # Extract top-k without neighboring matches - if not allow_neighboring_matches: - _sorted_indexes = np.argsort(dist_profiles[i_profile]) - _top_k_indexes = np.zeros(k, dtype=np.int64) - 1 - _current_k = 0 - _current_j = 0 - # Until we extract k value or explore all the array - while _current_k < k and _current_j < len(_sorted_indexes): - _insert = True - # Check for validity with each previously inserted - for i_k in range(_current_k): - ub = min( - _top_k_indexes[i_k] + exclusion_size, - len(dist_profiles[i_profile]), - ) - lb = max(_top_k_indexes[i_k] - exclusion_size, 0) - if ( - _sorted_indexes[_current_j] >= lb - and _sorted_indexes[_current_j] <= ub - ): - _insert = False - break - - if _insert: - _top_k_indexes[_current_k] = _sorted_indexes[_current_j] - _current_k += 1 - _current_j += 1 - - _top_k_indexes = _top_k_indexes[:_current_k] - _top_k_distances = dist_profiles[i_profile][_top_k_indexes] - # Extract top-k with neighboring matches - else: - _top_k_indexes = np.argsort(dist_profiles[i_profile])[:k] - _top_k_distances = dist_profiles[i_profile][_top_k_indexes] - - # Select overall top k by using the buffer array of size 2*k - # Inset top from current sample - top_k_distances[k : k + len(_top_k_distances)] = _top_k_distances - top_k_indexes[k : k + len(_top_k_distances), 1] = _top_k_indexes - top_k_indexes[k : k + len(_top_k_distances), 0] = i_profile - - # Sort overall - idx = np.argsort(top_k_distances) - # Keep top k overall - top_k_distances[:k] = top_k_distances[idx[:k]] - top_k_indexes[:k] = top_k_indexes[idx[:k]] - - top_k_distances[k:] = np.inf - - # get the actual number of extracted values and apply threshold - true_k = 0 - for i in range(k): - # if top_k is inf, it means that no value was extracted - if top_k_distances[i] != np.inf and top_k_distances[i] <= threshold: - true_k += 1 - 
else: - break - - return top_k_indexes[:true_k], top_k_distances[:true_k] diff --git a/aeon/similarity_search/subsequence_search/_stomp.py b/aeon/similarity_search/subsequence_search/_stomp.py deleted file mode 100644 index 66d5270872..0000000000 --- a/aeon/similarity_search/subsequence_search/_stomp.py +++ /dev/null @@ -1,619 +0,0 @@ -"""Implementation of STOMP with squared euclidean distance.""" - -__maintainer__ = ["baraline"] -from typing import Optional - -import numpy as np -from numba import njit, prange -from numba.typed import List - -from aeon.similarity_search.subsequence_search._base import BaseMatrixProfile -from aeon.similarity_search.subsequence_search._commons import ( - _extract_top_k_from_dist_profile, - _inverse_distance_profile_list, - fft_sliding_dot_product, - get_ith_products, -) -from aeon.utils.numba.general import ( - AEON_NUMBA_STD_THRESHOLD, - sliding_mean_std_one_series, -) - - -class StompMatrixProfile(BaseMatrixProfile): - """Estimator to compute matrix profile and distance profile using STOMP.""" - - def __init__( - self, - length: int, - normalise: Optional[bool] = False, - n_jobs: Optional[int] = 1, - ): - super().__init__(length=length, n_jobs=n_jobs, normalise=normalise) - - def compute_matrix_profile( - self, - X: np.ndarray, - k: int, - threshold: float, - exclusion_size: int, - inverse_distance: bool, - allow_neighboring_matches: bool, - X_index=None, - ): - """ - Compute matrix profiles. - - The matrix profiles are computed on the collection given in fit. If ``X`` is - not given, computes the matrix profile of each series in the collection. If it - is given, only computes it for ``X``. - - Parameters - ---------- - X : np.ndarray, shape = (n_channels, n_timepoints) - A 2D array time series on which the matrix profile will be computed. - k : int - The number of best matches to return during predict for each subsequence. - threshold : float - The number of best matches to return during predict for each subsequence. 
- inverse_distance : bool - If True, the matching will be made on the inverse of the distance, and thus, - the worst matches to the query will be returned instead of the best ones. - exclusion_size : int - The size of the exclusion zone used to prevent returning as top k candidates - the ones that are close to each other (for example i and i+1). - It is used to define a region between - :math:`id_timestamp - exclusion_size` and - :math:`id_timestamp + exclusion_size` which cannot be returned - as best match if :math:`id_timestamp` was already selected. By default, - the value None means that this is not used. - X_index : Optional[int], optional - If ``X`` is a series of the database given in fit, specify its index in - ``X_``. If specified, each query of this series won't be able to match with - its neighboring subsequences. - - Returns - ------- - MP : array of shape (n_timepoints - L + 1,) - Matrix profile distances for each query subsequence. If X is none, this - will be a list of MP for each series in X_. - IP : array of shape (n_timepoints - L + 1,) - Indexes of the top matches for each query subsequence. If X is none, this - will be a list of MP for each series in X_. 
- """ - XdotT = [ - get_ith_products(self.X[i], X, self.length, 0) for i in range(len(self.X_)) - ] - if isinstance(X, np.ndarray): - XdotT = np.asarray(XdotT) - elif isinstance(X, List): - XdotT = List(XdotT) - - if X_index is None: - X_means, X_stds = sliding_mean_std_one_series(X, self.length, 1) - else: - X_means, X_stds = self.X_means_[X_index], self.X_stds_[X_index] - if self.normalise: - MP, IP = _stomp_normalised( - self.X_, - X, - XdotT, - self.X_means_, - self.X_stds_, - X_means, - X_stds, - self.length, - X_index, - k, - threshold, - allow_neighboring_matches, - exclusion_size, - inverse_distance, - ) - else: - MP, IP = _stomp( - self.X_, - X, - XdotT, - self.length, - X_index, - k, - threshold, - allow_neighboring_matches, - exclusion_size, - inverse_distance, - ) - return MP, IP - - def compute_distance_profile(self, X: np.ndarray): - """ - Compute the distance profile of X to all samples in X_. - - Parameters - ---------- - X : np.ndarray, 2D array of shape (n_channels, length) - The query to use to compute the distance profiles. - - Returns - ------- - distance_profiles : np.ndarray, 2D array of shape (n_cases, n_candidates) - The distance profile of X to all samples in X_. The ``n_candidates`` value - is equal to ``n_timepoins - length + 1``. If X_ is an unequal length - collection, returns a numba typed list instead of an ndarray. 
- - """ - QX = [fft_sliding_dot_product(self.X_[i], X) for i in range(len(self.X_))] - if self.metadata_["unequal_length"]: - QX = List(QX) - else: - QX = np.asarray(QX) - - if self.normalise: - distance_profiles = _normalised_squared_distance_profile( - QX, - self.X_means_, - self.X_stds_, - X.mean(axis=1), - X.std(axis=1), - self.length, - ) - else: - distance_profiles = _squared_distance_profile( - QX, - self.X_, - X, - ) - - if not self.metadata_["unequal_length"]: - distance_profiles = np.asarray(distance_profiles) - return distance_profiles - - -@njit(cache=True, fastmath=True) -def _stomp_normalised( - X, - T, - XdotT, - X_means, - X_stds, - T_means, - T_stds, - L, - T_index, - k, - threshold, - allow_neighboring_matches, - exclusion_size, - inverse_distance, -): - """ - Compute the Matrix Profile using the STOMP algorithm with normalised distances. - - X: np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) - The input samples. If X is an unquel length collection, expect a TypedList - of 2D arrays of shape (n_channels, n_timepoints) - T : np.ndarray, 2D array of shape (n_channels, series_length) - The series used for similarity search. Note that series_length can be equal, - superior or inferior to n_timepoints, it doesn't matter. - XdotT : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1) - Precomputed dot products between each time series in X and the query series T. - X_means : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1) - Means of each subsequences of X of size L. Should be a numba TypedList if X is - unequal length. - X_stds : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1) - Stds of each subsequences of X of size L. Should be a numba TypedList if X is - unequal length. - T_means : np.ndarray, 2D array of shape (n_channels, n_timepoints - L + 1) - Means of each subsequences of T of size L. 
- T_stds : np.ndarray, 2D array of shape (n_channels, n_timepoints - L + 1) - Stds of each subsequences of T of size L. - L : int - Length of the subsequences used for the distance computation. - T_index : int, - If ``T`` is a series of the database given in fit, specify its index - in ``X_``. If specified, each query of this series won't be able to - match with its neighboring subsequences. - k : int, - The number of best matches to return during predict for each subsequence. - threshold : float - The number of best matches to return during predict for each subsequence. - allow_neighboring_matches : bool - Wheter the top-k candidates can be neighboring subsequences. - exclusion_size : int - The size of the exclusion zone used to prevent returning as top k candidates - the ones that are close to each other (for example i and i+1). - It is used to define a region between - :math:`id_timestamp - exclusion_size` and - :math:`id_timestamp + exclusion_size` which cannot be returned - as best match if :math:`id_timestamp` was already selected. By default, - the value None means that this is not used. - inverse_distance : bool - If True, the matching will be made on the inverse of the distance, and thus, the - worst matches to the query will be returned instead of the best ones. - - Returns - ------- - tuple of np.ndarray - - MP : array of shape (series_length - L + 1,) - Matrix profile distances for each query subsequence. - - IP : array of shape (series_length - L + 1,) - Indexes of the top matches for each query subsequence. 
- """ - n_queries = T.shape[1] - L + 1 - MP = List() - IP = List() - - for i_q in range(n_queries): - dist_profiles = _normalised_squared_distance_profile( - XdotT, X_means, X_stds, T_means[:, i_q], T_stds[:, i_q], L - ) - if i_q + 1 < n_queries: - for i_x in range(len(X)): - XdotT[i_x] = _update_dot_products_one_series( - X[i_x], T, XdotT[i_x], L, i_q + 1 - ) - - if inverse_distance: - dist_profiles = _inverse_distance_profile_list(dist_profiles) - - # Deal with self-matches - if T_index is not None: - _max_timestamp = X[T_index].shape[1] - L - ub = min(i_q + exclusion_size, _max_timestamp) - lb = max(0, i_q - exclusion_size) - dist_profiles[T_index][lb:ub] = np.inf - - top_indexes, top_dists = _extract_top_k_from_dist_profile( - dist_profiles, - k, - threshold, - allow_neighboring_matches, - exclusion_size, - ) - - MP.append(top_dists) - IP.append(top_indexes) - - return MP, IP - - -@njit(cache=True, fastmath=True) -def _stomp( - X, - T, - XdotT, - L, - T_index, - k, - threshold, - allow_neighboring_matches, - exclusion_size, - inverse_distance, -): - """ - Compute the Matrix Profile using the STOMP algorithm with non-normalised distances. - - X: np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) - The input samples. If X is an unquel length collection, expect a TypedList - of 2D arrays of shape (n_channels, n_timepoints) - T : np.ndarray, 2D array of shape (n_channels, series_length) - The series used for similarity search. Note that series_length can be equal, - superior or inferior to n_timepoints, it doesn't matter. - XdotT : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1) - Precomputed dot products between each time series in X and the query series T. - L : int - Length of the subsequences used for the distance computation. - T_index : int, - If ``T`` is a series of the database given in fit, specify its index - in ``X_``. 
If specified, each query of this series won't be able to - match with its neighboring subsequences. - k : int, - The number of best matches to return during predict for each subsequence. - threshold : float - The number of best matches to return during predict for each subsequence. - allow_neighboring_matches : bool - Wheter the top-k candidates can be neighboring subsequences. - exclusion_size : int - The size of the exclusion zone used to prevent returning as top k candidates - the ones that are close to each other (for example i and i+1). - It is used to define a region between - :math:`id_timestamp - exclusion_size` and - :math:`id_timestamp + exclusion_size` which cannot be returned - as best match if :math:`id_timestamp` was already selected. By default, - the value None means that this is not used. - inverse_distance : bool - If True, the matching will be made on the inverse of the distance, and thus, the - worst matches to the query will be returned instead of the best ones. - - Returns - ------- - tuple of np.ndarray - - MP : array of shape (series_length - L + 1,) - Matrix profile distances for each query subsequence. - - IP : array of shape (series_length - L + 1,) - Indexes of the top matches for each query subsequence. 
- """ - n_queries = T.shape[1] - L + 1 - MP = List() - IP = List() - - # For each query of size L in T - for i_q in range(n_queries): - Q = T[:, i_q : i_q + L] - dist_profiles = _squared_distance_profile(XdotT, X, Q) - # For each series in X compute distance profile to the query - if i_q + 1 < n_queries: - for i_x in range(len(X)): - XdotT[i_x] = _update_dot_products_one_series( - X[i_x], T, XdotT[i_x], L, i_q + 1 - ) - - if inverse_distance: - dist_profiles = _inverse_distance_profile_list(dist_profiles) - - # Deal with self-matches - if T_index is not None: - _max_timestamp = X[T_index].shape[1] - L - ub = min(i_q + exclusion_size, _max_timestamp) - lb = max(0, i_q - exclusion_size) - dist_profiles[T_index][lb:ub] = np.inf - - top_indexes, top_dists = _extract_top_k_from_dist_profile( - dist_profiles, - k, - threshold, - allow_neighboring_matches, - exclusion_size, - ) - - MP.append(top_dists) - IP.append(top_indexes) - - return MP, IP - - -@njit(cache=True, fastmath=True) -def _update_dot_products_one_series( - X, - T, - XT_products, - L, - i_query, -): - """ - Update dot products of the i-th query of size L in T from the dot products of i-1. - - Parameters - ---------- - X: np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) - Input time series on which the sliding dot product is computed. - T: np.ndarray, 2D array of shape (n_channels, series_length) - The series used for similarity search. Note that series_length can be equal, - superior or inferior to n_timepoints, it doesn't matter. - L : int - The length of the subsequences considered during the search. This parameter - cannot be larger than n_timepoints and series_length. - i_query : int - Query starting index in T. - - Returns - ------- - XT_products : np.ndarray of shape (n_cases, n_channels, n_timepoints - L + 1) - Sliding dot product between the i-th subsequence of size L in T and X. 
- - """ - n_channels = T.shape[0] - Q = T[:, i_query : i_query + L] - n_candidates = X.shape[1] - L + 1 - - for i_ft in range(n_channels): - # first element of all 0 to n-1 candidates * first element of previous query - _a1 = X[i_ft, : n_candidates - 1] * T[i_ft, i_query - 1] - # last element of all 1 to n candidates * last element of current query - _a2 = X[i_ft, L : L - 1 + n_candidates] * T[i_ft, i_query + L - 1] - - XT_products[i_ft, 1:] = XT_products[i_ft, :-1] - _a1 + _a2 - - # Compute first dot product - XT_products[i_ft, 0] = np.sum(Q[i_ft] * X[i_ft, :L]) - return XT_products - - -@njit(cache=True, fastmath=True, parallel=True) -def _squared_distance_profile(QX, X, Q): - """ - Compute squared distance profiles between query subsequence and time series. - - Parameters - ---------- - QX : List of np.ndarray - List of precomputed dot products between queries and time series, with each - element corresponding to a different time series. - Shape of each array is (n_channels, n_timepoints - query_length + 1). - X : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) - The input samples. If X is an unquel length collection, expect a numba TypedList - 2D array of shape (n_channels, n_timepoints) - Q : np.ndarray, 2D array of shape (n_channels, query_length) - The query used for similarity search. - mask : np.ndarray, 3D array of shape (n_cases, n_timepoints - query_length + 1) - Boolean mask of the shape of the distance profile indicating for which part - of it the distance should be computed. - - Returns - ------- - distance_profiles : np.ndarray - 3D array of shape (n_cases, n_timepoints - query_length + 1) - The distance profile between Q and the input time series X. 
- - """ - distance_profiles = List() - query_length = Q.shape[1] - - # Init distance profile array with unequal length support - for i_instance in range(len(X)): - profile_length = X[i_instance].shape[1] - query_length + 1 - distance_profiles.append(np.full((profile_length), np.inf)) - - for _i_instance in prange(len(QX)): - # prange cast iterator to unit64 with parallel=True - i_instance = np.int_(_i_instance) - - distance_profiles[i_instance] = _squared_dist_profile_one_series( - QX[i_instance], X[i_instance], Q - ) - return distance_profiles - - -@njit(cache=True, fastmath=True) -def _squared_dist_profile_one_series(QT, T, Q): - """ - Compute squared distance profile between query subsequence and a single time series. - - This function calculates the squared distance profile for a single time series by - leveraging the dot product of the query and time series as well as precomputed sums - of squares to efficiently compute the squared distances. - - Parameters - ---------- - QT : np.ndarray, 2D array of shape (n_channels, n_timepoints - query_length + 1) - The dot product between the query and the time series. - T : np.ndarray, 2D array of shape (n_channels, series_length) - The series used for similarity search. Note that series_length can be equal, - superior or inferior to n_timepoints, it doesn't matter. - Q : np.ndarray - 2D array of shape (n_channels, query_length) representing query subsequence. - - Returns - ------- - distance_profile : np.ndarray - 2D array of shape (n_channels, n_timepoints - query_length + 1) - The squared distance profile between the query and the input time series. 
- """ - n_channels, profile_length = QT.shape - query_length = Q.shape[1] - _QT = -2 * QT - distance_profile = np.zeros(profile_length) - for k in prange(n_channels): - _sum = 0 - _qsum = 0 - for j in prange(query_length): - _sum += T[k, j] ** 2 - _qsum += Q[k, j] ** 2 - - distance_profile += _qsum + _QT[k] - distance_profile[0] += _sum - for i in prange(1, profile_length): - _sum += T[k, i + (query_length - 1)] ** 2 - T[k, i - 1] ** 2 - distance_profile[i] += _sum - return distance_profile - - -@njit(cache=True, fastmath=True, parallel=True) -def _normalised_squared_distance_profile( - QX, X_means, X_stds, Q_means, Q_stds, query_length -): - """ - Compute the normalised squared distance profiles between query subsequence and input time series. - - Parameters - ---------- - QX : List of np.ndarray - List of precomputed dot products between queries and time series, with each element - corresponding to a different time series. - Shape of each array is (n_channels, n_timepoints - query_length + 1). - X_means : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - query_length + 1) # noqa: E501 - Means of each subsequences of X of size query_length - X_stds : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - query_length + 1) # noqa: E501 - Stds of each subsequences of X of size query_length - Q_means : np.ndarray, 1D array of shape (n_channels) - Means of the query q - Q_stds : np.ndarray, 1D array of shape (n_channels) - Stds of the query q - query_length : int - The length of the query subsequence used for the distance profile computation. - - Returns - ------- - List of np.ndarray - List of 2D arrays, each of shape (n_channels, n_timepoints - query_length + 1). - Each array contains the normalised squared distance profile between the query subsequence and the corresponding time series. - Entries in the array are set to infinity where the mask is False. 
- """ - distance_profiles = List() - Q_is_constant = Q_stds <= AEON_NUMBA_STD_THRESHOLD - # Init distance profile array with unequal length support - for i_instance in range(len(QX)): - profile_length = QX[i_instance].shape[1] - distance_profiles.append(np.zeros(profile_length)) - - for _i_instance in prange(len(QX)): - # iterator is uint64 with prange and parallel so cast to int to avoid warnings - i_instance = np.int64(_i_instance) - distance_profiles[i_instance] = _normalised_squared_dist_profile_one_series( - QX[i_instance], - X_means[i_instance], - X_stds[i_instance], - Q_means, - Q_stds, - query_length, - Q_is_constant, - ) - return distance_profiles - - -@njit(cache=True, fastmath=True) -def _normalised_squared_dist_profile_one_series( - QT, T_means, T_stds, Q_means, Q_stds, query_length, Q_is_constant -): - """ - Compute the z-normalised squared Euclidean distance profile for one time series. - - Parameters - ---------- - QT : np.ndarray, 2D array of shape (n_channels, n_timepoints - query_length + 1) - The dot product between the query and the time series. - T_means : np.ndarray, 1D array of length n_channels - The mean values of the time series for each channel. - T_stds : np.ndarray, 2D array of shape (n_channels, profile_length) - The standard deviations of the time series for each channel and position. - Q_means : np.ndarray, 1D array of shape (n_channels) - Means of the query q - Q_stds : np.ndarray, 1D array of shape (n_channels) - Stds of the query q - query_length : int - The length of the query subsequence used for the distance profile computation. - Q_is_constant : np.ndarray - 1D array of shape (n_channels,) where each element is a Boolean indicating - whether the query standard deviation for that channel is less than or equal - to a specified threshold. 
- - Returns - ------- - np.ndarray - 2D array of shape (n_channels, n_timepoints - query_length + 1) containing the - z-normalised squared distance profile between the query subsequence and the time - series. Entries are computed based on the z-normalised values, with special - handling for constant values. - """ - n_channels, profile_length = QT.shape - distance_profile = np.zeros(profile_length) - - for i in prange(profile_length): - Sub_is_constant = T_stds[:, i] <= AEON_NUMBA_STD_THRESHOLD - for k in prange(n_channels): - # Two Constant case - if Q_is_constant[k] and Sub_is_constant[k]: - _val = 0 - # One Constant case - elif Q_is_constant[k] or Sub_is_constant[k]: - _val = query_length - else: - denom = query_length * Q_stds[k] * T_stds[k, i] - - p = (QT[k, i] - query_length * (Q_means[k] * T_means[k, i])) / denom - p = min(p, 1.0) - - _val = abs(2 * query_length * (1.0 - p)) - distance_profile[i] += _val - - return distance_profile diff --git a/aeon/similarity_search/subsequence_search/tests/__init__.py b/aeon/similarity_search/subsequence_search/tests/__init__.py deleted file mode 100644 index 0287f2ee04..0000000000 --- a/aeon/similarity_search/subsequence_search/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Tests for subsequence search methods.""" diff --git a/aeon/similarity_search/subsequence_search/tests/test_brute_force.py b/aeon/similarity_search/subsequence_search/tests/test_brute_force.py deleted file mode 100644 index 9ef0eb44e8..0000000000 --- a/aeon/similarity_search/subsequence_search/tests/test_brute_force.py +++ /dev/null @@ -1,172 +0,0 @@ -""" -Tests for stomp algorithm. - -We do not test equality for returned indexes due to the unstable nature of argsort -and the fact that the "kind=stable" parameter is not yet supported in numba. We instead -test that the returned index match the expected distance value. 
-""" - -__maintainer__ = ["baraline"] - -import numpy as np -import pytest -from numba.typed import List -from numpy.testing import ( - assert_almost_equal, - assert_array_almost_equal, - assert_array_equal, -) - -from aeon.similarity_search.subsequence_search._brute_force import ( - _compute_dist_profile, - _naive_squared_distance_profile, - _naive_squared_matrix_profile, -) -from aeon.similarity_search.subsequence_search._commons import ( - _extract_top_k_from_dist_profile, - _inverse_distance_profile_list, -) -from aeon.testing.data_generation import ( - make_example_2d_numpy_series, - make_example_3d_numpy, - make_example_3d_numpy_list, -) -from aeon.utils.numba.general import ( - get_all_subsequences, - sliding_mean_std_one_series, - z_normalise_series_2d, -) - -K_VALUES = [1, 3, 5] -NN_MATCHES = [True, False] -INVERSE = [True, False] -NORMALISE = [True, False] - - -def _get_mean_sdts_inputs(X, Q, L): - X_means = [] - X_stds = [] - - for i_x in range(len(X)): - _mean, _std = sliding_mean_std_one_series(X[i_x], L, 1) - X_stds.append(_std) - X_means.append(_mean) - - Q_means = Q.mean(axis=1) - Q_stds = Q.std(axis=1) - - return X_means, X_stds, Q_means, Q_stds - - -def test__compute_dist_profile(): - """Test Euclidean distance with brute force.""" - L = 3 - X = make_example_2d_numpy_series(n_channels=1, n_timepoints=10) - Q = make_example_2d_numpy_series(n_channels=1, n_timepoints=L) - dist_profile = _compute_dist_profile(get_all_subsequences(X, L, 1), Q) - for i_t in range(X.shape[1] - L + 1): - assert_almost_equal(dist_profile[i_t], np.sum((X[:, i_t : i_t + L] - Q) ** 2)) - - -@pytest.mark.parametrize("normalise", NORMALISE) -def test__naive_squared_distance_profile(normalise): - """Test Euclidean distance profile calculation with brute force.""" - L = 3 - X = make_example_3d_numpy(n_cases=3, n_channels=1, n_timepoints=10, return_y=False) - Q = make_example_2d_numpy_series(n_channels=1, n_timepoints=L) - dist_profiles = _naive_squared_distance_profile(X, Q, 
normalise=normalise) - - if normalise: - Q = z_normalise_series_2d(Q) - for i_x in range(len(X)): - for i_t in range(X[i_x].shape[1] - L + 1): - _x = X[i_x, :, i_t : i_t + L] - if normalise: - _x = z_normalise_series_2d(_x) - assert_almost_equal(dist_profiles[i_x][i_t], np.sum((_x - Q) ** 2)) - - # test unequal length and multivariate - X = List( - make_example_3d_numpy_list( - n_cases=3, - n_channels=2, - min_n_timepoints=10, - max_n_timepoints=20, - return_y=False, - ) - ) - - Q = make_example_2d_numpy_series(n_channels=2, n_timepoints=L) - dist_profiles = _naive_squared_distance_profile(X, Q, normalise=normalise) - if normalise: - Q = z_normalise_series_2d(Q) - for i_x in range(len(X)): - for i_t in range(X[i_x].shape[1] - L + 1): - _x = X[i_x][:, i_t : i_t + L] - if normalise: - _x = z_normalise_series_2d(_x) - assert_almost_equal(dist_profiles[i_x][i_t], np.sum((_x - Q) ** 2)) - - -@pytest.mark.parametrize("k", K_VALUES) -@pytest.mark.parametrize("allow_neighboring_matches", NN_MATCHES) -@pytest.mark.parametrize("inverse_distance", INVERSE) -@pytest.mark.parametrize("normalise", NORMALISE) -def test__naive_squared_matrix_profile( - k, allow_neighboring_matches, inverse_distance, normalise -): - """Test brute force matrix profile method.""" - L = 3 - X = List( - make_example_3d_numpy_list( - n_cases=3, - n_channels=2, - min_n_timepoints=6, - max_n_timepoints=8, - return_y=False, - ) - ) - X_copy = X.copy() - T = make_example_2d_numpy_series(n_channels=2, n_timepoints=5) - T_copy = T.copy() - T_index = None - threshold = np.inf - exclusion_size = L - # MP : distances to best matches for each query - # IP : Indexes of best matches for each query - MP, IP = _naive_squared_matrix_profile( - X, - T, - L, - T_index, - k, - threshold, - allow_neighboring_matches, - exclusion_size, - inverse_distance, - normalise=normalise, - ) - assert_array_equal(T, T_copy) - for i in range(len(X)): - assert_array_equal(X[i], X_copy[i]) - # For each query of size L in T - for i in 
range(T.shape[1] - L + 1): - dist_profiles = _naive_squared_distance_profile( - X, T[:, i : i + L], normalise=normalise - ) - # Check that the top matches extracted have the same value that the - # top matches in the distance profile - if inverse_distance: - dist_profiles = _inverse_distance_profile_list(dist_profiles) - - top_k_indexes, top_k_distances = _extract_top_k_from_dist_profile( - dist_profiles, k, threshold, allow_neighboring_matches, exclusion_size - ) - # Check that the top matches extracted have the same value that the - # top matches in the distance profile - assert_array_almost_equal(MP[i], top_k_distances) - - # Check that the index in IP correspond to a distance profile point - # with value equal to the corresponding MP point. - for j, index in enumerate(top_k_indexes): - assert_almost_equal(MP[i][j], dist_profiles[index[0]][index[1]]) diff --git a/aeon/similarity_search/subsequence_search/tests/test_commons.py b/aeon/similarity_search/subsequence_search/tests/test_commons.py deleted file mode 100644 index e5b4272285..0000000000 --- a/aeon/similarity_search/subsequence_search/tests/test_commons.py +++ /dev/null @@ -1,97 +0,0 @@ -"""Test _commons.py functions.""" - -__maintainer__ = ["baraline"] -import numpy as np -import pytest -from numba.typed import List -from numpy.testing import assert_, assert_array_almost_equal - -from aeon.similarity_search.subsequence_search._commons import ( - _extract_top_k_from_dist_profile, - _inverse_distance_profile_list, - fft_sliding_dot_product, - get_ith_products, -) -from aeon.testing.data_generation import ( - make_example_2d_numpy_list, - make_example_2d_numpy_series, -) - -K_VALUES = [1, 3, 5] -THRESHOLDS = [np.inf, 0.7] -NN_MATCHES = [False, True] - - -def test_fft_sliding_dot_product(): - """Test the fft_sliding_dot_product function.""" - L = 4 - X = make_example_2d_numpy_series(n_channels=1, n_timepoints=10) - Q = make_example_2d_numpy_series(n_channels=1, n_timepoints=L) - - values = 
fft_sliding_dot_product(X, Q) - # Compare values[0] only as input is univariate - assert_array_almost_equal( - values[0], - [np.dot(Q[0], X[0, i : i + L]) for i in range(X.shape[1] - L + 1)], - ) - - -def test_get_ith_products(): - """Test i-th dot product of a subsequence of size L.""" - X = make_example_2d_numpy_series(n_channels=1, n_timepoints=10) - Q = make_example_2d_numpy_series(n_channels=1, n_timepoints=10) - L = 5 - - values = get_ith_products(X, Q, L, 0) - # Compare values[0] only as input is univariate - assert_array_almost_equal( - values[0], - [np.dot(Q[0, 0:L], X[0, i : i + L]) for i in range(X.shape[1] - L + 1)], - ) - - values = get_ith_products(X, Q, L, 4) - # Compare values[0] only as input is univariate - assert_array_almost_equal( - values[0], - [np.dot(Q[0, 4 : 4 + L], X[0, i : i + L]) for i in range(X.shape[1] - L + 1)], - ) - - -def test__inverse_distance_profile_list(): - """Test method to inverse a TypedList of distance profiles.""" - X = make_example_2d_numpy_list(n_cases=2, return_y=False) - T = _inverse_distance_profile_list(List(X)) - assert_array_almost_equal(1 / (X[0] + 1e-8), T[0]) - assert_array_almost_equal(1 / (X[1] + 1e-8), T[1]) - - -@pytest.mark.parametrize("k", K_VALUES) -@pytest.mark.parametrize("threshold", THRESHOLDS) -@pytest.mark.parametrize("allow_nn_matches", NN_MATCHES) -def test__extract_top_k_from_dist_profile(k, threshold, allow_nn_matches): - """Test method to esxtract the top k candidates from a list of distance profiles.""" - X = make_example_2d_numpy_list( - n_cases=2, min_n_timepoints=5, max_n_timepoints=7, return_y=False - ) - X_sort = [X[i][np.argsort(X[i])] for i in range(len(X))] - - top_k_indexes, top_k_distances = _extract_top_k_from_dist_profile( - X, k, threshold, allow_nn_matches, 3 - ) - for i, index in enumerate(top_k_indexes): - assert_(X[index[0]][index[1]] == top_k_distances[i]) - assert_(np.all(top_k_distances <= threshold)) - if allow_nn_matches: - for i in range(len(X)): - 
assert_(np.all(top_k_distances <= X_sort[i][k - 1])) - if not allow_nn_matches: - for i_x in range(len(X)): - # test same index X respect exclusion - same_X = [ - top_k_indexes[i][1] - for i in range(len(top_k_indexes)) - if top_k_indexes[i][0] == i_x - ] - same_X = np.sort(same_X) - if len(same_X) > 1: - assert_(np.all(np.diff(same_X) >= 3)) diff --git a/aeon/similarity_search/subsequence_search/tests/test_stomp.py b/aeon/similarity_search/subsequence_search/tests/test_stomp.py deleted file mode 100644 index 757c8a3133..0000000000 --- a/aeon/similarity_search/subsequence_search/tests/test_stomp.py +++ /dev/null @@ -1,332 +0,0 @@ -""" -Tests for stomp algorithm. - -We do not test equality for returned indexes due to the unstable nature of argsort -and the fact that the "kind=stable" parameter is not yet supported in numba. We instead -test that the returned index match the expected distance value. -""" - -__maintainer__ = ["baraline"] - -import numpy as np -import pytest -from numba.typed import List -from numpy.testing import assert_almost_equal, assert_array_almost_equal - -from aeon.similarity_search.subsequence_search._commons import ( - _extract_top_k_from_dist_profile, - _inverse_distance_profile_list, - get_ith_products, -) -from aeon.similarity_search.subsequence_search._stomp import ( - _normalised_squared_dist_profile_one_series, - _normalised_squared_distance_profile, - _squared_dist_profile_one_series, - _squared_distance_profile, - _stomp, - _stomp_normalised, - _update_dot_products_one_series, -) -from aeon.testing.data_generation import ( - make_example_2d_numpy_series, - make_example_3d_numpy, - make_example_3d_numpy_list, -) -from aeon.utils.numba.general import ( - sliding_mean_std_one_series, - z_normalise_series_2d_with_mean_std, -) - -K_VALUES = [1, 3, 5] -NN_MATCHES = [True, False] -INVERSE = [True, False] - - -def _get_mean_sdts_inputs(X, Q, L): - X_means = [] - X_stds = [] - - for i_x in range(len(X)): - _mean, _std = 
sliding_mean_std_one_series(X[i_x], L, 1) - X_stds.append(_std) - X_means.append(_mean) - - Q_means = Q.mean(axis=1) - Q_stds = Q.std(axis=1) - - return X_means, X_stds, Q_means, Q_stds - - -def test__update_dot_products_one_series(): - """Test the _update_dot_product function.""" - X = make_example_2d_numpy_series(n_channels=1, n_timepoints=20) - T = make_example_2d_numpy_series(n_channels=1, n_timepoints=10) - L = 7 - current_product = get_ith_products(X, T, L, 0) - for i_query in range(1, T.shape[1] - L + 1): - new_product = get_ith_products( - X, - T, - L, - i_query, - ) - current_product = _update_dot_products_one_series( - X, - T, - current_product, - L, - i_query, - ) - assert_array_almost_equal(new_product, current_product) - - -def test__squared_dist_profile_one_series(): - """Test Euclidean distance.""" - L = 3 - X = make_example_2d_numpy_series(n_channels=1, n_timepoints=10) - Q = make_example_2d_numpy_series(n_channels=1, n_timepoints=L) - QX = get_ith_products(X, Q, L, 0) - dist_profile = _squared_dist_profile_one_series(QX, X, Q) - for i_t in range(X.shape[1] - L + 1): - assert_almost_equal(dist_profile[i_t], np.sum((X[:, i_t : i_t + L] - Q) ** 2)) - - -def test__normalised_squared_dist_profile_one_series(): - """Test Euclidean distance.""" - L = 3 - X = make_example_2d_numpy_series(n_channels=1, n_timepoints=10) - Q = make_example_2d_numpy_series(n_channels=1, n_timepoints=L) - QX = get_ith_products(X, Q, L, 0) - X_mean, X_std = sliding_mean_std_one_series(X, L, 1) - Q_mean = Q.mean(axis=1) - Q_std = Q.std(axis=1) - - dist_profile = _normalised_squared_dist_profile_one_series( - QX, X_mean, X_std, Q_mean, Q_std, L, Q.std(axis=1) <= 0 - ) - Q = z_normalise_series_2d_with_mean_std(Q, Q_mean, Q_std) - for i_t in range(X.shape[1] - L + 1): - S = z_normalise_series_2d_with_mean_std( - X[:, i_t : i_t + L], X_mean[:, i_t], X_std[:, i_t] - ) - assert_almost_equal(dist_profile[i_t], np.sum((S - Q) ** 2)) - - -def test__squared_distance_profile(): - """Test 
Euclidean distance profile calculation.""" - L = 3 - X = make_example_3d_numpy(n_cases=3, n_channels=1, n_timepoints=10, return_y=False) - Q = make_example_2d_numpy_series(n_channels=1, n_timepoints=L) - QX = np.asarray([get_ith_products(X[i_x], Q, L, 0) for i_x in range(len(X))]) - dist_profiles = _squared_distance_profile(QX, X, Q) - for i_x in range(len(X)): - for i_t in range(X[i_x].shape[1] - L + 1): - assert_almost_equal( - dist_profiles[i_x][i_t], np.sum((X[i_x, :, i_t : i_t + L] - Q) ** 2) - ) - - # test unequal length and multivariate - X = List( - make_example_3d_numpy_list( - n_cases=3, - n_channels=2, - min_n_timepoints=10, - max_n_timepoints=20, - return_y=False, - ) - ) - - Q = make_example_2d_numpy_series(n_channels=2, n_timepoints=L) - QX = List([get_ith_products(X[i_x], Q, L, 0) for i_x in range(len(X))]) - dist_profiles = _squared_distance_profile(QX, X, Q) - for i_x in range(len(X)): - for i_t in range(X[i_x].shape[1] - L + 1): - assert_almost_equal( - dist_profiles[i_x][i_t], np.sum((X[i_x][:, i_t : i_t + L] - Q) ** 2) - ) - - -def test__normalised_squared_distance_profile(): - """Test Euclidean distance profile calculation.""" - L = 3 - X = make_example_3d_numpy(n_cases=3, n_channels=1, n_timepoints=10, return_y=False) - Q = make_example_2d_numpy_series(n_channels=1, n_timepoints=L) - QX = np.asarray([get_ith_products(X[i_x], Q, L, 0) for i_x in range(len(X))]) - - X_means, X_stds, Q_means, Q_stds = _get_mean_sdts_inputs(X, Q, L) - - X_means = np.asarray(X_means) - X_stds = np.asarray(X_stds) - - dist_profiles = _normalised_squared_distance_profile( - QX, X_means, X_stds, Q_means, Q_stds, L - ) - - Q_norm = z_normalise_series_2d_with_mean_std(Q, Q_means, Q_stds) - for i_x in range(len(X)): - for i_t in range(X[i_x].shape[1] - L + 1): - X_sub_norm = z_normalise_series_2d_with_mean_std( - X[i_x, :, i_t : i_t + L], X_means[i_x][:, i_t], X_stds[i_x][:, i_t] - ) - assert_almost_equal( - dist_profiles[i_x][i_t], np.sum((X_sub_norm - Q_norm) ** 2) - ) 
- - # test unequal length and multivariate - X = List( - make_example_3d_numpy_list( - n_cases=5, - n_channels=2, - min_n_timepoints=10, - max_n_timepoints=20, - return_y=False, - ) - ) - Q = make_example_2d_numpy_series(n_channels=2, n_timepoints=L) - - QX = List([get_ith_products(X[i_x], Q, L, 0) for i_x in range(len(X))]) - - X_means, X_stds, Q_means, Q_stds = _get_mean_sdts_inputs(X, Q, L) - # Convert to numba typed list - X_means = List(X_means) - X_stds = List(X_stds) - - dist_profiles = _normalised_squared_distance_profile( - QX, X_means, X_stds, Q_means, Q_stds, L - ) - - Q_norm = z_normalise_series_2d_with_mean_std(Q, Q_means, Q_stds) - for i_x in range(len(X)): - for i_t in range(X[i_x].shape[1] - L + 1): - X_sub_norm = z_normalise_series_2d_with_mean_std( - X[i_x][:, i_t : i_t + L], X_means[i_x][:, i_t], X_stds[i_x][:, i_t] - ) - assert_almost_equal( - dist_profiles[i_x][i_t], np.sum((X_sub_norm - Q_norm) ** 2) - ) - - -@pytest.mark.parametrize("k", K_VALUES) -@pytest.mark.parametrize("allow_neighboring_matches", NN_MATCHES) -@pytest.mark.parametrize("inverse_distance", INVERSE) -def test__stomp(k, allow_neighboring_matches, inverse_distance): - """Test STOMP method.""" - L = 3 - - X = make_example_3d_numpy_list( - n_cases=3, - n_channels=2, - min_n_timepoints=6, - max_n_timepoints=8, - return_y=False, - ) - T = make_example_2d_numpy_series(n_channels=2, n_timepoints=5) - XdotT = List([get_ith_products(X[i_x], T, L, 0) for i_x in range(len(X))]) - - T_index = None - threshold = np.inf - exclusion_size = L - # MP : distances to best matches for each query - # IP : Indexes of best matches for each query - MP, IP = _stomp( - X, - T, - XdotT, - L, - T_index, - k, - threshold, - allow_neighboring_matches, - exclusion_size, - inverse_distance, - ) - # For each query of size L in T - for i in range(T.shape[1] - L + 1): - dist_profiles = _squared_distance_profile( - List([get_ith_products(X[i_x], T, L, i) for i_x in range(len(X))]), - X, - T[:, i : i + L], - ) - 
# Check that the top matches extracted have the same value that the - # top matches in the distance profile - if inverse_distance: - dist_profiles = _inverse_distance_profile_list(dist_profiles) - - top_k_indexes, top_k_distances = _extract_top_k_from_dist_profile( - dist_profiles, k, threshold, allow_neighboring_matches, exclusion_size - ) - # Check that the top matches extracted have the same value that the - # top matches in the distance profile - assert_array_almost_equal(MP[i], top_k_distances) - - # Check that the index in IP correspond to a distance profile point - # with value equal to the corresponding MP point. - for j, index in enumerate(top_k_indexes): - assert_almost_equal(MP[i][j], dist_profiles[index[0]][index[1]]) - - -@pytest.mark.parametrize("k", K_VALUES) -@pytest.mark.parametrize("allow_neighboring_matches", NN_MATCHES) -@pytest.mark.parametrize("inverse_distance", INVERSE) -def test__stomp_normalised(k, allow_neighboring_matches, inverse_distance): - """Test STOMP normalised method.""" - L = 3 - X = make_example_3d_numpy_list( - n_cases=3, - n_channels=2, - min_n_timepoints=6, - max_n_timepoints=8, - return_y=False, - ) - T = make_example_2d_numpy_series(n_channels=2, n_timepoints=5) - - XdotT = List([get_ith_products(X[i_x], T, L, 0) for i_x in range(len(X))]) - - T_index = None - threshold = np.inf - exclusion_size = L - X_means, X_stds, _, _ = _get_mean_sdts_inputs(X, T, L) - T_means, T_stds = sliding_mean_std_one_series(T, L, 1) - # MP : distances to best matches for each query - # IP : Indexes of best matches for each query - MP, IP = _stomp_normalised( - X, - T, - XdotT, - X_means, - X_stds, - T_means, - T_stds, - L, - T_index, - k, - threshold, - allow_neighboring_matches, - exclusion_size, - inverse_distance, - ) - # For each query of size L in T - for i in range(T.shape[1] - L + 1): - dist_profiles = _normalised_squared_distance_profile( - List([get_ith_products(X[i_x], T, L, i) for i_x in range(len(X))]), - X_means, - X_stds, - 
T_means[:, i], - T_stds[:, i], - L, - ) - - if inverse_distance: - dist_profiles = _inverse_distance_profile_list(dist_profiles) - - top_k_indexes, top_k_distances = _extract_top_k_from_dist_profile( - dist_profiles, k, threshold, allow_neighboring_matches, exclusion_size - ) - # Check that the top matches extracted have the same value that the - # top matches in the distance profile - assert_array_almost_equal(MP[i], top_k_distances) - - # Check that the index in IP correspond to a distance profile point - # with value equal to the corresponding MP point. - for j, index in enumerate(top_k_indexes): - assert_almost_equal(MP[i][j], dist_profiles[index[0]][index[1]]) diff --git a/aeon/testing/mock_estimators/_mock_similarity_searchers.py b/aeon/testing/mock_estimators/_mock_similarity_searchers.py index 1d3161514a..89061a6dc2 100644 --- a/aeon/testing/mock_estimators/_mock_similarity_searchers.py +++ b/aeon/testing/mock_estimators/_mock_similarity_searchers.py @@ -1,83 +1,30 @@ """Mock series transformers useful for testing and debugging.""" -__maintainer__ = [] +__maintainer__ = ["baraline"] __all__ = [ - "MockSubsequenceSearch", - "MockMatrixProfile", + "MockSeriesSimilaritySearch", ] -import numpy as np +from aeon.similarity_search.series._base import BaseSeriesSimilaritySearch -from aeon.similarity_search.subsequence_search._base import ( - BaseMatrixProfile, - BaseSubsequenceSearch, -) - -class MockMatrixProfile(BaseMatrixProfile): +class MockSeriesSimilaritySearch(BaseSeriesSimilaritySearch): """Mock estimator for BaseMatrixProfile.""" def __init__( self, length=3, - normalise=False, + normalize=False, n_jobs=1, ): - super().__init__(length=length, n_jobs=n_jobs, normalise=normalise) - - def compute_matrix_profile( - self, - k, - threshold, - exclusion_size, - inverse_distance, - allow_neighboring_matches, - X=None, - X_index=None, - ): - """Compute matrix profiles between X_ and X or between all series in X_.""" - return np.zeros((X.shape[1] - self.length + 1, 
k)), np.zeros( - (X.shape[1] - self.length + 1, k, 2), dtype=np.int64 - ) + self.length = length + self.normalize = normalize + super().__init__(n_jobs=n_jobs) - def compute_distance_profile(self, X): - """Compute distrance profiles between X_ and X (a series of size length).""" - return [ - np.zeros(self.X_[i].shape[1] - self.length + 1) for i in range(len(self.X_)) - ] + def _fit(self, X, y=None): + return self - -class MockSubsequenceSearch(BaseSubsequenceSearch): - """Mock estimator for BaseSubsequenceSearch.""" - - def __init__( - self, - length=3, - normalise=False, - n_jobs=1, - ): - super().__init__(length=length, n_jobs=n_jobs, normalise=normalise) - - def _find_motifs( - self, - X, - k=1, - threshold=np.inf, - X_index=None, - inverse_distance=False, - allow_neighboring_matches=False, - exclusion_factor=2.0, - ): - return [[0, 0]], self.X_[0][0:1] # TODO: update after logic is implemented - - def _find_neighbors( - self, - X, - k=1, - threshold=np.inf, - inverse_distance=False, - X_index=None, - allow_neighboring_matches=False, - exclusion_factor=2.0, - ): - return [[0, 0]], self.X_[0][0:1] + def predict(self, X): + """Compute matrix profiles between X_ and X or between all series in X_.""" + X = self._pre_predict(X) + return [0], [0.1] diff --git a/aeon/transformations/collection/base.py b/aeon/transformations/collection/base.py index 013001d80e..54442dd6db 100644 --- a/aeon/transformations/collection/base.py +++ b/aeon/transformations/collection/base.py @@ -31,10 +31,10 @@ class name: BaseCollectionTransformer import pandas as pd from aeon.base import BaseCollectionEstimator -from aeon.transformations.base import BaseTransformer +from aeon.similarity_search._base import BaseSimilaritySearch -class BaseCollectionTransformer(BaseCollectionEstimator, BaseTransformer): +class BaseCollectionTransformer(BaseCollectionEstimator, BaseSimilaritySearch): """Transformer base class for collections.""" # tag values specific to CollectionTransformers diff --git 
a/examples/similarity_search/similarity_search_tasks.ipynb b/examples/similarity_search/similarity_search_tasks.ipynb index a42339c611..86fe5b5274 100644 --- a/examples/similarity_search/similarity_search_tasks.ipynb +++ b/examples/similarity_search/similarity_search_tasks.ipynb @@ -26,7 +26,7 @@ "- normalize. Wheter subseries should be normalized prior to distance computations\n", "\n", "#### Subseries Motif search :\n", - "Extract $k$-motifs or range $r$-motifs.\n", + "Extract $k$-motifs or range motifs or $r$-motifs.\n", "\n", "The $k^{th}$ motif is the $k^{th}$ most similar pair of subseries in $X$. Given $\\forall a,b,i,j$ the pair ${W_i, W_j}$ is the motif if $dist(W_i, W_j) ≤ dist(W_a, W_b), i \\neq j$ and $a \\neq b$\n", "\n", From 57e5e7b5e324426ea69a168709f7d0bac291054e Mon Sep 17 00:00:00 2001 From: baraline Date: Thu, 16 Jan 2025 15:56:05 +0100 Subject: [PATCH 13/18] Fix mistake addition in transformers and fix base classes --- aeon/similarity_search/_base.py | 5 +- aeon/similarity_search/collection/_base.py | 12 ++- .../collection/neighbors/__init__.py | 6 ++ .../collection/neighbors/_rp_cosine_lsh.py | 96 +++++++++---------- .../collection/neighbors/tests/__init__.py | 1 + .../collection/tests/__init__.py | 1 + .../collection/tests/test_base.py | 62 ++++++++++++ aeon/similarity_search/series/_base.py | 5 +- .../series/tests/test_base.py | 81 ++++++---------- .../_mock_similarity_searchers.py | 31 +++--- aeon/transformations/collection/base.py | 4 +- 11 files changed, 180 insertions(+), 124 deletions(-) create mode 100644 aeon/similarity_search/collection/neighbors/tests/__init__.py create mode 100644 aeon/similarity_search/collection/tests/__init__.py create mode 100644 aeon/similarity_search/collection/tests/test_base.py diff --git a/aeon/similarity_search/_base.py b/aeon/similarity_search/_base.py index fc58838eb4..5204163315 100644 --- a/aeon/similarity_search/_base.py +++ b/aeon/similarity_search/_base.py @@ -20,7 +20,6 @@ class 
BaseSimilaritySearch(BaseAeonEstimator): _tags = { "requires_y": False, - "capability:multivariate": True, "fit_is_empty": False, } @@ -85,12 +84,12 @@ def _check_predict_series_format(self, X): if isinstance(X, np.ndarray): if X.ndim != 2: raise TypeError( - "A np.ndarray given in find_neighbors must be 2D" + "A np.ndarray given in predict must be 2D" f"(n_channels, n_timepoints) but found {X.ndim}D." ) else: raise TypeError( - "Expected a 2D np.ndarray in find_neighbors but found" f" {type(X)}." + "Expected a 2D np.ndarray in predict but found" f" {type(X)}." ) if self.n_channels_ != X.shape[0]: raise ValueError( diff --git a/aeon/similarity_search/collection/_base.py b/aeon/similarity_search/collection/_base.py index 402b7342a2..3fde03d420 100644 --- a/aeon/similarity_search/collection/_base.py +++ b/aeon/similarity_search/collection/_base.py @@ -15,12 +15,16 @@ from aeon.similarity_search._base import BaseSimilaritySearch -class BaseCollectionSimilaritySearch(BaseCollectionEstimator, BaseSimilaritySearch): +class BaseCollectionSimilaritySearch(BaseSimilaritySearch, BaseCollectionEstimator): """Similarity search base class for collections.""" # tag values specific to CollectionTransformers _tags = { "input_data_type": "Collection", + "capability:multivariate": True, + "capability:unequal_length": True, + "capability:multithreading": True, + "X_inner_type": ["np-list", "numpy3D"], } @abstractmethod @@ -57,10 +61,8 @@ def fit( self.reset() X = self._preprocess_collection(X) # Store minimum number of n_timepoints for unequal length collections - self.n_channels_ = X[0].shape[1] + self.n_channels_ = X[0].shape[0] self.n_cases_ = len(X) - self.X_ = X - prev_threads = get_num_threads() set_num_threads(self._n_jobs) self._fit(X, y=y) @@ -92,5 +94,5 @@ def _pre_predict( self._check_is_fitted() if X is not None: # Could we call somehow _preprocess_series from a BaseCollectionEstimator ? 
- self._check_predict_format(X) + self._check_predict_series_format(X) return X diff --git a/aeon/similarity_search/collection/neighbors/__init__.py b/aeon/similarity_search/collection/neighbors/__init__.py index e9a5d49d49..f5cf0d925b 100644 --- a/aeon/similarity_search/collection/neighbors/__init__.py +++ b/aeon/similarity_search/collection/neighbors/__init__.py @@ -1 +1,7 @@ """Neighbors search for time series collection.""" + +__all__ = ["RandomProjectionIndexANN"] + +from aeon.similarity_search.collection.neighbors._rp_cosine_lsh import ( + RandomProjectionIndexANN, +) diff --git a/aeon/similarity_search/collection/neighbors/_rp_cosine_lsh.py b/aeon/similarity_search/collection/neighbors/_rp_cosine_lsh.py index 8d1701793e..6774803984 100644 --- a/aeon/similarity_search/collection/neighbors/_rp_cosine_lsh.py +++ b/aeon/similarity_search/collection/neighbors/_rp_cosine_lsh.py @@ -3,9 +3,8 @@ import numpy as np from numba import njit, prange -from aeon.similarity_search.series_search._base import BaseIndexSearch - -# TPB = 16 +from aeon.similarity_search.collection._base import BaseCollectionSimilaritySearch +from aeon.utils.numba.general import z_normalise_series_2d, z_normalise_series_3d @njit(cache=True) @@ -17,12 +16,11 @@ def _hamming_dist(X, Y): @njit(cache=True, parallel=True) -def _hamming_dist_matrix(bool_hashes_value_list, bool_hashes): - n_hash_funcs = bool_hashes.shape[0] - res = np.zeros((n_hash_funcs, bool_hashes_value_list.shape[0]), dtype=np.int64) - for i in prange(n_hash_funcs): - for j in range(bool_hashes_value_list.shape[0]): - res[i, j] = _hamming_dist(bool_hashes_value_list[j], bool_hashes[i]) +def _hamming_dist_series_to_collection(X_bool, collection_bool): + n_buckets = collection_bool.shape[0] + res = np.zeros(n_buckets, dtype=np.int64) + for i in prange(n_buckets): + res[i] = _hamming_dist(collection_bool[i], X_bool) return res @@ -32,7 +30,7 @@ def _series_to_bool(X, hash_funcs, start_points, length): res = np.empty(n_hash_funcs, 
dtype=np.bool_) for j in prange(n_hash_funcs): res[j] = _nb_flat_dot( - X[start_points[j] : start_points[j] + length], hash_funcs[j] + X[:, start_points[j] : start_points[j] + length], hash_funcs[j] ) return res @@ -57,13 +55,12 @@ def _collection_to_bool(X, hash_funcs, start_points, length): res[i, j] = _nb_flat_dot( X[i, :, start_points[j] : start_points[j] + length], hash_funcs[j] ) - return res -class RP_LSH_Cosine(BaseIndexSearch): +class RandomProjectionIndexANN(BaseCollectionSimilaritySearch): """ - Random Projection Locality Sensitive Hashing index for cosine similarity. + Random Projection Locality Sensitive Hashing index with cosine similarity. In this method based on SimHash, we define a hash function as a boolean operation such as, given a random vector ``V`` of shape ``(n_channels, L)`` and a time series @@ -74,8 +71,8 @@ class RP_LSH_Cosine(BaseIndexSearch): as ``X[:, s:s+L].V`` Note that this method will not provide exact results, but will perform approximate - searchs. This also ignore any temporal correlation and treat series as - high dimensional points. + searchs. This also ignore any temporal correlation and consider series as + high dimensional points due to the cosine similarity distance. 
Parameters ---------- @@ -93,15 +90,16 @@ class RP_LSH_Cosine(BaseIndexSearch): Example ------- >>> from aeon.datasets import load_classification - >>> from aeon.similarity_search.series_search import RP_LSH_Cosine - >>> index = RP_LSH_Cosine() + >>> from aeon.similarity_search.collection.neighbors import RandomProjectionIndexANN + >>> index = RandomProjectionIndexANN() >>> X, y = load_classification("ArrowHead") >>> index.fit(X[:200]) - >>> r = index.find_neighbors(X[201]) + >>> r = index.predict(X[201]) """ _tags = { "capability:unequal_length": False, + "capability:multithreading": True, } def __init__( @@ -110,18 +108,26 @@ def __init__( hash_func_coverage=0.25, use_discrete_vectors=True, random_state=None, - normalise=False, + normalize=True, n_jobs=1, ): self.n_hash_funcs = n_hash_funcs self.hash_func_coverage = hash_func_coverage self.use_discrete_vectors = use_discrete_vectors self.random_state = random_state - super().__init__(normalise=normalise, n_jobs=n_jobs) + self.normalize = normalize + super().__init__(n_jobs=n_jobs) - def _build_index(self): + def _fit(self, X, y=None): """ - Build the index based on the data stored in X_. + Build the index based on the X. + + Parameters + ---------- + X : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) + Input array to be used to build the index. + y : optional + Not used. 
Returns ------- @@ -129,8 +135,10 @@ def _build_index(self): """ rng = np.random.default_rng(self.random_state) - n_timepoints = self.X_.shape[2] - self.window_length_ = max(1, int(n_timepoints * self.hash_func_coverage)) + if self.normalize: + X = z_normalise_series_3d(X) + self.n_timepoints_ = X.shape[2] + self.window_length_ = max(1, int(self.n_timepoints_ * self.hash_func_coverage)) if self.use_discrete_vectors: self.hash_funcs_ = rng.choice( @@ -143,13 +151,12 @@ def _build_index(self): size=(self.n_hash_funcs, self.n_channels_, self.window_length_), ) self.start_points_ = rng.choice( - n_timepoints - self.window_length_ + 1, + self.n_timepoints_ - self.window_length_ + 1, size=self.n_hash_funcs, replace=True, ) - bool_hashes = _collection_to_bool( - self.X_, self.hash_funcs_, self.start_points_, self.window_length_ + X, self.hash_funcs_, self.start_points_, self.window_length_ ) str_hashes = [hash(bool_hashes[i].tobytes()) for i in range(len(bool_hashes))] @@ -174,20 +181,6 @@ def _get_bucket_sizes(self): return {key: len(self.dict_X_index_[key]) for key in self.dict_X_index_} def _get_series_bucket(self, X): - """ - Get the matching bucket of a single series X if it exists in the index. - - Parameters - ---------- - X : TYPE - DESCRIPTION. - - Returns - ------- - TYPE - DESCRIPTION. - - """ bool_hash = _series_to_bool( X, self.hash_funcs_, self.start_points_, self.window_length_ ) @@ -197,7 +190,7 @@ def _get_series_bucket(self, X): else: return None - def _query_index( + def predict( self, X, k=1, @@ -222,19 +215,26 @@ def _query_index( Returns ------- top_k : np.ndarray, shape = (n_cases, k) - Indexes of k series in X_ (the index) that are close to each series in X. + Indexes of k series in the index that are similar to X. top_k_dist : np.ndarray, shape = (n_cases, k) - Distance of k series in X_ (the index) to each series in X. The distance + Distance of k series in the index to X. 
The distance is the hamming distance between the result of each hash function. """ - bool_hashes = _series_to_bool( + X = self._pre_predict(X) + if X.shape[1] != self.n_timepoints_: + raise ValueError( + f"Expected a series with {self.n_timepoints_} but got {X.shape[1]}." + "Unequal length is not supported by this estimator." + ) + if self.normalize: + X = z_normalise_series_2d(X) + + X_bool = _series_to_bool( X, self.hash_funcs_, self.start_points_, self.window_length_ ) top_k = np.zeros(k, dtype=int) top_k_dist = np.zeros(k, dtype=float) - dists = _hamming_dist_matrix( - self.bool_hashes_value_list_, bool_hashes[np.newaxis, :] - )[0] + dists = _hamming_dist_series_to_collection(X_bool, self.bool_hashes_value_list_) if inverse_distance: dists = 1 / (dists + 1e-8) # Get top k buckets diff --git a/aeon/similarity_search/collection/neighbors/tests/__init__.py b/aeon/similarity_search/collection/neighbors/tests/__init__.py new file mode 100644 index 0000000000..89bc3412fb --- /dev/null +++ b/aeon/similarity_search/collection/neighbors/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for similarity search for time series collection neighbors module.""" diff --git a/aeon/similarity_search/collection/tests/__init__.py b/aeon/similarity_search/collection/tests/__init__.py new file mode 100644 index 0000000000..d136a8571e --- /dev/null +++ b/aeon/similarity_search/collection/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for similarity search for time series collection base class and commons.""" diff --git a/aeon/similarity_search/collection/tests/test_base.py b/aeon/similarity_search/collection/tests/test_base.py new file mode 100644 index 0000000000..9180a071cf --- /dev/null +++ b/aeon/similarity_search/collection/tests/test_base.py @@ -0,0 +1,62 @@ +"""Test for collection similarity search base class.""" + +__maintainer__ = ["baraline"] + +import pytest + +from aeon.testing.mock_estimators._mock_similarity_searchers import ( + MockCollectionSimilaritySearch, +) +from 
aeon.testing.testing_data import ( + make_example_1d_numpy, + make_example_2d_numpy_series, + make_example_3d_numpy, + make_example_3d_numpy_list, +) + + +def test_input_shape_fit_predict_collection(): + """Test input shapes.""" + estimator = MockCollectionSimilaritySearch() + # dummy data to pass to fit when testing predict/predict_proba + X_3D_uni = make_example_3d_numpy(n_channels=1, return_y=False) + X_3D_multi = make_example_3d_numpy(n_channels=2, return_y=False) + X_3D_uni_list = make_example_3d_numpy_list(n_channels=1, return_y=False) + X_3D_multi_list = make_example_3d_numpy_list(n_channels=2, return_y=False) + X_2D_uni = make_example_2d_numpy_series(n_channels=1) + X_2D_multi = make_example_2d_numpy_series(n_channels=2) + X_1D = make_example_1d_numpy() + + # 2D are converted to 3D + valid_inputs_fit = [ + X_3D_uni, + X_3D_multi, + X_3D_multi_list, + X_3D_uni_list, + X_2D_uni, + X_2D_multi, + ] + # Valid inputs + for _input in valid_inputs_fit: + estimator.fit(_input) + + with pytest.raises(ValueError): + estimator.fit(X_1D) + + estimator_multi = MockCollectionSimilaritySearch().fit(X_3D_multi) + estimator_uni = MockCollectionSimilaritySearch().fit(X_3D_uni) + + estimator_uni.predict(X_2D_uni) + estimator_multi.predict(X_2D_multi) + + with pytest.raises(ValueError): + estimator_uni.predict(X_2D_multi) + with pytest.raises(ValueError): + estimator_multi.predict(X_2D_uni) + + for _input in [X_3D_uni, X_3D_uni_list]: + with pytest.raises(TypeError): + estimator_uni.predict(_input) + for _input in [X_3D_multi, X_3D_multi_list]: + with pytest.raises(TypeError): + estimator_multi.predict(_input) diff --git a/aeon/similarity_search/series/_base.py b/aeon/similarity_search/series/_base.py index b9cca5d5cb..bd07f25ba6 100644 --- a/aeon/similarity_search/series/_base.py +++ b/aeon/similarity_search/series/_base.py @@ -11,11 +11,12 @@ from aeon.utils.validation import check_n_jobs -class BaseSeriesSimilaritySearch(BaseSeriesEstimator, BaseSimilaritySearch): +class 
BaseSeriesSimilaritySearch(BaseSimilaritySearch, BaseSeriesEstimator): """Base class for similarity search applications on single series.""" _tags = { "input_data_type": "Series", + "capability:multivariate": True, } @abstractmethod @@ -87,7 +88,7 @@ def _pre_predict( self._check_is_fitted() if X is not None: X = self._preprocess_series(X, self.axis, False) - self._check_predict_format(X) + self._check_predict_series_format(X) return X def _check_X_index(self, X_index: int): diff --git a/aeon/similarity_search/series/tests/test_base.py b/aeon/similarity_search/series/tests/test_base.py index d3dc953c6a..1b4d17b991 100644 --- a/aeon/similarity_search/series/tests/test_base.py +++ b/aeon/similarity_search/series/tests/test_base.py @@ -1,4 +1,4 @@ -"""Test for subsequence search base class.""" +"""Test for series similarity search base class.""" __maintainer__ = ["baraline"] @@ -14,13 +14,10 @@ make_example_3d_numpy_list, ) -BASES = [MockSeriesSimilaritySearch] - -@pytest.mark.parametrize("base", BASES) -def test_input_shape_fit_predict(base): +def test_input_shape_fit_predict_series(): """Test input shapes.""" - estimator = base() + estimator = MockSeriesSimilaritySearch() # dummy data to pass to fit when testing predict/predict_proba X_3D_uni = make_example_3d_numpy(n_channels=1, return_y=False) X_3D_multi = make_example_3d_numpy(n_channels=2, return_y=False) @@ -30,58 +27,38 @@ def test_input_shape_fit_predict(base): X_2D_multi = make_example_2d_numpy_series(n_channels=2) X_1D = make_example_1d_numpy() - valid_inputs_fit = [X_2D_uni, X_2D_multi] - # Valid inputs + valid_inputs_fit = [X_1D, X_2D_uni, X_2D_multi] + # 1D is converted to 2D univariate for _input in valid_inputs_fit: estimator.fit(_input) - invalid_inputs_fit = [X_1D, X_3D_multi_list, X_3D_uni_list, X_3D_multi, X_3D_uni] - for _input in invalid_inputs_fit: - with pytest.raises(TypeError): - estimator.fit(_input) - - valid_inputs_predict = [X_2D_uni, X_2D_multi] - invalid_inputs_predict_uni = [ - X_1D, - 
X_3D_uni, - X_3D_uni_list, - ] - invalid_inputs_predict_multi = [ + invalid_inputs_fit = [ X_3D_multi, + X_3D_uni, X_3D_multi_list, + X_3D_uni_list, ] - L = 3 - estimator_multi = base(length=L).fit(X_2D_multi) - estimator_uni = base(length=L).fit(X_2D_uni) - - for _input in valid_inputs_predict: - estimator_uni.find_neighbors(_input[:, :L]) - estimator_uni.find_motifs(_input) - with pytest.raises(ValueError): - # Wrong number of channels - estimator_multi.find_neighbors(_input) - with pytest.raises(ValueError): - estimator_multi.find_motifs(_input) - # X length not of size L + for _input in invalid_inputs_fit: with pytest.raises(ValueError): - estimator_uni.find_neighbors(_input[:, : L + 2]) + estimator.fit(_input) + + estimator_multi = MockSeriesSimilaritySearch().fit(X_2D_multi) + estimator_uni = MockSeriesSimilaritySearch().fit(X_2D_uni) - for _input in invalid_inputs_predict_uni: - with pytest.raises(TypeError): - estimator_uni.find_neighbors(_input) - with pytest.raises(TypeError): - estimator_uni.find_motifs(_input) - with pytest.raises(TypeError): - estimator_multi.find_neighbors(_input) - with pytest.raises(TypeError): - estimator_multi.find_motifs(_input) + estimator_uni.predict(X_2D_uni) + # 1D is converted to 2D univariate + estimator_uni.predict(X_1D) + estimator_multi.predict(X_2D_multi) - for _input in invalid_inputs_predict_multi: - with pytest.raises(TypeError): - estimator_uni.find_neighbors(_input) - with pytest.raises(TypeError): - estimator_uni.find_motifs(_input) - with pytest.raises(TypeError): - estimator_multi.find_neighbors(_input) - with pytest.raises(TypeError): - estimator_multi.find_motifs(_input) + with pytest.raises(ValueError): + estimator_uni.predict(X_2D_multi) + with pytest.raises(ValueError): + estimator_multi.predict(X_2D_uni) + + for _input in [X_3D_uni, X_3D_uni_list]: + with pytest.raises(ValueError): + estimator_uni.predict(_input) + + for _input in [X_3D_multi, X_3D_multi_list]: + with pytest.raises(ValueError): + 
estimator_multi.predict(_input) diff --git a/aeon/testing/mock_estimators/_mock_similarity_searchers.py b/aeon/testing/mock_estimators/_mock_similarity_searchers.py index 89061a6dc2..a2919c939d 100644 --- a/aeon/testing/mock_estimators/_mock_similarity_searchers.py +++ b/aeon/testing/mock_estimators/_mock_similarity_searchers.py @@ -1,25 +1,32 @@ """Mock series transformers useful for testing and debugging.""" __maintainer__ = ["baraline"] -__all__ = [ - "MockSeriesSimilaritySearch", -] +__all__ = ["MockSeriesSimilaritySearch", "MockCollectionSimilaritySearch"] +from aeon.similarity_search.collection._base import BaseCollectionSimilaritySearch from aeon.similarity_search.series._base import BaseSeriesSimilaritySearch class MockSeriesSimilaritySearch(BaseSeriesSimilaritySearch): """Mock estimator for BaseMatrixProfile.""" - def __init__( - self, - length=3, - normalize=False, - n_jobs=1, - ): - self.length = length - self.normalize = normalize - super().__init__(n_jobs=n_jobs) + def __init__(self): + super().__init__() + + def _fit(self, X, y=None): + return self + + def predict(self, X): + """Compute matrix profiles between X_ and X or between all series in X_.""" + X = self._pre_predict(X) + return [0], [0.1] + + +class MockCollectionSimilaritySearch(BaseCollectionSimilaritySearch): + """Mock estimator for BaseMatrixProfile.""" + + def __init__(self): + super().__init__() def _fit(self, X, y=None): return self diff --git a/aeon/transformations/collection/base.py b/aeon/transformations/collection/base.py index 54442dd6db..013001d80e 100644 --- a/aeon/transformations/collection/base.py +++ b/aeon/transformations/collection/base.py @@ -31,10 +31,10 @@ class name: BaseCollectionTransformer import pandas as pd from aeon.base import BaseCollectionEstimator -from aeon.similarity_search._base import BaseSimilaritySearch +from aeon.transformations.base import BaseTransformer -class BaseCollectionTransformer(BaseCollectionEstimator, BaseSimilaritySearch): +class 
BaseCollectionTransformer(BaseCollectionEstimator, BaseTransformer): """Transformer base class for collections.""" # tag values specific to CollectionTransformers From 2078086a8c8a142f2a8d5cf166d8eb2b7898f830 Mon Sep 17 00:00:00 2001 From: baraline Date: Thu, 16 Jan 2025 16:18:38 +0100 Subject: [PATCH 14/18] Fix registry and api reference --- aeon/similarity_search/collection/__init__.py | 4 ++ aeon/testing/testing_config.py | 4 -- aeon/utils/base/_register.py | 11 ++-- docs/api_reference/similarity_search.rst | 63 +++++++++++-------- 4 files changed, 49 insertions(+), 33 deletions(-) diff --git a/aeon/similarity_search/collection/__init__.py b/aeon/similarity_search/collection/__init__.py index 0aef46ef49..ab3a546193 100644 --- a/aeon/similarity_search/collection/__init__.py +++ b/aeon/similarity_search/collection/__init__.py @@ -1 +1,5 @@ """Similarity search for time series collection.""" + +__all__ = ["BaseCollectionSimilaritySearch"] + +from aeon.similarity_search.collection._base import BaseCollectionSimilaritySearch diff --git a/aeon/testing/testing_config.py b/aeon/testing/testing_config.py index 4c46058318..61ff90cdd1 100644 --- a/aeon/testing/testing_config.py +++ b/aeon/testing/testing_config.py @@ -57,10 +57,6 @@ "ClaSPSegmenter": ["check_non_state_changing_method"], "HMMSegmenter": ["check_non_state_changing_method"], "RSTSF": ["check_non_state_changing_method"], - # Keeps length during predict to avoid recomputing means and std of data in fit - # if the next predict calls uses the same query length parameter. 
- "QuerySearch": ["check_non_state_changing_method"], - "SeriesSearch": ["check_non_state_changing_method"], # Unknown issue not producing the same results "RDSTRegressor": ["check_regressor_against_expected_results"], "RISTRegressor": ["check_regressor_against_expected_results"], diff --git a/aeon/utils/base/_register.py b/aeon/utils/base/_register.py index 024ad447ee..5e81e29b33 100644 --- a/aeon/utils/base/_register.py +++ b/aeon/utils/base/_register.py @@ -24,8 +24,9 @@ from aeon.forecasting.base import BaseForecaster from aeon.regression.base import BaseRegressor from aeon.segmentation.base import BaseSegmenter -from aeon.similarity_search.series_search._base import BaseSeriesSearch -from aeon.similarity_search.subsequence_search._base import BaseSubsequenceSearch +from aeon.similarity_search._base import BaseSimilaritySearch +from aeon.similarity_search.collection import BaseCollectionSimilaritySearch +from aeon.similarity_search.series import BaseSeriesSimilaritySearch from aeon.transformations.base import BaseTransformer from aeon.transformations.collection import BaseCollectionTransformer from aeon.transformations.series import BaseSeriesTransformer @@ -37,6 +38,7 @@ "estimator": BaseAeonEstimator, "series-estimator": BaseSeriesEstimator, "transformer": BaseTransformer, + "similarity-search": BaseSimilaritySearch, # estimator types "anomaly-detector": BaseAnomalyDetector, "collection-transformer": BaseCollectionTransformer, @@ -47,8 +49,8 @@ "segmenter": BaseSegmenter, "series-transformer": BaseSeriesTransformer, "forecaster": BaseForecaster, - "subsequence_searcher": BaseSubsequenceSearch, - "series_searcher": BaseSeriesSearch, + "series-similarity-search": BaseSeriesSimilaritySearch, + "collection-similarity-search": BaseCollectionSimilaritySearch, } # base classes which are valid for estimator to directly inherit from @@ -60,5 +62,6 @@ "collection-estimator", "series-estimator", "transformer", + "similarity-search", } } diff --git 
a/docs/api_reference/similarity_search.rst b/docs/api_reference/similarity_search.rst index eb13cafd23..7212179953 100644 --- a/docs/api_reference/similarity_search.rst +++ b/docs/api_reference/similarity_search.rst @@ -4,51 +4,47 @@ Similarity search ================= The :mod:`aeon.similarity_search` module contains algorithms and tools for similarity -search tasks. +search tasks. First, we distinguish between `series` estimators and `collection` +estimators, similarly to the `aeon.transformations` module. Secondly, we distinguish between +estimators used for `neighbors` (with suffix SNN for subsequence nearest neighbors, or ANN +for approximate nearest neighbors) search and estimators used for `motifs` search. -Similarity search estimators ----------------------------- +Series Similarity search estimators +----------------------------------- -.. currentmodule:: aeon.similarity_search +.. currentmodule:: aeon.similarity_search.series.neighbors .. autosummary:: :toctree: auto_generated/ :template: class.rst - QuerySearch - SeriesSearch + DummySNN + MassSNN -Distance profile functions --------------------------- - -.. currentmodule:: aeon.similarity_search.distance_profiles +.. currentmodule:: aeon.similarity_search.series.motifs .. autosummary:: :toctree: auto_generated/ - :template: function.rst + :template: class.rst + + StompMotif - euclidean_distance_profile - normalised_euclidean_distance_profile - squared_distance_profile - normalised_squared_distance_profile -Matrix profile functions --------------------------- +Collection Similarity search estimators +--------------------------------------- -.. currentmodule:: aeon.similarity_search.matrix_profiles +.. currentmodule:: aeon.similarity_search.collection.neighbors .. 
autosummary:: :toctree: auto_generated/ - :template: function.rst + :template: class.rst + + RandomProjectionIndexANN - stomp_normalised_euclidean_matrix_profile - stomp_euclidean_matrix_profile - stomp_normalised_squared_matrix_profile - stomp_squared_matrix_profile -Base ----- +Base Estimators +--------------- .. currentmodule:: aeon.similarity_search.base @@ -57,3 +53,20 @@ Base :template: class.rst BaseSimilaritySearch + + +.. currentmodule:: aeon.similarity_search.series.base + +.. autosummary:: + :toctree: auto_generated/ + :template: class.rst + + BaseSeriesSimilaritySearch + +.. currentmodule:: aeon.similarity_search.collection.base + +.. autosummary:: + :toctree: auto_generated/ + :template: class.rst + + BaseCollectionSimilaritySearch From 9effbd9a9ed1afc69428d76501a7e143717ff90d Mon Sep 17 00:00:00 2001 From: baraline Date: Fri, 17 Jan 2025 22:40:49 +0100 Subject: [PATCH 15/18] Update documentation and fix some leftover bugs --- aeon/similarity_search/_base.py | 6 +- aeon/similarity_search/collection/_base.py | 7 +- .../collection/neighbors/_rp_cosine_lsh.py | 8 +- aeon/similarity_search/series/__init__.py | 6 +- aeon/similarity_search/series/_base.py | 7 +- aeon/similarity_search/series/_commons.py | 5 +- .../similarity_search/series/motifs/_stomp.py | 38 +- .../series/neighbors/_dummy.py | 14 +- .../series/neighbors/_mass.py | 28 +- docs/getting_started.md | 5 +- examples/similarity_search/code_speed.ipynb | 2 +- .../similarity_search/distance_profiles.ipynb | 2 +- .../similarity_search/similarity_search.ipynb | 425 +++++------------- 13 files changed, 201 insertions(+), 352 deletions(-) diff --git a/aeon/similarity_search/_base.py b/aeon/similarity_search/_base.py index 5204163315..a87487fde1 100644 --- a/aeon/similarity_search/_base.py +++ b/aeon/similarity_search/_base.py @@ -72,7 +72,7 @@ def predict( """ ... 
- def _check_predict_series_format(self, X): + def _check_predict_series_format(self, X, length=None): """ Check wheter a series X in predict is correctly formated. @@ -96,8 +96,8 @@ def _check_predict_series_format(self, X): f"Expected X to have {self.n_channels_} channels but" f" got {X.shape[0]} channels." ) - if hasattr(self, "length") and X.shape[1] != self.length: + if length is not None and X.shape[1] != length: raise ValueError( - f"Expected X to have {self.length} timepoints but" + f"Expected X to have {length} timepoints but" f" got {X.shape[1]} timepoints." ) diff --git a/aeon/similarity_search/collection/_base.py b/aeon/similarity_search/collection/_base.py index 3fde03d420..618a531081 100644 --- a/aeon/similarity_search/collection/_base.py +++ b/aeon/similarity_search/collection/_base.py @@ -15,7 +15,7 @@ from aeon.similarity_search._base import BaseSimilaritySearch -class BaseCollectionSimilaritySearch(BaseSimilaritySearch, BaseCollectionEstimator): +class BaseCollectionSimilaritySearch(BaseCollectionEstimator, BaseSimilaritySearch): """Similarity search base class for collections.""" # tag values specific to CollectionTransformers @@ -81,6 +81,7 @@ def _fit( def _pre_predict( self, X: Union[np.ndarray, None] = None, + length: int = None, ): """ Predict method. @@ -89,10 +90,12 @@ def _pre_predict( ---------- X : Union[np.ndarray, None], optional Optional data to use for predict.. The default is None. + length: int, optional + If not None, the number of timepoint of X should be equal to length. """ self._check_is_fitted() if X is not None: # Could we call somehow _preprocess_series from a BaseCollectionEstimator ? 
- self._check_predict_series_format(X) + self._check_predict_series_format(X, length=length) return X diff --git a/aeon/similarity_search/collection/neighbors/_rp_cosine_lsh.py b/aeon/similarity_search/collection/neighbors/_rp_cosine_lsh.py index 6774803984..a6f3097f78 100644 --- a/aeon/similarity_search/collection/neighbors/_rp_cosine_lsh.py +++ b/aeon/similarity_search/collection/neighbors/_rp_cosine_lsh.py @@ -220,12 +220,8 @@ def predict( Distance of k series in the index to X. The distance is the hamming distance between the result of each hash function. """ - X = self._pre_predict(X) - if X.shape[1] != self.n_timepoints_: - raise ValueError( - f"Expected a series with {self.n_timepoints_} but got {X.shape[1]}." - "Unequal length is not supported by this estimator." - ) + X = self._pre_predict(X, length=self.n_timepoints_) + if self.normalize: X = z_normalise_series_2d(X) diff --git a/aeon/similarity_search/series/__init__.py b/aeon/similarity_search/series/__init__.py index 23df7d1b53..d1b5494c13 100644 --- a/aeon/similarity_search/series/__init__.py +++ b/aeon/similarity_search/series/__init__.py @@ -1,7 +1,7 @@ """Similarity search for series.""" -__all__ = [ - "BaseSeriesSimilaritySearch", -] +__all__ = ["BaseSeriesSimilaritySearch", "MassSNN", "StompMotif"] from aeon.similarity_search.series._base import BaseSeriesSimilaritySearch +from aeon.similarity_search.series.motifs._stomp import StompMotif +from aeon.similarity_search.series.neighbors._mass import MassSNN diff --git a/aeon/similarity_search/series/_base.py b/aeon/similarity_search/series/_base.py index bd07f25ba6..2e1b6d40e0 100644 --- a/aeon/similarity_search/series/_base.py +++ b/aeon/similarity_search/series/_base.py @@ -11,7 +11,7 @@ from aeon.utils.validation import check_n_jobs -class BaseSeriesSimilaritySearch(BaseSimilaritySearch, BaseSeriesEstimator): +class BaseSeriesSimilaritySearch(BaseSeriesEstimator, BaseSimilaritySearch): """Base class for similarity search applications on single 
series.""" _tags = { @@ -75,6 +75,7 @@ def _fit( def _pre_predict( self, X: Union[np.ndarray, None] = None, + length: int = None, ): """ Predict method. @@ -83,12 +84,14 @@ def _pre_predict( ---------- X : Union[np.ndarray, None], optional Optional data to use for predict.. The default is None. + length: int, optional + If not None, the number of timepoint of X should be equal to length. """ self._check_is_fitted() if X is not None: X = self._preprocess_series(X, self.axis, False) - self._check_predict_series_format(X) + self._check_predict_series_format(X, length=length) return X def _check_X_index(self, X_index: int): diff --git a/aeon/similarity_search/series/_commons.py b/aeon/similarity_search/series/_commons.py index d14281573f..4e62e5aacb 100644 --- a/aeon/similarity_search/series/_commons.py +++ b/aeon/similarity_search/series/_commons.py @@ -61,6 +61,7 @@ def get_ith_products(X, T, L, ith): return fft_sliding_dot_product(X, T[:, ith : ith + L]) +@njit(cache=True, fastmath=True) def _inverse_distance_profile(dist_profile): return 1 / (dist_profile + 1e-8) @@ -101,9 +102,7 @@ def _extract_top_k_from_dist_profile( The distances of the best matches. 
""" - if k == np.inf: - k = dist_profile.shape[0] - top_k_indexes = np.zeros((k), dtype=np.int64) - 1 + top_k_indexes = np.zeros(k, dtype=np.int64) - 1 top_k_distances = np.full(k, np.inf, dtype=np.float64) ub = np.full(k, np.inf) lb = np.full(k, -1.0) diff --git a/aeon/similarity_search/series/motifs/_stomp.py b/aeon/similarity_search/series/motifs/_stomp.py index 9287d80241..fbd459b890 100644 --- a/aeon/similarity_search/series/motifs/_stomp.py +++ b/aeon/similarity_search/series/motifs/_stomp.py @@ -154,13 +154,13 @@ def predict( "Expected motif_extraction_method to be either 'k_motifs' or 'r_motifs'" f"but got {motif_extraction_method}" ) - exclusion_size = self.length // exclusion_factor + MP, IP = self.compute_matrix_profile( X, motif_size=motif_size, dist_threshold=dist_threshold, allow_trivial_matches=allow_trivial_matches, - exclusion_size=exclusion_size, + exclusion_factor=exclusion_factor, inverse_distance=inverse_distance, ) if motif_extraction_method == "k_motifs": @@ -170,11 +170,11 @@ def predict( def compute_matrix_profile( self, - X: np.ndarray, + X: np.ndarray = None, motif_size: Optional[int] = 1, dist_threshold: Optional[float] = np.inf, allow_trivial_matches: Optional[bool] = False, - exclusion_size: Optional[float] = 2, + exclusion_factor: Optional[float] = 2, inverse_distance: Optional[bool] = False, ): """ @@ -198,14 +198,12 @@ def compute_matrix_profile( inverse_distance : bool If True, the matching will be made on the inverse of the distance, and thus, the worst matches to the query will be returned instead of the best ones. - exclusion_size : int - The size of the exclusion zone used to prevent returning as top k candidates - the ones that are close to each other (for example i and i+1). - It is used to define a region between - :math:`id_timestamp - exclusion_size` and - :math:`id_timestamp + exclusion_size` which cannot be returned - as best match if :math:`id_timestamp` was already selected. 
By default, - the value None means that this is not used. + exclusion_factor : float, default=2 + A factor of the query length used to define the exclusion zone when + ``allow_trivial_matches`` is set to False. For a given timestamp, + the exclusion zone starts from + :math:`id_timestamp - length//exclusion_factor` and end at + :math:`id_timestamp + length//exclusion_factor`. Returns ------- @@ -229,7 +227,7 @@ def compute_matrix_profile( X_means, X_stds = sliding_mean_std_one_series(X, self.length, 1) X_dotX = get_ith_products(X, self.X_, self.length, 0) - + exclusion_size = self.length // exclusion_factor if self.normalize: MP, IP = _stomp_normalized( self.X_, @@ -353,14 +351,17 @@ def _stomp_normalized( lb = max(0, i_q - exclusion_size) dist_profile[lb:ub] = np.inf - top_indexes, top_dists = _extract_top_k_from_dist_profile( + _top_indexes, top_dists = _extract_top_k_from_dist_profile( dist_profile, motif_size, dist_threshold, allow_trivial_matches, exclusion_size, ) - + top_indexes = np.zeros((len(_top_indexes), 2), dtype=np.int64) + for i_idx in range(len(_top_indexes)): + top_indexes[i_idx, 0] = i_q + top_indexes[i_idx, 1] = _top_indexes[i_idx] MP.append(top_dists) IP.append(top_indexes) @@ -443,14 +444,17 @@ def _stomp( lb = max(0, i_q - exclusion_size) dist_profile[lb:ub] = np.inf - top_indexes, top_dists = _extract_top_k_from_dist_profile( + _top_indexes, top_dists = _extract_top_k_from_dist_profile( dist_profile, motif_size, dist_threshold, allow_trivial_matches, exclusion_size, ) - + top_indexes = np.zeros((len(_top_indexes), 2), dtype=np.int64) + for i_idx in range(len(_top_indexes)): + top_indexes[i_idx, 0] = i_q + top_indexes[i_idx, 1] = _top_indexes[i_idx] MP.append(top_dists) IP.append(top_indexes) diff --git a/aeon/similarity_search/series/neighbors/_dummy.py b/aeon/similarity_search/series/neighbors/_dummy.py index bbea714eda..7b4d4d89da 100644 --- a/aeon/similarity_search/series/neighbors/_dummy.py +++ 
b/aeon/similarity_search/series/neighbors/_dummy.py @@ -47,7 +47,7 @@ def predict( self, X: np.ndarray, k: Optional[int] = 1, - threshold: Optional[float] = np.inf, + dist_threshold: Optional[float] = np.inf, exclusion_factor: Optional[float] = 2, inverse_distance: Optional[bool] = False, allow_neighboring_matches: Optional[bool] = False, @@ -62,7 +62,7 @@ def predict( Subsequence we want to find neighbors for. k : int The number of neighbors to return. - threshold : float + dist_threshold : float The maximum distance of neighbors to X. inverse_distance : bool If True, the matching will be made on the inverse of the distance, and thus, @@ -73,7 +73,7 @@ def predict( the exclusion zone starts from :math:`id_timestamp - length//exclusion_factor` and end at :math:`id_timestamp + length//exclusion_factor`. - X_index : Optional[int], optional + X_index : int, optional If ``X`` is a subsequence of X_, specify its starting timestamp in ``X_``. If specified, neighboring subsequences of X won't be able to match as neighbors. @@ -86,12 +86,13 @@ def predict( The distances of the best matches. 
""" - X = self._pre_predict(X) + X = self._pre_predict(X, length=self.length) X_index = self._check_X_index(X_index) dist_profile = self.compute_distance_profile(X) if inverse_distance: dist_profile = _inverse_distance_profile(dist_profile) + exclusion_size = self.length // exclusion_factor if X_index is not None: exclusion_size = self.length // exclusion_factor _max_timestamp = self.n_timepoints_ - self.length @@ -99,10 +100,13 @@ def predict( lb = max(0, X_index - exclusion_size) dist_profile[lb:ub] = np.inf + if k == np.inf: + k = len(dist_profile) + return _extract_top_k_from_dist_profile( dist_profile, k, - threshold, + dist_threshold, allow_neighboring_matches, exclusion_size, ) diff --git a/aeon/similarity_search/series/neighbors/_mass.py b/aeon/similarity_search/series/neighbors/_mass.py index bb56815f4e..9565407fc8 100644 --- a/aeon/similarity_search/series/neighbors/_mass.py +++ b/aeon/similarity_search/series/neighbors/_mass.py @@ -21,7 +21,22 @@ class MassSNN(BaseSeriesSimilaritySearch): - """Estimator to compute the on profile and distance profile using MASS.""" + """ + Estimator to compute the subsequence nearest neighbors using MASS [1]_. + + Parameters + ---------- + length : int + The length of the subsequences to use for the search. + normalize : bool + Whether the subsequences should be z-normalized. + + References + ---------- + .. [1] Abdullah Mueen, Yan Zhu, Michael Yeh, Kaveh Kamgar, Krishnamurthy + Viswanathan, Chetan Kumar Gupta and Eamonn Keogh (2015), The Fastest Similarity + Search Algorithm for Time Series Subsequences under Euclidean Distance. 
+ """ def __init__( self, @@ -38,7 +53,7 @@ def _fit( y=None, ): if self.normalize: - self.X_means_, X_stds_ = sliding_mean_std_one_series(X, self.length, 1) + self.X_means_, self.X_stds_ = sliding_mean_std_one_series(X, self.length, 1) return self def predict( @@ -76,7 +91,7 @@ def predict( the exclusion zone starts from :math:`id_timestamp - length//exclusion_factor` and end at :math:`id_timestamp + length//exclusion_factor`. - X_index : Optional[int], optional + X_index : int, optional If ``X`` is a subsequence of X_, specify its starting timestamp in ``X_``. If specified, neighboring subsequences of X won't be able to match as neighbors. @@ -89,19 +104,22 @@ def predict( The distances of the best matches. """ - X = self._pre_predict(X) + X = self._pre_predict(X, length=self.length) X_index = self._check_X_index(X_index) dist_profile = self.compute_distance_profile(X) if inverse_distance: dist_profile = _inverse_distance_profile(dist_profile) + exclusion_size = self.length // exclusion_factor if X_index is not None: - exclusion_size = self.length // exclusion_factor _max_timestamp = self.n_timepoints_ - self.length ub = min(X_index + exclusion_size, _max_timestamp) lb = max(0, X_index - exclusion_size) dist_profile[lb:ub] = np.inf + if k == np.inf: + k = len(dist_profile) + return _extract_top_k_from_dist_profile( dist_profile, k, diff --git a/docs/getting_started.md b/docs/getting_started.md index 36f18583cb..ce519359f2 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -21,8 +21,9 @@ classical techniques for the following learning tasks: - [**Clustering**](api_reference/clustering), where a collection of time series without any labels are used to train a model to label cases ([more details](examples/clustering/clustering.ipynb)). 
-- [**Similarity search**](api_reference/similarity_search), where the goal is to evaluate - the similarity between a query time series and a collection of other longer time series +- [**Similarity search**](api_reference/similarity_search), where the goal is to find + time series motifs or nearest neighbors in an efficient way for either single series + or collections. ([more details](examples/similarity_search/similarity_search.ipynb)). - [**Anomaly detection**](api_reference/anomaly_detection), where the goal is to find values or areas of a single time series that are not representative of the whole series. diff --git a/examples/similarity_search/code_speed.ipynb b/examples/similarity_search/code_speed.ipynb index f31155333d..9b4c08acf3 100644 --- a/examples/similarity_search/code_speed.ipynb +++ b/examples/similarity_search/code_speed.ipynb @@ -554,7 +554,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.12.0" } }, "nbformat": 4, diff --git a/examples/similarity_search/distance_profiles.ipynb b/examples/similarity_search/distance_profiles.ipynb index ec56fcc6bf..d5ea595ff5 100644 --- a/examples/similarity_search/distance_profiles.ipynb +++ b/examples/similarity_search/distance_profiles.ipynb @@ -146,7 +146,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.12.0" } }, "nbformat": 4, diff --git a/examples/similarity_search/similarity_search.ipynb b/examples/similarity_search/similarity_search.ipynb index cdbaa86948..803398d551 100644 --- a/examples/similarity_search/similarity_search.ipynb +++ b/examples/similarity_search/similarity_search.ipynb @@ -7,12 +7,27 @@ "source": [ "# Time Series Similarity Search with aeon\n", "\n", - "The goal of Time Series Similarity Search is to asses the similarities between a time\n", - " series, denoted as a query `q` of length `l`, and a collection of time series,\n", - " denoted as 
`X`, with lengths greater than or equal to `l`. In this\n", - " context, the notion of similiarity between `q` and the other series in `X` is quantified by similarity functions. Those functions are most of the time defined as distance function, such as the Euclidean distance. Knowing the similarity between `q` and other admissible candidates, we can then perform many other tasks for \"free\", such as anomaly or motif detection.\n", + "\"time\n", "\n", - "\"time" + "The objective of the similarity search module in aeon is to provide estimators with a `fit`/`predict` interface to solve the following use cases:\n", + "\n", + "- Nearest neighbors search on time series subsequences or whole series\n", + "- Motifs search on time series subsequences\n", + "\n", + "Similarly to the `transformations` module, the `similarity_search` module splits estimators between `series` estimators and `collection` estimators, such that:\n", + "\n", + "- `series` estimators take as input a single time series of shape `(n_channels, n_timepoints)` during fit and predict.\n", + "- `collection` estimators take as input a time series collection of shape `(n_cases, n_channels, n_timepoints)` during fit, and a single series of shape `(n_channels, n_timepoints)` during predict.\n", + "\n", + "Note that the above is a general guideline, and that some estimators can also take `None` as input during predict, or series of length different to `n_timepoints`. We'll explore the different estimators in the next sections.\n", + "\n", + "### Other similarity search notebooks\n", + "\n", + "This notebook gives an overview of the similarity search module and the available estimators. 
The following notebooks are also avaiable to go more in depth with specific subject of similarity search in aeon:\n", + "\n", + "- [The theory and math behind the similarity search estimators in aeon](distance_profiles.ipynb)\n", + "- [Analysis of the performance of the estimators provided by similarity search module](code_speed.ipynb)\n", + "\n" ] }, { @@ -22,25 +37,34 @@ "metadata": {}, "outputs": [], "source": [ - "def plot_best_matches(top_k_search, best_matches):\n", + "# Define some plotting functions we'll use later !\n", + "def plot_best_matches(\n", + " X_fit, X_predict, idx_predict, idx_matches, length, normalize=False\n", + "):\n", " \"\"\"Plot the top best matches of a query in a dataset.\"\"\"\n", - " fig, ax = plt.subplots(figsize=(20, 5), ncols=3)\n", - " for i_k, (id_sample, id_timestamp) in enumerate(best_matches):\n", + " fig, ax = plt.subplots(figsize=(20, 5), ncols=len(idx_matches))\n", + " if len(idx_matches) == 1:\n", + " ax = [ax]\n", + " for i_k, id_timestamp in enumerate(idx_matches):\n", " # plot the sample of the best match\n", - " ax[i_k].plot(top_k_search.X_[id_sample, 0], linewidth=2)\n", + " ax[i_k].plot(X_fit[0], linewidth=2)\n", " # plot the location of the best match on it\n", + " match = X_fit[0, id_timestamp : id_timestamp + length]\n", " ax[i_k].plot(\n", - " range(id_timestamp, id_timestamp + q.shape[1]),\n", - " top_k_search.X_[id_sample, 0, id_timestamp : id_timestamp + q.shape[1]],\n", + " range(id_timestamp, id_timestamp + length),\n", + " match,\n", " linewidth=7,\n", " alpha=0.5,\n", " color=\"green\",\n", " label=\"best match location\",\n", " )\n", " # plot the query on the location of the best match\n", + " Q = X_predict[0, idx_predict : idx_predict + length]\n", + " if normalize:\n", + " Q = Q * np.std(match) + np.mean(match)\n", " ax[i_k].plot(\n", - " range(id_timestamp, id_timestamp + q.shape[1]),\n", - " q[0],\n", + " range(id_timestamp, id_timestamp + length),\n", + " Q,\n", " linewidth=5,\n", " alpha=0.5,\n", " 
color=\"red\",\n", @@ -66,73 +90,32 @@ " plt.show()" ] }, - { - "cell_type": "markdown", - "id": "7e06b213-6038-4901-b98e-2433625115c4", - "metadata": {}, - "source": [ - "## Similarity search Notebooks\n", - "\n", - "This notebook gives an overview of similarity search module and the available estimators. The following notebooks are avaiable to go more in depth with specific subject of similarity search in aeon:\n", - "\n", - "- [Deep dive in the distance profiles](distance_profiles.ipynb)\n", - "- [Analysis of the speedups provided by similarity search module](code_speed.ipynb)" - ] - }, - { - "cell_type": "markdown", - "id": "ca967c08-9a05-411a-a09a-ad8a13c0adb9", - "metadata": {}, - "source": [ - "## Expected inputs and format\n", - "For both `QuerySearch` and `SeriesSearch`, the `fit` method expects a time series dataset of shape `(n_cases, n_channels, n_timepoints)`. This can be 3D numpy array or a list of 2D numpy arrays if `n_timepoints` varies between cases (i.e. unequal length dataset).\n", - "\n", - "The `predict` method expects a 2D numpy array of shape `(n_channels, query_length)` for `QuerySearch`. In `SeriesSearch`, the predict methods also expects a 2D numpy array, but of shape `(n_channels, n_timepoints)` (`n_timepoints` doesn't have to be the same as in fit) and a `query_length` parameter." - ] - }, { "cell_type": "markdown", "id": "d1fd75ae-84c2-40be-95f6-bd7de409317d", "metadata": {}, "source": [ - "## Available estimators\n", + "### A word on base classes\n", "\n", - "All estimators of the similarity search module in aeon inherit from the `BaseSimilaritySearch` class, which requires the following arguments:\n", - "- `distance` : a string indicating which distance function to use as similarity function. By default this is `\"euclidean\"`, which means that the Euclidean distance is used.\n", - "- `normalise` : a boolean indicating whether this similarity function should be z-normalised. 
This means that the scale of the two series being compared will be ignored, and that, loosely speaking, we will only focus on their shape during the comparison. By default, this parameter is set `False`.\n", + "All estimators of the similarity search module in aeon inherit from the `BaseSimilaritySearch` class, which defines some abstract methods that estimators must implement, such as `fit` and `predict`, and some private functions used to validate the format of the time series you will provide. Then, the two submodules `series` and `collection` also define a base class (`BaseSeriesSimilaritySearch` and `BaseCollectionSimilaritySearch`) that their respective estimators will inherit from. If you ever want to extend the module or create your own estimators, these are the classes you'll want to use to define the base structure of your estimator.\n", "\n", - "Another parameter, which has no effect on the output of the estimators, is a boolean named `store_distance_profile`, set to `False` by default. If set to `True`, the estimators will expose an attribute named `_distance_profile` after the `predict` function is called. This attribute will contain the computed distance profile for query given as input to the `predict` function.\n", + "### Load a dataset\n", + "In the following, we'll use an easy dataset (`GunPoint`) to help build intuition. Don't hesitate to swap it with other datasets to explore ! We load it using the `load_classification` function.\n", "\n", - "To illustrate how to work with similarity search estimators in aeon, we will now present some example use cases." - ] - }, - { - "cell_type": "markdown", - "id": "01fa67c2-0126-4152-98a9-fa0df84c4629", - "metadata": {}, - "source": [ - "### Query search" - ] - }, - { - "cell_type": "markdown", - "id": "8e99b251-d156-4989-b5a0-3a2c79cb75d4", - "metadata": {}, - "source": [ - "We will use the GunPoint dataset for this example, which can be loaded using the `load_classification` function." 
+ "The GunPoint dataset is composed of two classes which are discriminated by the \"bumps\" located before and after the central peak. These bumps correspond to an actor drawing a fake gun from a holster before pointing it (hence the name \"GunPoint\" !). In the second class, the actor simply points his fingers without making the motion of taking the gun out of the holster." ] }, { "cell_type": "code", "execution_count": 2, - "id": "f8a6bb7e-b219-41f1-b508-b849c45672eb", + "id": "20d3b591-f275-4548-a7d2-75b16380b055", "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ - "
" + "
" ] }, "metadata": {}, @@ -162,205 +145,97 @@ }, { "cell_type": "markdown", - "id": "5392f7f4-1825-4b15-9248-27eeecb1af3c", + "id": "01fa67c2-0126-4152-98a9-fa0df84c4629", "metadata": {}, "source": [ - "The GunPoint dataset is composed of two classes which are discriminated by the \"bumps\" located before and after the central peak. These bumps correspond to an actor drawing a fake gun from a holster before pointing it (hence the name \"GunPoint\" !). In the second class, the actor simply points his fingers without making the motion of taking the gun out of the holster.\n", + "## 1. Series estimators\n", "\n", - "Suppose that we define our input query for the similarity search task as one of these bumps:" + "First, we'll explore estimators of the `series` module, where you must provide single series of shape `(n_channels, n_timepoints)` during fit." ] }, { - "cell_type": "code", - "execution_count": 3, - "id": "a494a0be-4459-414d-9fc2-1400feefd171", + "cell_type": "markdown", + "id": "78f17f93-28b3-49c0-be5f-1d430a273b0c", "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], "source": [ - "# We will use the fourth sample an testing data\n", - "X_test = X[3]\n", - "mask = np.ones(X.shape[0], dtype=bool)\n", - "mask[3] = False\n", - "# Use this mask to exluce the sample from which we will extract the query\n", - "X_train = X[mask]\n", + "### 1.1 Subsequence nearest neighbors with MASS\n", "\n", - "q = X_test[:, 20:55]\n", - "plt.plot(q[0])\n", - "plt.show()" + "To perform nearest neighbors search on subsequences on a series, we can use the `MassSNN` estimator.\n", + "\n", + "It takes as parameter during initialisation :\n", + "- `length` : an integer giving the length of the subsequences to extract from the series. It is also the expected length of the series given in `predict`\n", + "- `normalize`: a boolean indicating wheter the subsequences should be independently z-normalized (`(X-mean(X))/std(X)`) before the distance computations. This results in a scale-independent matching.\n", + " \n", + "To parameterize the search, additional parameters are available when calling the `predict` method:\n", + "\n", + "- `k` (int) : the number of nearest neighbors to return.\n", + "- `dist_threshold` (float) : the maximum allowed distance for a candidate subsequence to be considered as a neighbor.\n", + "- `allow_trivial_matches` (bool) : wheter a neighbors of a match to a query can be also considered as matches (True), or if an exclusion zone is applied around each match to avoid trivial matches with their direct neighbors (False).\n", + "- `inverse_distance` (bool) : if True, the matching will be made on the inverse of the distance, and thus, the farther neighbors will be returned instead of the closest ones.\n", + "- `exclusion_factor` (float): A factor of the `length` used to define the exclusion zone when `allow_trivial_matches` is set to False. 
For a given timestamp, the exclusion zone starts from `id_timestamp - length//exclusion_factor` and end at `id_timestamp + length//exclusion_factor`.\n", + "- `X_index` (int): If series given during predict is a subsequence of series given during fit, specify its starting timestamp. If specified, neighboring subsequences of X won't be able to match as neighbors." ] }, { "cell_type": "markdown", - "id": "fcf10a34-930a-4fce-86f8-4dfa207cad11", + "id": "33105406-fc83-4143-9345-af589a06a00a", "metadata": {}, "source": [ - "Then, we can use the `QuerySearch` class to search for the top `k` matches of this query in a collection of series. The training data for `QuerySearch` can be seen as the database in which want to search for the query on." + "First, we'll select a series from the dataset to use during fit. This is the series we want our neighbors to come from." ] }, { "cell_type": "code", - "execution_count": 4, - "id": "80eaab8f-204f-439f-84c8-ad3462f1575e", + "execution_count": 3, + "id": "a494a0be-4459-414d-9fc2-1400feefd171", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "match 0 : [195 26] with distance 0.1973741999473598 to q\n", - "match 1 : [92 23] with distance 0.20753669049486048 to q\n", - "match 2 : [154 22] with distance 0.21538593730366784 to q\n" + "(1, 150)\n" ] } ], "source": [ - "from aeon.similarity_search import QuerySearch\n", + "from aeon.similarity_search.series import MassSNN\n", "\n", - "# Here, the distance function (distance and normalise arguments)\n", - "top_k_search = QuerySearch(k=3, distance=\"euclidean\")\n", - "# Call fit to store X_train as the database to search in\n", - "top_k_search.fit(X_train)\n", - "distances_to_matches, best_matches = top_k_search.predict(q)\n", - "for i in range(len(best_matches)):\n", - " print(f\"match {i} : {best_matches[i]} with distance {distances_to_matches[i]} to q\")" - ] - }, - { - "cell_type": "markdown", - "id": "3dc402cf-80b7-4d0c-b07c-2f8e7822ac97", - 
"metadata": {}, - "source": [ - "The similarity search estimators return a list of size `k`, which contains a tuple containing the location of the best matches as `(id_sample, id_timestamp)`. We can then plot the results as:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "23efe48e-8257-4ecc-93a2-d72f19024ab5", - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "plot_best_matches(top_k_search, best_matches)" + "length = 35\n", + "# We'll take a sample of the class with a \"bump\".\n", + "series_fit = X[2]\n", + "print(series_fit.shape)\n", + "snn = MassSNN(length=length, normalize=False).fit(series_fit)" ] }, { "cell_type": "markdown", - "id": "877b1b32-d978-4c54-a4e7-b475496f710a", + "id": "320ef728-ca92-4fd5-9686-2b9739fcab83", "metadata": {}, "source": [ - "You may also want to search not for the top-k matches, but for all matches below a threshold on the distance from the query to a candidate. To do so, you can use the `threshold` parameter of `QuerySearch` :" + "Then we'll take a subsequence of size `length` in another series of the same class to use in `predict` :" ] }, { "cell_type": "code", - "execution_count": 6, - "id": "23ad7adb-2b01-4425-a2e8-c393f3721a0f", + "execution_count": 4, + "id": "98560db4-4289-4072-8662-2cde2ad5c44a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "match 0 : [195 26] with distance 0.1973741999473598 to q\n", - "match 1 : [92 23] with distance 0.20753669049486048 to q\n", - "match 2 : [154 22] with distance 0.21538593730366784 to q\n", - "match 3 : [176 25] with distance 0.21889484294879047 to q\n", - "match 4 : [23 20] with distance 0.22668346183441293 to q\n", - "match 5 : [167 23] with distance 0.24774491003815066 to q\n" + "match 0 : 27 with distance 0.3020071566139322\n", + "match 1 : 28 with distance 0.48913603040398357\n", + "match 2 : 26 with distance 0.889697094966067\n" ] }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\antoine\\Documents\\aeon\\aeon\\similarity_search\\query_search.py:270: UserWarning: Only 6 matches are bellow the threshold of 0.25, while k=inf. 
The number of returned match will be 6.\n", - " return extract_top_k_and_threshold_from_distance_profiles(\n" - ] - } - ], - "source": [ - "# Here, the distance function (distance and normalise arguments)\n", - "top_k_search = QuerySearch(k=np.inf, threshold=0.25, distance=\"euclidean\")\n", - "top_k_search.fit(X_train)\n", - "distances_to_matches, best_matches = top_k_search.predict(q)\n", - "for i in range(len(best_matches)):\n", - " print(f\"match {i} : {best_matches[i]} with distance {distances_to_matches[i]} to q\")" - ] - }, - { - "cell_type": "markdown", - "id": "0efd83a5-b36f-4809-be96-94de734d931c", - "metadata": {}, - "source": [ - "You may also combine the `k` and `threshold` parameter :" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "65db1593-3873-4a47-9e2a-d8dfcf42dd1a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "match 0 : [195 26] with distance 0.1973741999473598 to q\n", - "match 1 : [92 23] with distance 0.20753669049486048 to q\n", - "match 2 : [154 22] with distance 0.21538593730366784 to q\n" - ] - } - ], - "source": [ - "# Here, the distance function (distance and normalise arguments)\n", - "top_k_search = QuerySearch(k=3, threshold=0.25, distance=\"euclidean\")\n", - "top_k_search.fit(X_train)\n", - "distances_to_matches, best_matches = top_k_search.predict(q)\n", - "for i in range(len(best_matches)):\n", - " print(f\"match {i} : {best_matches[i]} with distance {distances_to_matches[i]} to q\")" - ] - }, - { - "cell_type": "markdown", - "id": "ff62a385-d58e-4fb1-95dd-eb0474711531", - "metadata": {}, - "source": [ - "It is also possible to return the **worst** matches (not that the title of the plots are not accurate here) to the query, by using the `inverse_distance` parameter :" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "6d6078ab-9104-462e-9856-1d0fc9594b24", - "metadata": {}, - "outputs": [ { "data": { - "image/png": "", + "image/png": "", 
"text/plain": [ - "
" + "
" ] }, "metadata": {}, @@ -368,141 +243,87 @@ } ], "source": [ - "# Here, the distance function (distance and normalise arguments)\n", - "top_k_search = QuerySearch(k=3, inverse_distance=True, distance=\"euclidean\")\n", - "top_k_search.fit(X_train)\n", - "distances_to_matches, best_matches = top_k_search.predict(q)\n", - "plot_best_matches(top_k_search, best_matches)" - ] - }, - { - "cell_type": "markdown", - "id": "b5240535-5123-4ac5-a5e0-e0502ef80b3e", - "metadata": {}, - "source": [ - "## Using the speed_up option for similarity search" + "series_predict = X[3]\n", + "starting_timestep_predict = 25\n", + "indexes, distances = snn.predict(\n", + " series_predict[:, starting_timestep_predict : starting_timestep_predict + length],\n", + " k=3,\n", + " allow_trivial_matches=True,\n", + ")\n", + "for i in range(len(indexes)):\n", + " print(f\"match {i} : {indexes[i]} with distance {distances[i]}\")\n", + "plot_best_matches(\n", + " series_fit, series_predict, starting_timestep_predict, indexes, length\n", + ")" ] }, { "cell_type": "markdown", - "id": "b5e13c31-2aa3-4987-8d44-8a296c81a318", + "id": "fcf10a34-930a-4fce-86f8-4dfa207cad11", "metadata": {}, "source": [ - "In the similarity search module, we implement different kind of optimization to decrease the time necessary to extract the best matches to a query. You can find more information about these optimization in the other notebooks of the similarity search module. An utility function is available to list the optimizations currently implemented in aeon :" + "The `predict` method returns two lists, containing the starting timesteps of the matches in `series_fit` and the squared euclidean distance of these matches to the subsequence we gave in `predict`. 
Now, you can then play with the different parameters of `predict` to customize your search results to your needs!\n", + "\n", + "It is also possible to get the distance profile which is used to extract the best matches :" ] }, { "cell_type": "code", - "execution_count": 9, - "id": "d22e2d74-f44d-4c81-ba1b-72d618bd5862", + "execution_count": 5, + "id": "7d2bd3f7-7eb9-4406-be1c-b6fcd9c76730", "metadata": {}, "outputs": [ { "data": { + "image/png": "", "text/plain": [ - "{'normalised euclidean': ['fastest', 'Mueen'],\n", - " 'euclidean': ['fastest', 'Mueen'],\n", - " 'normalised squared': ['fastest', 'Mueen'],\n", - " 'squared': ['fastest', 'Mueen']}" + "
" ] }, - "execution_count": 9, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ - "QuerySearch.get_speedup_function_names()" - ] - }, - { - "cell_type": "markdown", - "id": "bf12616c-6ace-478b-806f-5419c2c19f2b", - "metadata": {}, - "source": [ - "By default, the `fastest` option is used, which use the best optimisation available. You can change this behavior by using the values of t with the corresponding distance function and normalization options in the estimators, for example with a `QuerySearch` using the `normalised euclidean` distance:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "6313f26a-5788-42dc-881a-40746458414c", - "metadata": {}, - "outputs": [], - "source": [ - "top_k_search = QuerySearch(distance=\"euclidean\", normalise=True, speed_up=\"Mueen\")" - ] - }, - { - "cell_type": "markdown", - "id": "6ab51d84-7220-4333-b50e-2db695eaf45d", - "metadata": {}, - "source": [ - "For more information on these optimizations you can refer to the [distance profile notebook](distance_profiles.ipynb) for the theory, and to the [analysis of the speedups provided by similarity search module](code_speed.ipynb) for a comparison of their performance." + "distance_profile = snn.compute_distance_profile(\n", + " series_predict[:, starting_timestep_predict : starting_timestep_predict + length],\n", + ")\n", + "plt.figure(figsize=(5, 3))\n", + "plt.plot(distance_profile)\n", + "plt.show()" ] }, { "cell_type": "markdown", - "id": "4149c40f", + "id": "b5240535-5123-4ac5-a5e0-e0502ef80b3e", "metadata": {}, "source": [ - "# Series search\n", - "For series search, we are not interest in exploring the relationship of the input dataset `X` (given in `fit`) and a single query, but to all queries of size `query_length` that exists in another time series `T`. 
For example, with using again our simple GunPoint dataset:" + "### 1.2 Motif search with STOMP" ] }, { "cell_type": "code", - "execution_count": 11, - "id": "d510c4cc", + "execution_count": 6, + "id": "ff23faf5-2941-441a-8c4c-0cf66eaca121", "metadata": {}, "outputs": [ { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAABJ4AAAJcCAYAAAC4425vAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAAsTAAALEwEAmpwYAADT3klEQVR4nOzdd3hUZcLG4WdKeu+9EELvEKoKiAURxa4oKNiwrvWzrLrq2svquq6uig0UBduuYAE7iiAlVAGBAOkFUgikl5nz/RGIBAhFSc4k+d3XlWvamXOemQyT4Zn3vMdiGIYhAAAAAAAA4Dizmh0AAAAAAAAA7RPFEwAAAAAAAFoExRMAAAAAAABaBMUTAAAAAAAAWgTFEwAAAAAAAFoExRMAAAAAAABaBMUTAAAdWFZWlnx9feVwOMyO0uJmzJihE088sUXWvXDhQsXGxv7h+/v6+mr79u2SpKlTp+qBBx5odlmLxaKtW7ce9bqPdXlX4kqvz//973+Ki4uTr6+vVq9erV69emnhwoWSpIcffliTJ082NyAAAC6K4gkAgA4sPj5e5eXlstlsx22dq1evlr+/f5OyY+XKlQoMDFRGRsZx287xlJGRIYvFovr6elO2X15erqSkpD+9ntGjR+uNN944DolaXmJior799tvDLtMSr88/6v/+7//00ksvqby8XAMGDNCGDRs0evRos2MBAODyKJ4AAOigWqpkGTBggG6++WZde+21MgxDdXV1uuqqq/TII48oMTGxRbaJ9qc1S8Cj2VZmZqZ69erVCmkAAGhfKJ4AAGiDnn76acXExMjPz0/dunXTd999J0lyOp166qmn1LlzZ4WEhOjiiy9WSUmJpN9H9bz55puKj4/XmDFjDhrps3v3bl199dWKiopSTEyMHnjggcbdnLZu3apRo0YpICBAoaGhuuSSS5rN99BDDyk/P1/Tp0/XE088IV9fX918882HXPZQo432H7lzuO1u2rRJp512moKDg9WtWzd9+OGHjbcVFxdrwoQJ8vf315AhQ7Rt27Zm844cOVKSFBgYKF9fX/3yyy9yOp167LHHlJCQoPDwcF1xxRXavXt3878USU888YRCQ0OVmJio995775CPRzp4t7/D7Q737LPPKioqStHR0Xrrrbea3fb999+vRYsW6eabbz7o+f7222/VpUsXBQYG6qabbpJhGI23vfXWW+rRo4eCgoI0duxYZWZmHnL9+35Pb7/9tuLi4hQUFKRXX31VK1asUN++fRUYGNhkm9u2bdOYMWMUEhKi0NBQTZo0SaWlpZKkyy+/XFlZWTr77LPl6+urZ5555oivz5KSEsXGxuqzzz6T1DBKLDk5We+8884h844ePVp//etfNWTIEPn7++ucc8457L+F5n7fNTU1jbv79evXT507d5Z0+BFbS5cu1YgRIxQYGKh+/fo17pIHAECHZAAAgDZl06ZNRmxsrJGbm2sYhmGkp6cbW7duNQzDMF544QVj6NChRnZ2tlFdXW1MmzbNmDhxYuNykozLL7/cKC8vNyorKxuvq6urMwzDMM4991xj2rRpRnl5ubFjxw5j8ODBxquvvmoYhmFMnDjReOyxxwyHw2FUVVUZixYtOmzOn3/+2QgICDD8/PyM3377rdnlDsxgGIYxatQo4/XXXz/sdsvLy43Y2FjjrbfeMurq6oxVq1YZISEhxoYNGwzDMI
xLLrnEuOiii4zy8nLj119/NaKjo40TTjjhqDO8+eabRufOnY1t27YZZWVlxnnnnWdMnjz5kPf/4YcfDJvNZtx+++1GdXW1sXDhQsPb29vYtGnTQY/HMAzj7bffbpJFkpGWlmYYhmFMmTLFuP/++w3DMIz58+cb4eHhxq+//mqUl5cbl156aZNlD3Tgdvate/z48cauXbuMzMxMIzQ01Jg/f75hGIbx6aefGp07dzY2btxo1NXVGY8++qgxfPjwwz5H1113nVFVVWV89dVXhoeHh3HOOecYO3bsMHJycoywsDBj4cKFhmEYRlpamvH1118b1dXVxs6dO42TTjrJuPXWWxvXl5CQYHzzzTcHrf9wr8+vvvrKiIiIMHbs2GFcc801xgUXXHDIrPuei+jo6Mbn7vzzzzcmTZrU7LaO9Ps+8HnfP/9DDz3UuO6cnBwjODjY+OKLLwyHw2F8/fXXRnBwsLFz585mswIA0J4x4gkAgDbGZrOppqZGGzduVF1dnRITExtHYbz66qt6/PHHFRsbKw8PDz388MP6+OOPm4wmevjhh+Xj4yMvL68m692xY4e+/PJLvfDCC/Lx8VF4eLhuv/12zZkzR5Lk5uamzMxM5eXlydPT84gTdffu3Vt2u119+vRR9+7d//DjbW67n3/+uRITE3XllVfKbrdrwIABuuCCC/TRRx/J4XDok08+0SOPPCIfHx/17t1bU6ZMOabtvvfee7rjjjuUlJQkX19fPfnkk5ozZ85hd8t69NFH5eHhoVGjRmn8+PFNRmD9ER9++KGuvPJK9e7dWz4+Pnr44Yf/0HruvfdeBQYGKj4+XieffLLWrFkjqeH18te//lU9evSQ3W7XfffdpzVr1jQ76kmS/va3v8nT01Onn366fHx8dOmllyo8PFwxMTE66aSTtHr1aklScnKyTjvtNHl4eCgsLEx33HGHfvzxxyNmbe71KUmnn366LrroIp1yyin68ssv9dprrx12XZdffnnjc/foo4/qww8/bDJR+f7b+iO/70OZNWuWzjzzTJ155pmyWq067bTTlJKSoi+//PKY1gMAQHtB8QQAQBuTnJysF154QQ8//LDCw8M1ceJE5eXlSWqYh+a8885TYGCgAgMD1aNHD9lsNu3YsaPx/nFxcYdcb2Zmpurq6hQVFdV4/+uuu047d+6UJD3zzDMyDENDhgxRr169DrvblyTdeeedGjVqlHJychrLqz+iue1mZmZq2bJljVkDAwP13nvvqaCgQIWFhaqvr2/yWBMSEo5pu3l5eU3uk5CQoPr6+ibP5f6CgoLk4+PTZPl9v5c/Ki8v7089hn0iIyMbz3t7e6u8vFxSw3N46623Nj5/wcHBMgxDubm5za4rIiKi8byXl9dBl/ete8eOHZo4caJiYmLk7++vyZMnq6io6IhZm3t97jNt2jStX79eU6dOVUhIyFGvKyEhQXV1dU0y7H/7sf6+m5OZmamPPvqoyevy559/Vn5+/jGtBwCA9oLiCQCANuiyyy7Tzz//rMzMTFksFt1zzz2SGv4jPX/+fJWWljb+VFdXKyYmpvG+FovlkOuMi4uTh4eHioqKGu+7Z88ebdiwQVJDefH6668rLy9Pr732mm688cZm5yX69ttvNW/ePL322mt65ZVXdOuttzbOr3OgfWVNZWVl43UFBQWN55vbblxcnEaNGtXksZaXl+uVV15RWFiY7Ha7srOzG9eTlZXV7PN5qOckOjq6ycifrKws2e32JkXL/nbt2qWKioomy0dHRzc+xuYe3+FERUUd9WNo7nEcTlxcnF577bUmz2FVVZVGjBhxTOs5lPvuu08Wi0W//vqr9uzZo1mzZjWZW6q5rId7DA6HQ9OmTdMVV1yh//znP82+/vY58Llzc3NTaGjoIbd1rL/v5sTFxenyyy9v8pxWVFTo3nvvPab1AADQXlA8AQDQxmzevFnff/+9ampq5OnpKS8vL1mtDX/Sr7/+et1///2N/4EuLCzU3L
lzj2q9UVFROv3003XnnXdqz549cjqd2rZtW+PuUR999JFycnIkNYzusVgsjdvdX0VFhaZNm6Z//vOfCg0N1ZlnnqnTTjtNt99++yG3GxYWppiYGM2aNUsOh0NvvfVWk4nAm9vuWWedpS1btujdd99VXV2d6urqtGLFCv3222+y2Ww6//zz9fDDD6uyslIbN27UzJkzm33sYWFhslqt2r59e+N1l156qf75z38qPT1d5eXluu+++3TJJZfIbrc3u56HHnpItbW1WrRokT7//HNddNFFkqT+/fvrv//9ryorK7V161a9+eabza5jfxdffLFmzJihjRs3qrKyUn//+98Pu3xERESTx3Ak119/vZ588snGcnH37t366KOPjvr+h1NWViZfX18FBAQoNzdXzz777J/KKjVM3m6xWPTWW2/prrvu0hVXXNFk17kDzZo1q/G5e/DBB3XhhRfKZrMdctk/8vs+lMmTJ+uzzz7TV199JYfDoerqai1cuLDxNQwAQEdD8QQAQBtTU1Oje++9V6GhoYqMjNTOnTv15JNPSpJuvfVWTZgwQaeffrr8/Pw0bNgwLVu27KjX/c4776i2tlY9e/ZUUFCQLrzwwsZdhFasWKGhQ4fK19dXEyZM0L/+9S8lJSUdtI777rtP3bt316RJkxqve+GFFzR//nx98803h9zu66+/rmeffVYhISHasGFDkxE3zW3Xz89PX3/9tebMmaPo6GhFRkbqnnvuUU1NjSTppZdeUnl5uSIjIzV16lRdeeWVzT5ub29v3X///TrhhBMUGBiopUuX6qqrrtLll1+ukSNHqlOnTvL09NS///3vZtcRGRmpoKAgRUdHa9KkSXr11Vcb57a6/fbb5e7uroiICE2ZMqXJc3M448aN02233aYxY8YoOTlZY8aMOezyt956qz7++GMFBQXplltuOeL6zzvvPN1zzz2aOHGi/P391bt3b82fP/+osh3JQw89pFWrVikgIEDjx4/X+eef3+T2v/71r3rssccUGBiof/zjH0dc38qVK/X888/rnXfekc1m0z333COLxaKnnnqq2ftcfvnlmjp1qiIjI1VdXa0XX3yx2WWP9ffdnLi4OM2dO1dPPPGEwsLCFBcXp2effVZOp/OY1wUAQHtgMfYf8wwAAAC0A6NHj9bkyZN1zTXXmB0FAIAOjRFPAAAAAAAAaBEUTwAAAAAAAGgR7GoHAAAAAACAFsGIJwAAAAAAALSIYzs+bBsVGhqqxMREs2MAAAAAAAC0GxkZGSoqKjrsMh2ieEpMTFRqaqrZMQAAAAAAANqNlJSUIy7DrnYAAAAAAABoERRPAAAAAAAAaBEUTwAAAAAAAGgRFE8AAAAAAABoERRPAAAAAAAAaBEUTwAAAAAAAGgRFE8AAAAAAABoERRPAAAAAAAAaBF2swMAAAA0p97hVHW9UzV1DtU5DNU7nap3GKp3GnI4m7/ccL7h1NPNKh8Pu7zcbPLxsMvb3SZvd5s87DbVOZyqrXeq1uFUTZ1TtQ6HauqdkiR/TzcFeLvJ190uq9Vi8jMBAADQNlE8AQCAP62m3qHdlXUqrarTropa7a6qU63DqTqHU3X1RuP52nqnauqdKquuU1l1vcpq6htO916urKlvLJqq651yOA2zH5qsFsnP003+XnYFeLnJw25rvM0wfs93YNL9bjroNjVzP+OABY39bj3wNne7VYFebgrydleAt5sCvdwV6O2mQG83hfh4KNzfQ+F+HgrwcpPFQnEGAADMQfEEAACOyDAMFZbVaFNBmbbsKNOmgjKl7ShTYVmNSqvqVFnrOKb1ebpZ5efpJj9Pu/w87PLzdFOkv6d8POzydLPKw2476NTdbpXNapHdatl7apXd1vSyzWqRm+33yxZLQylWWbvvp77htMahmnqH3GxWudt/X7+73Sp3m1WSoT1V9dpTXafdVXXaU9VwuruqTnWOpg3Q4Tqd/QsfS5PrD1iumfsc7n7VdU4Vlddqa2G5Si
sbirtDcbdbFe7XUEKF+XnI290ud5tVbnZL4+N3t1llt1rlcDpVv3e0WJ2jofjb93h9PWzy9XCT797fma+nXb4eDWVciK+7Qn095OlmO2QGAADQcVE8AQCAg5RU1GpFRolWpJdofd5ubS4o067Kusbbw/w81C3CT10i/BTo5bZ3pM3eETde7vL3ssvTzdZYbLjZLA1lR+Nlppk83uodTu2prteuyloVldVo576fPdV7z1crvahC1XUNI8/2382w1uFsHFG1r7hz21fs2awyDKmipl5VdYcvGH3cbQrx9VCIr7tCfDwUFeCphBBvxQV7N5wGecvHg4+fAAB0JPzlBwAAyiut0oqMEi1Lbyib0naWS2oYLdMzyl9je0WqW6SfukX6qXukv4J93E1OjAPZbVYF+7gr2MddncN8j+m+hmHIaTTsVni43fLqHU5V1DhUXluv8up6ldfUaVdFnUoqalVUUaPi8loVl9eoqLxWObsqtSy9+KCRWKG+7ooP9tbQpBCd2iNCA+ICmUMLAIB2jOIJAIAOqrrOoc/W5mnGkgxtyNsjSfL1sGtQQpDOHRCjoZ2C1Sc2oMmcRmifLBaLbEfR/dhtVgV4WxXg7XbU6y6trFVWSaWySiqVWVyp7JJKbSss1+s/bdcrC7cp1NdDp/YI16k9InRil1B21wMAoJ2heAIAoIPJ312lWUszNXt5tkoqatUl3FcPjO+hYUkh6hHlLxujT3AcNeyC6a6+sYFNrt9dVaeFm3fqm4079MW6fM1ZkS1PN6tGdgnTtJFJSkkMNicwAAA4riieAADoAAzD0MrMXXp7SYYWrC+Q0zB0ao8IXTkiUcM7h3DUM7S6AC83ndM/Ruf0j1FtvVPL0ov1zcYd+vLXfF346i8a3S1Md57WTX1iA8yOCgAA/gSLYRx4cN72JyUlRampqWbHAADAFIZh6MG5G/Tu0kz5e9o1cUi8Lh+WoLhgb7OjAQepqnVo5i8ZevXHbSqtrNMZvSJ1x+ld1TXCz+xoAADgAEfTtzDiCQCAdswwDD3y+Ua9uzRTV53QSf83tqu83fnzD9fl5W7T9aM667Kh8Xrr53S9sShdX20s0Ln9Y3TrKV2UGOpjdkQAAHAMOJYxAADtlGEYemrBJr29OENXnpCov53Vg9IJbYa/p5tuO7WrFt19sqaNTNL89fk6/Z8/afbyLLOjAQCAY0DxBABAO/XCt2l67cftmjwsXg+e1ZN5nNAmBfm466/jeuinu07WsM4h+ut/f9U9H69TdZ3D7GgAAOAoUDwBANAOvfzDVv3ruzRdnBKrRyb0pnRCmxfu76m3pw7WX8Yk64PUbF306i/K2VVpdiwAAHAEFE8AALQzbyzarme/2qxz+0fryfP7ymqldEL7YLNadOfp3fT6FSnKKKrQ2f/+WYvSCs2OBQAADoPiCQCAduSdXzL02Be/aXyfKP3jon6yUTqhHTqtZ4Tm/eVEhft5aspby/XyD1vldLb7AzUDANAmUTwBANBOLN1erAfnbtBpPSP0wsT+stv4M4/2q1Ooj/530wid1Tdaz361WbfMWU35BACAC+ITKQAA7cRrP25TqK+7/n3pALlROqED8Ha3618T++v/Tu+qz9fl65Uft5kdCQAAHIBPpQAAtANbd5brh82FunxYojzdbGbHAVqNxWLRTScn6+x+0Xru681asq3I7EgAAGA/FE8AALQDby1Ol7vdqknD4s2OArQ6i8WiJ8/vo8RQH90ye4127qk2OxIAANiL4gkAgDaupKJW/12Vo/MHxCjU18PsOIApfD3semXSIJXX1Okvs1er3uE0OxIAABDFEwAAbd77yzJVXefUVSd2MjsKYKpukX564rw+WpZeoue+2WJ2HAAAIIonAADatJp6h2b+kqmRXcPUNcLP7DiA6c4fGKtLh8TplYXb9N1vO8yOAwBAh0fxBABAG/b52nwVltXoGkY7AY0eOruXekb5644P1yq7pNLsOAAAdGguVTxdddVVCg
8PV+/evQ95+8KFCxUQEKD+/furf//+euSRR1o5IQAArsMwDL3xc7q6RvjqpC6hZscBXIanm02vTB4op2HopvdXqabeYXYkAAA6LJcqnqZOnaoFCxYcdpmTTjpJa9as0Zo1a/Tggw+2UjIAAFzPL9uL9Vv+Hl19YidZLBaz4wAuJSHER89e2E/rcnbryS83mR0HAIAOy6WKp5EjRyo4ONjsGAAAtAlvLkpXiI+7zukfY3YUwCWd0TtSU0ckasaSDC3eWmR2HAAAOiSXKp6Oxi+//KJ+/fpp3Lhx2rBhQ7PLTZ8+XSkpKUpJSVFhYWErJgQAoOVtLyzXd5t2avKwBHm62cyOA7ise87orqRQH9398TqVVdeZHQcAgA6nTRVPAwcOVGZmptauXau//OUvOvfcc5tddtq0aUpNTVVqaqrCwsJaLyQAAK3grcXpcrdZNXlYgtlRAJfm5W7TPy7up/zdVXrs89/MjgMAQIfTpoonf39/+fr6SpLOPPNM1dXVqaiIYdMAgI6ltLJWH6/M0bkDohXm52F2HMDlDYwP0vWjOuuD1Gx9v2mH2XEAAOhQ2lTxVFBQIMMwJEnLly+X0+lUSEiIyakAAGhd7y3LUnWdU1ed2MnsKECbceupXdQ90k/3fPKrdlXUmh0HAIAOw252gP1deumlWrhwoYqKihQbG6u///3vqqtr2Bf/+uuv18cff6xXXnlFdrtdXl5emjNnDkfxAQB0KLX1Tr3zS4ZO6hKq7pH+ZscB2gwPu03/uKifzn15sR6at0EvXjrA7EgAAHQILlU8zZ49+7C333zzzbr55ptbKQ0AAK7n640F2rGnRk+e38fsKECb0zsmQLec0kXPf7NFZ/SO1Jl9osyOBABAu9emdrUDAKCjm7U0U7FBXhrVNdzsKECbdMPozuobG6AHPl2vwrIas+MAANDuUTwBANBGbN1ZpqXbS3TZ0HjZrOxqDvwRbjarnruon8pr6nX//35tnD8UAAC0DIonAADaiFlLs+Rms+jilDizowBtWpcIP911ejd9vXGH5q3NMzsOAADtGsUTAABtQFWtQ5+sytG43lEK9fUwOw7Q5l11Yif1iw3Qk19uUmVtvdlxAABotyieAABoAz5bm6ey6npNHpZgdhSgXbBZLfrbWT1VsKda03/abnYcAADaLYonAADagFnLMtU1wleDE4PMjgK0GymJwRrfJ0qv/bhdBburzY4DAEC7RPEEAICLW5dTqnU5uzVpaIIsFiYVB46ne8d1l8Np6NmvNpsdBQCAdoniCQAAFzdraaa83Gw6b2CM2VGAdicu2FtXnpioT1bl6Nec3WbHAQCg3aF4AgDAhe2urNO8tXk6d0C0/D3dzI4DtEs3n5ysEB93PfrFRhmGYXYcAADaFYonAABc2CerclRd59SkoUwqDrQUP0833XF6Vy1PL9FXGwrMjgMAQLtC8QQAgIsyDEPvLctU/7hA9Y4JMDsO0K5dkhKnrhG+euLLTaqpd5gdBwCAdoPiCQAAF/XL9mJtK6zQ5GGMdgJamt1m1QPjeyqrpFLvLMk0Ow4AAO0GxRMAAC7qvaVZCvBy01l9o8yOAnQII7uGaXS3ML34fZqKy2vMjgMAQLtA8QQAgAvauadaX20o0EWDYuXpZjM7DtBh3H9mD1XWOvSv79LMjgIAQLtA8QQAgAv6YEW26p2GJrGbHdCqukT46bIh8XpvWZa27iwzOw4AAG0exRMAAC7G4TQ0e3mWTkwOVadQH7PjAB3O7ad1lbebTU/N32x2FAAA2jyKJwAAXMz3m3Yqb3e1Jg+LNzsK0CEF+7jr+tGd9e1vO7Q8vcTsOAAAtGkUTwAAuJhZSzMV4e+hU3tEmB0F6LCuOqGTIvw99OT832QYhtlxAABosyieAABwIVnFlfoprVATB8fLbuPPNGAWL3eb7jitq1ZnlWrB+gKz4wAA0GbxiRYAABfy3vJMWS0WXTqE3ewAs10wMFZdwn31zFebVedwmh0HAI
A2ieIJAAAXUVPv0EepOTq1R7giAzzNjgN0eHabVfeO6670ogrNWZFtdhwAANokiicAAFzEgvUFKqmo1eRhCWZHAbDXmO7hGtIpWP/6dovKa+rNjgMAQJtD8QQAgIuYtTRTCSHeOqFzqNlRAOxlsVh035k9VFReq9d/2m52HAAA2hyKJwAAXMCmgj1akbFLk4bGy2q1mB0HwH76xwVqfJ8ovb5ou3aWVZsdBwCANoXiCQAAF/De0iy52626aFCc2VEAHMJdY7uptt6pf32bZnYUAADaFIonAABMVlFTr/+tztVZfaIU5ONudhwAh5AY6qNJQ+M1Z0W2thWWmx0HAIA2g+IJAACTfbomV+U19ZrEpOKAS/vLKV3kabfq2QWbzY4CAECbQfEEAICJDMPQrKVZ6hHlr4HxgWbHAXAYob4eum5UZy3YUKC12aVmxwEAoE2geAIAwESrskr1W/4eTR4WL4uFScUBV3fViZ0U4OWml37YanYUAADaBIonAABM9N7STPl62HVu/xizowA4Cr4edl11Qid9s3GHfsvfY3YcAABcHsUTAAAm2VVRq89/zdd5A2Lk42E3Ow6AozR1RKJ8Pex66XtGPQEAcCQUTwAAmOSjldmqrXdqMpOKA21KgLebpoxI0Jfr87V1Z5nZcQAAcGkUTwAAmMDpNPTesiwNTgxSt0g/s+MAOEZXn5gkT7tNL/+wzewoAAC4NIonAABM8PPWImUWVzLaCWijgn3cNXlYvOauyVVGUYXZcQAAcFkUTwAAmGDW0kyF+LjrjN6RZkcB8AddOzJJbjarXlnIqCcAAJrjUsXTVVddpfDwcPXu3fuQtxuGoVtuuUXJycnq27evVq1a1coJAQD48/J3V+nb33boopQ4edhtZscB8AeF+3nq0iHx+mRVjnJ2VZodBwAAl+RSxdPUqVO1YMGCZm+fP3++0tLSlJaWpunTp+uGG25oxXQAABwfs5dny5A0aWi82VEA/EnTRibJYpFe/ZFRTwAAHIpLFU8jR45UcHBws7fPnTtXV1xxhSwWi4YNG6bS0lLl5+e3YkIAAP6cOodTc5ZnaVTXMMUFe5sdB8CfFB3opQsHxenDFTnasafa7DgAALgclyqejiQ3N1dxcXGNl2NjY5Wbm3vIZadPn66UlBSlpKSosLCwtSICAHBY327coZ1lNZo8lEnFgfbixtGd5TAMvfbjdrOjAADgctpU8XQspk2bptTUVKWmpiosLMzsOAAASJJmLctUdICnTu4ebnYUAMdJXLC3zu0fo/eXZ6qovMbsOAAAuJQ2VTzFxMQoOzu78XJOTo5iYmJMTAQAwNHbXliuxVuLdemQeNmsFrPjADiObjq5s2rrnXp9EaOeAADYX5sqniZMmKB33nlHhmFo6dKlCggIUFRUlNmxAAA4Ku8vy5LdatElQ+KOvDCANiUpzFdn9Y3WrF8ytbuqzuw4AAC4DLvZAfZ36aWXauHChSoqKlJsbKz+/ve/q66u4Q/39ddfrzPPPFNffvmlkpOT5e3trbffftvkxAAAHJ3qOoc+Wpmjsb0iFe7naXYcAC3g+lGdNW9tnt5blqkbRyebHQcAAJfgUsXT7NmzD3u7xWLRyy+/3EppAAA4fj5fl6/dVXWaNCze7CgAWkjPaH+d1CVUby/O0NUndpKH3WZ2JAAATNemdrUDAKCtmrU0U53DfDQ8KcTsKABa0LSRSSosq9Hc1XlmRwEAwCVQPAEA0MLW5+7WmuxSTRqaIIuFScWB9uzE5FD1jPLX9EXb5XQaZscBAMB0FE8AALSw95ZlytPNqgsGxZodBUALs1gsmjYySVt3luuHzTvNjgMAgOkongAAaEF7quv06eo8TegXrQAvN7PjAGgF4/tGKSbQS6/9uN3sKAAAmI7iCQCAFvS/VbmqqnNo8rAEs6MAaCVuNquuOrGTlmeUaHXWLrPjAABgKoonAABaiGEYmrU0U31jA9Q3NtDsOABa0cTBcfL3tGv6T4x6Ag
B0bBRPAAC0kOXpJUrbWa7JQxntBHQ0Ph52TR6WoAUbCpRRVGF2HAAATEPxBABAC5m1LEv+nnad3S/a7CgATDB1RKLcrFa98TOjngAAHRfFEwAALaCwrEYL1ufrgkGx8nK3mR0HgAnC/T113oAYfZSao+LyGrPjAABgCoonAABawIep2apzGJrEbnZAh3btyE6qqXfqnV8yzY4CAIApKJ4AADjOHE5D7y/L0vCkECWH+5odB4CJksP9dGqPcL3zS4aqah1mxwEAoNVRPAEAcJz9uGWnckurNHkYo50ASNeN6qxdlXX6aGW22VEAAGh1FE8AABxns5ZmKczPQ6f3ijA7CgAXkJIQpAHxgXpjUbocTsPsOAAAtCqKJwAAjqPskkr9sHmnLkmJk5uNP7MAJIvFoutGJimrpFIL1heYHQcAgFbFJ2IAAI6j2cuzZJF06dB4s6MAcCGn9YxUYoi3pv+0TYbBqCcAQMdB8QQAwHFSW+/Uh6nZGtM9XDGBXmbHAeBCbFaLrjkpSWtzdmtZeonZcQAAaDUUTwAAHCdfbShQUXmtJjGpOIBDuHBQrEJ83DX9p+1mRwEAoNVQPAEAcJzMWpqpuGAvjeoSZnYUAC7I082mK4Yn6vtNO7VlR5nZcQAAaBUUTwAAHAdpO8q0LL1Elw1JkNVqMTsOABd1+fAEebpZGfUEAOgwKJ4AADgO3luWJXebVRenxJodBYALC/Zx1yUpcZq7JlcFu6vNjgMAQIujeAIA4E+qrK3XJytzNK5PpEJ8PcyOA8DFXXNSkhxOQ28vSTc7CgAALY7iCQCAP+nztfkqq6nXZCYVB3AU4oK9Na5PlN5fmqWy6jqz4wAA0KIongAA+JPmrMhS5zAfpSQEmR0FQBtx3cgkldXUa87ybLOjAADQoiieAAD4E7buLNOqrFJdMjhOFguTigM4On1jAzUsKVhv/pyu2nqn2XEAAGgxFE8AAPwJH6zIlt1q0fkDmVQcwLG5bmRnFeyp1mdr88yOAgBAi6F4AgDgD6qtd+qTVbk6tUeEQplUHMAxGt0tTF0jfPX6ou0yDMPsOAAAtAiKJwAA/qDvftuhkopaXTI4zuwoANogi8WiaSM7a1NBmX7cUmh2HAAAWgTFEwAAf9CcFdmK9PfUyK5hZkcB0EZN6BetSH9PvfrjNrOjAADQIiieAAD4A/JKq/RTWqEuSomVzcqk4gD+GHe7Vdec1ElLt5doZWaJ2XEAADjuKJ4AAPgDPl6ZI8OQLk5hNzsAf86lQ+IV5O2m//zAqCcAQPtD8QQAwDFyOg19mJqtE5JDFBfsbXYcAG2cj4ddU0d00nebduq3/D1mxwEA4LiieAIA4Bgt2VasnF1VjHYCcNxMGZEgH3ebXlnIqCcAQPtC8QQAwDGasyJLAV5uGtsr0uwoANqJQG93TR6WoM/X5SmzuMLsOAAAHDcUTwAAHINdFbX6esMOnTcgRp5uNrPjAGhHrj6xk+w2q179cbvZUQAAOG5crnhasGCBunXrpuTkZD311FMH3T5jxgyFhYWpf//+6t+/v9544w0TUgIAOqpP1+Sq1uFkNzsAx124v6cuGhSrT1bmaMeearPjAABwXLhU8eRwOHTTTTdp/vz52rhxo2bPnq2NGzcetNwll1yiNWvWaM2aNbrmmmtMSAoA6IgMw9AHK7LVNzZAPaP9zY4DoB26bmRn1TudemMRo54AAO2DSxVPy5cvV3JyspKSkuTu7q6JEydq7ty5ZscCAECStC5ntzYVlDHaCUCLiQ/x1oR+0XpvWZZ2VdSaHQcAgD/NpYqn3NxcxcX9/mE+NjZWubm5By33ySefqG/fvrrwwguVnZ19yHVNnz5dKSkpSklJUWFhYYtlBgB0HB+kZsvTzaoJ/aPNjgKgHbthdLIqax2a+UuG2VEAAPjTXKp4Ohpnn322MjIytG7dOp122mmaMmXKIZebNm2aUlNTlZqaqrCwsFZOCQBob6pqHZq3Jk9n9omSv6eb2X
EAtGPdIv10ao8Ivb04Q+U19WbHAQDgT3Gp4ikmJqbJCKacnBzFxMQ0WSYkJEQeHh6SpGuuuUYrV65s1YwAgI7pu007VF5TrwsHxZodBUAHcOPJnbW7qk6zl2WZHQUAgD/FpYqnwYMHKy0tTenp6aqtrdWcOXM0YcKEJsvk5+c3np83b5569OjR2jEBAB3QZ2vzFO7noaGdQsyOAqADGBgfpBGdQ/T6ou2qqXeYHQcAgD/MpYonu92ul156SWPHjlWPHj108cUXq1evXnrwwQc1b948SdKLL76oXr16qV+/fnrxxRc1Y8YMc0MDANq9PdV1+mFzoc7sEyWb1WJ2HAAdxI2jk7WzrEafrDx4zlMAANoKi2EYhtkhWlpKSopSU1PNjgEAaKM+WZmjOz9aq09uGKFBCUFmxwHQQRiGoXNfXqzSqjp9d8co2W0u9Z0xAABH1bfw1wsAgCP4fF2eYgK9NDA+0OwoADoQi8WiG0YnK7O4Ul/8mn/kOwAA4IIongAAOIxdFbValFaks/pFyWJhNzsArev0nhFKDvfVKwu3qQPsqAAAaIcongAAOIwFGwpU7zR0dt9os6MA6ICsVotuHN1ZmwrK9P2mnWbHAQDgmFE8AQBwGJ+tzVOnUB/1ivY3OwqADursftGKCfTSyz9sZdQTAKDNoXgCAKAZO8uqtXR7sc7uy252AMzjZrPq+lFJWpVVqmXpJWbHAQDgmFA8AQDQjPm/FshpNIw2AAAzXZQSp1Bfd738w1azowAAcEwongAAaMZna/PULcJPXSL8zI4CoIPzdLPp6hOTtCitSL/m7DY7DgAAR43iCQCAQ8grrVJq5i6d3S/K7CgAIEmaPCxefp52/Wcho54AAG0HxRMAAIfwxbp8SdJZHM0OgIvw83TTlOGJWrChQFt3lpsdBwCAo0LxBADAIXy2Lk99YwOUGOpjdhQAaHTlCYnysFv16o/bzI4CAMBRoXgCAOAAGUUVWpezW2f1ZTc7AK4lxNdDEwfH69PVucrZVWl2HAAAjojiCQCAA3zxa8NuduPZzQ6AC5o2MkmS9PpP201OAgDAkVE8AQBwgM/W5iklIUgxgV5mRwGAg0QHeun8gTGasyJbO8uqzY4DAMBhUTwBALCfLTvKtKmgTGf3Y7QTANd14+hkOZyG/vVtmtlRAAA4LIonAAD28/naPFkt0rg+kWZHAYBmJYb6aNLQeM1Zkc0R7gAALo3iCQCAvRxOQ/PW5mlYUojC/TzNjgMAh/WXU7rIy82mZxZsMjsKAADNongCAGCvj1dmK6O4UpOHJZgdBQCOKNTXQ9ePStLXG3doRUaJ2XEAADgkiicAACRV1NTrH19v0aCEII3rzW52ANqGq09MUoS/h5748jcZhmF2HAAADkLxBACApOk/bVdhWY3uH99DFovF7DgAcFS83G2687RuWp1VqvnrC8yOAwDAQSieAAAd3o491Zr+03aN7xulgfFBZscBgGNywaBYdY3w1TMLNqm23ml2HAAAmqB4AgB0eM99vVn1TqfuGdvd7CgAcMxsVov+Oq6HMoorNXt5ltlxAABoguIJANCh/Za/Rx+tzNGU4YmKD/E2Ow4A/CGju4VpeFKI/vVdmsqq68yOAwBAI4onAECHZRiGnvjyN/l7uukvY7qYHQcA/jCLxaK/ntldJRW1eu3H7WbHAQCgEcUTAKDD+nFLoRalFemWU7oowNvN7DgA8Kf0jQ3UhH7ReuPn7SrYXW12HAAAJFE8AQA6qHqHU098+ZsSQrx1+bAEs+MAwHFx19hucjql57/ZbHYUAAAkUTwBADqoj1bmaMuOct17Rne52/lzCKB9iAv21hXDE/TRyhytzS41Ow4AABRPAICOp6KmXs99vUWDEoJ0Ru9Is+MAwHF166ldFObroQc+XS+H0zA7DgCgg6N4AgB0OK/9tF1F5TW6f3wPWSwWs+MAwHHl5+mmv53VU7/m7taspZlmxwEAdHAUTwCADqWytl4zFqdrbK
8IDYwPMjsOALSIs/pG6aQuofrHV5u1cw8TjQMAzEPxBADoUP63Old7qut1zUlJZkcBgBZjsVj0yDm9VeNw6tEvfjM7DgCgA6N4AgB0GIZhaOaSDPWM8ldKAqOdALRvnUJ9dOPozvpsbZ4WpRWaHQcA0EFRPAEAOoxfthdry45yTT0hkbmdAHQI14/qrMQQbz04d4Oq6xxmxwEAdEAUTwCADmPG4gwFebtpQr9os6MAQKvwdLPp0XN7K72oQq/+uM3sOACADojiCQDQIWSXVOrb33bo0iHx8nSzmR0HAFrNSV3CdHa/aP1n4TZlFFWYHQcA0MFQPAEAOoRZSzNlsVg0eViC2VEAoNX9bXwPedis+tvc9TIMw+w4AIAOxOWKpwULFqhbt25KTk7WU089ddDtNTU1uuSSS5ScnKyhQ4cqIyOj9UMCANqUqlqH5qzI1theEYoO9DI7DgC0unB/T915elctSivSF7/mmx0HANCBuFTx5HA4dNNNN2n+/PnauHGjZs+erY0bNzZZ5s0331RQUJC2bt2q22+/Xffcc49JaQEAbcWna3K1u6pOU4Ynmh0FAExz+fBE9Y7x1yOfbVRZdZ3ZcQAAHYRLFU/Lly9XcnKykpKS5O7urokTJ2ru3LlNlpk7d66mTJkiSbrwwgv13XffMVwYANAswzA0c0mGekT5a0inYLPjAIBpbFaLHj+3jwrLa/Tc11vMjgMA6CBcqnjKzc1VXFxc4+XY2Fjl5uY2u4zdbldAQICKi4sPWtf06dOVkpKilJQUFRYWtmxwAIDLWrq9RJsKyjR1RIIsFovZcQDAVP3iAjV5aILe+SVD63N3mx0HANABuFTxdDxNmzZNqampSk1NVVhYmNlxAAAmmbkkQ4Hebjqnf4zZUQDAJfzf2G4K9vHQ/Z+ul8PJngMAgJblUsVTTEyMsrOzGy/n5OQoJiam2WXq6+u1e/duhYSEtGpOAEDbkLOrUl9vLNDEwfHydLOZHQcAXEKAl5seGN9Da7NLNXt5ltlxAADtnEsVT4MHD1ZaWprS09NVW1urOXPmaMKECU2WmTBhgmbOnClJ+vjjjzVmzBh2nQAAHNKspQ3/obp8eILJSQDAtZzTP1ojOofomQWbVFhWY3YcAEA75lLFk91u10svvaSxY8eqR48euvjii9WrVy89+OCDmjdvniTp6quvVnFxsZKTk/X888/rqaeeMjk1AMAVVdc5NGdFlk7vGamYQC+z4wCAS7FYLHr03N6qrnPqiS9/MzsOAKAdsxgd4JBwKSkpSk1NNTsGAKAVfbAiS/d88qvmTBumYUnskg0Ah/Lc15v17++36v1rh2pE51Cz4wAA2pij6VtcasQTAADHg2EYentxhrpH+mlop2Cz4wCAy7rp5GTFB3vrb5+uV2290+w4AIB2iOIJANDuLE8v0aaCMk0dkcg8gABwGJ5uNj1yTi9tK6zQ64u2mx0HANAOUTwBANqdGUsyFODlpnP6xxx5YQDo4EZ3C9eZfSL14ndpyiquNDsOAKCdoXgCALQruaVV+nrjDk0cEicvd5vZcQCgTXjwrF6yWy266+O1cjjb/RSwAIBWRPEEAGhXZi3NlGEYunxYgtlRAKDNiAzw1CPn9Nay9BL967s0s+MAANoRiicAQLtRXefQnOVZOq1nhGKDvM2OAwBtygWDYnXBwFj9+/s0LdlaZHYcAEA7QfEEAGg35q3J067KOk0ZkWh2FABokx49t5c6h/nq1g/WqLCsxuw4AIB2gOIJANAuGIahGUsy1C3CT8OTQsyOAwBtkre7XS9fNlB7qup0+wdr5GS+JwDAn0TxBABoF1Zk7NLG/D2aMiJRFovF7DgA0GZ1i/TT3yf00s9bi/SfhVvNjgMAaOMongAA7cLMJRkK8HLTuQOizY4CAG3eJYPjdE7/aD3/zRYtTy8xOw4AoA2jeAIAtHl5pVVasKFAlwyOk7e73ew4ANDmWSwWPX5eHyWE+OiW2atVUlFrdiQAQB
tF8QQAaPPeW5YpwzB0+bAEs6MAQLvh62HXS5cNUEllre74kPmeAAB/DMUTAKBNq65zaPbybJ3SI0Jxwd5mxwGAdqVXdID+Nr6HFm4u1Cs/bjM7DgCgDaJ4AgC0aZ+tzVNJRa2uHJFodhQAaJcmD0vQOf2j9Y+vN+v7TTvMjgMAaGMongAAbZZhGJqxJENdI3w1vHOI2XEAoF2yWCx66vy+6hnlr1tnr9G2wnKzIwEA2hCKJwBAm/XD5p3akLdHU0YkymKxmB0HANotL3ebpl+RIne7Vde+k6o91XVmRwIAtBEUTwCANmnh5p26YdYqdY3w1XkDYsyOAwDtXkygl/4zaaCyiit1+xwmGwcAHB2KJwBAm/P1hgJNe2elOof5as604fJ2t5sdCQA6hKFJIXro7J76btNOPf/NFrPjAADaAD6pAwDalC/W5evWOavVKyZA71w5RAHebmZHAoAOZfKwBK3P3aOXftiqntH+OrNPlNmRAAAujBFPAIA247+rcvSX2as0ID5Qs66mdAIAM1gsFj1ybi8NjA/UnR+u1W/5e8yOBABwYRRPAIA2YfbyLN350VoNSwrRzKuGyM+T0gkAzOJht+nVyYPk52nXtHdTlbOr0uxIAAAXRfEEAHB5M5dk6K///VWjuobpramDmdMJAFxAuL+nXrt8kEor6jThpcVasq3I7EgAABdE8QQAcGlv/Zyuh+Zt0Ok9I/Ta5YPk6WYzOxIAYK8B8UH69OYTFOzjrsvfXK43Fm2XYXC0OwDA7yieAAAua+aSDD3y+Uad0StSL08aKA87pRMAuJrOYb769KYTdFqPCD32xW+6dc4aVdU6zI4FAHARFE8AAJf07tJMPTRvg07rGaEXLx0gNxt/sgDAVfl62PXK5IG6a2w3fbYuT+f9Z7Gyipn3CQBA8QQAcEHvL8vS3z5dr1N7hOvlywbK3c6fKwBwdRaLRTednKy3pw5WXmmVzn7pZ/24pdDsWAAAk/FJHgDgUj5YkaX7/verTu4WppcnUToBQFszulu4PvvLiYoK8NTUt5fr8S82qqaeXe8AoKPi0zwAwGV8lJqte/ceve6VyYOY0wkA2qiEEB/998YRmjQ0Xq8vStc5Ly3Wb/l7zI4FADABxRMAwCX8d1WO7v5knU5MDuXodQDQDni72/XYuX309tTBKiqv1TkvLdZrP26Tw8lR7wCgI6F4AgCYyjAMvbFou+78aK2GJ4Xo9StSKJ0AoB05uXu4vr59pE7uHqYn52/SZa8vVc4uJh4HgI6C4gkAYBqH09BD8zbosS9+07jekXpr6mBKJwBoh4J93PXq5EF69sK+2pC3R+NeWKR3f8lQZW292dEAAC2M4gkAYIrK2npd926q3vklU9NGJumlSwdSOgFAO2axWHRRSpzm33qSekT5629zN2jo49/pb5+u18Y85n8CgPbKbnYAAEDHs7OsWtfMTNX63N169Jxeunx4otmRAACtJC7YWx9cN0ypmbv0/rIsfZCarXeXZqp/XKAuGxqvs/tGy8udLyIAoL2wGIbR7mf3S0lJUWpqqtkxAACS0naUaerbK1RSUauXLhugU3pEmB0JAGCi0spafbIqV+8vy9S2wgr5edp1Ws8InZgcqhOSQxXh72l2RABAM46mb2HEEwCg1fy0pVA3vb9Knm42fXjdcPWJDTA7EgDAZIHe7rr6xE666oRELU8v0ZwV2fph0079d1WuJKlLuK9O2FtCDU0Klr+nm8mJAQDHwmWKp5KSEl1yySXKyMhQYmKiPvzwQwUFBR20nM1mU58+fSRJ8fHxmjdvXmtHBQAco/W5u/WPrzdr4eZCdQn31dtXDlZskLfZsQAALsRisWhoUoiGJoXI6TS0MX+PFm8t0uJtxZqzIkszlmTIZrVoaKdgjesTpbG9IhTux2goAHB1LrOr3d13363g4GDde++9euqpp7Rr1y49/fTTBy3n6+ur8vLyY1o3u9oBgDm2F5br+W+26PN1+QrwctMNoz
tryvBE5u4AAByTmnqHVmWWalFaoRZsKND2wgpZLNLgxGCN6x2pM3pHKirAy+yYANDhHE3f4jLFU7du3bRw4UJFRUUpPz9fo0eP1ubNmw9ajuIJAFxf/u4q/evbNH20MkcedquuOqGTrh2ZpAAvdo8AAPw5hmEobWe5vvw1X/N/LdDmHWWSpH5xgeoT46/kMF8lh/spOdxXEf4eslgsJicGgParTRVPgYGBKi0tldTwxyQoKKjx8v7sdrv69+8vu92ue++9V+eee+4h1zd9+nRNnz5dklRYWKjMzMwWSg4AkKTdVXX6aUuhvt+0U1/8mi/DMDRpaIJuOjlZYX4eZscDALRT2wrLtWB9gb7ftFNbdpSprLq+8TY/D7uSwn3VOdRHUYGeigzwUpS/p6ICPRUV4KUgbzeKKQD4E1yueDr11FNVUFBw0PWPP/64pkyZ0qRoCgoK0q5duw5aNjc3VzExMdq+fbvGjBmj7777Tp07dz7sdhnxBADH375vnL/ftFPfb9qplZm75HAaCvJ20xm9I3Xj6GTFBTOPEwCg9RiGocLyGm3dWa5tO8u1dWe5thaWK6OoUjv2VKve2fS/Pu52q6ICPBXp79lwGuC199RT0QFeCvF1l7+Xm3zcbRRUAHAILndUu2+//bbZ2yIiIpSfn9+4q114ePghl4uJiZEkJSUlafTo0Vq9evURiycAwPGRXVKp5eklWp5eosXbipSzq0qS1CPKX9ePStKY7hHqHxcom5UP5wCA1mexWBTu56lwP0+N6Bza5DaH01BxeY3yd1crf3eV8ndXq2B3deNpauYu7diTrzrHwd/LWy2Sv5eb/D3d5O9ll7+nmyL9PRUb7K3YIC/FBXkrLthLUQFe/A0EgAO4zFHtJkyYoJkzZ+ree+/VzJkzdc455xy0zK5du+Tt7S0PDw8VFRVp8eLFuvvuu01ICwDtn2EY2l5U0Vg0LU8vUW5pQ9EU4OWmIZ2CdePoZJ3cPYwJXQEALs9mtSjc31Ph/p7qFxd4yGWcTkPFFbUq2F2tvN1V2lVRqz3VddpTVb/3tE57qutVWlmrpduLlb8mV/vvP2K3WhQZ4Cl/Tzd5u9vk7WGXt5tN3h42ebvb5OVmk81qlc2qhlOLRXabRVaLRVaL5DQkp2HIMIzG806j4W+yccDlfefd7VZ5u9nk5W6Tt7u9YTt7tyVJDsOQ09mwrMPZsG6LRQr0dlewT8NPkLc7hRmAFuMyxdO9996riy++WG+++aYSEhL04YcfSpJSU1P16quv6o033tBvv/2m6667TlarVU6nU/fee6969uxpcnIAaF6dw6ni8loVltVoZ1m1Cstq9p6vUXFFjXw97Arz89j77azH7+f9PeTp1rpHfttdWac1OaVanbVLa7JLtSa7VKWVdZKkUF8PDe0UrGkjkzQ0KVhdw/1k5QMqAKCdsVotCtv797hPbMARl6+tdyqvtEo5u6qUvatS2SWVyiutUnmNQ5W19dpdVaeC3VWqqHGoqq7hOqezoQxyOI9+xhOLRY3llGXvqdVikUVSrcN5yFFax8JiafhSKdjHXWG+HooP9lZCiLfiQ3wazgd7K5D5sAD8QS4zuXhLYo4nAC2ltt6p1IwS/VZQ1qRY2vdTUlmrQ73LBno3fLirqKlXUXntIT98+nnYFebvsbeQaiimDiynwnw9jvmDYGllrTKKK5VZXKGMokqlF5VrXe5ubS+skNTw4bNruJ8GxAeqf1ygBncKVlKoDx82AQA4jvaNanI4G0oop2HIZrXsVzL9XjQdSZ3Dqcpah6pqG8qtylqHquscsuy9v81iaVy3zWqR09nweaC4olYlB/zs2FOtrJJK7SyrabINP0+7ukf6aUinYA3pFKJBCUHy9XCZcQwATOJyczwBQHtQWFajhZsbJtRelFak8pqGo+e426yN35LGBXtrYEJQY1EU5uuhcH9Phfl5KNTXXR7230czOZyGSip+HxW184DyamdZtdbllGrnnhpV1T
kOyrNvu6F+HvL3tMuy9xvQfZ9TG85bVFxeo4ziSu2uqmty/6gAT/WK9tcFA2M1IC5QfWID5Ofp1lJPHwAA0L5CSMdlFzc3m1UBXlYFeB2/v99VtQ5llVQqq6Thy6qskkqty9mtV3/crpd/2Cab1aLe0f4amhSiIYnBGt45RD4UUQAOgRFPAHAUduyp1gcrsvXdpp1am10qSYrw99CY7hEa0z1cKQlBrTIEvbymXjv3VDfurrdzv3KqsKxGZdX1MiTtG2Zl7D1ryFCQt7sSQ3yUEOKthBAfJYZ4Ky7Yu9V36QMAAG1XRU29VmXt0rLtDfM/rskuVa3DKW93m87sE6ULB8VqSGIwu+QDHcTR9C0UTwBwGE6nofeWZerpBZtVUVuvfrGBOqV7uMb0CFfPKH92PwMAAB1adZ1DqzJ3ad7aPH2+Ll/lNfWKC/bSBQNjdcHAWMUFe5sdEUALonjai+IJwB+xZUeZ/vrfX7Uyc5dOTA7VY+f2VmKoj9mxAAAAXFJVrUMLNuTr45U5WrKtWIYhDUsK1tUnJunUHuF8YQe0Q8zxBAB/QHWdQ//5Yate+XGbfD3seu6ifjp/YAwflgAAAA7Dy92m8wbE6rwBscotrdL/VuVozopsXftOqnpG+euWU7ro9J4R7IYHdDCMeAKA/SxPL9G9/12n7YUVOm9AjB4Y30Mhvh5mxwIAAGiT6hxOzV2Tp5d/2Kr0ogp1j/TTX8Z00bjekRRQQDvAiCcAOEq7q+r01PxNmr08S7FBXpp51RCN6hpmdiwAAIA2zc1m1YWDYnVu/2h9vi5f//4+TTe9v0pdwn1185hkndU3+rgc2Q+A66J4AtChGYah+esL9NC8DSour9G0kUm67dQu8nbn7REAAOB4sdusOndAjM7uF60vf20ooG6ds0avLNymu8Z205juzAEFtFf8zwpAh5VXWqUH527Qt7/tUO8Yf709dbB6xwSYHQsAAKDdslktOrtftMb3idLnv+br+a836+qZqRqcGKS7z+iuwYnBZkcEcJxRPAHocBxOQ7OWZuqZBZvkNKT7z+yhK09IlN1mNTsaAABAh2C1WjShX7TG9Y7Uh6nZ+te3abro1V90Svdw3XVGN3WP9Dc7IoDjhOIJQIdRVevQVxsK9PbidK3N2a2RXcP0+Lm9FRfsbXY0AACADsnNZtWkoQk6f0Cs3l6SrlcWbtO4fy3ShH7RuvakJEajA+0AxROAds0wDK3M3KWPV+bo83X5Kq+pV1ywl164pL/O6R/NXAIAAAAuwMvdphtHJ+uyIfF65cdteveXTM1dk6fBiUG68oROOr1nBKPTgTbKYhiGYXaIlnY0h/cD0H5U1TqUW1qprzbs0Mcrc5ReVCFvd5vO7BOlCwfFakhiMIfvBQAAcGF7quv04YpszfwlQ9klVYoJ9NLlwxM0cXCcAr3dzY7XbtTWO5Wzq1KZJZXKKq5UZnGlcnZVqqbeKYfT+P3HMFTvNOR07n/qlNNQw6lTcu6tFvZ9yt7/C16LpeGn4XbLfucld7tVXm42ee798XKzycu94byfp11+HvaGU0+3xlNfj4YxNIYMGYZkqOEL58bTxuv2v75hee1/mww59y7jNAzVOxoeX73TUL3DuffUkMP5+/n9b3M4G+7n3G/dzr3b33fd/hn23Rbq66HbT+va4r/f1nA0fQvFE4A/xbn3D9G+P0pN/iAd8Afq92Ua/jg1/LHa+2a+3zoO/CN3qHXvu25PdZ0KdlerYE+1CnZXK393tXZX1TXmG5YUrAsHxWlc70j5eDDIEwAAoC1xOA1999sOzViSoSXbiuXpZtV5A2J15QmJ6hrhZ3Y8l1dV61De7irllTb85JZWN57PKqlUXmmVnPs1Al5uNsUFe8nLzSab1XLAj1V2q0VWi0X2g26zyGppKHOk34udhvN7mx79XhDtv0ydw6mqOoeqah2qrnM0nN97uay6XjX1ztZ6uo6Z1dIwYb5FFsnScNmihufCYrHIon2lW9
PrEkK89d8bTzA7/nFB8bRXeyieSipq9d1vO1TrcKqu3qk6h6Fah1O19c79rnPuvc5ocl2d09D+v2Zjv+a13mk0rKPe+fv69p4/3EvDZrXIbrXKbmt407HbrHtPLQc12A1nLPs1301b8P2vb7jP7wscy5iUo34hH+WCxlEuuP+ba8Plpm+kTTa5321N72v8fr6Z9R24fHPL/r6t5nPtu+9B6zngITuNAwogx8HfdriCUF8PRQV4KsLfU1EBnooM8FSkv6cGJwYrPoT5mwAAANqDTQV7NGNxhv63Olc19U6dkByiK0d00pju4R16NHtZdZ0yiyuVXlShjKIKZRRXKqO4QpnFFSoqr22yrMUiRfh5KirQU3FB3koM8VZ8iI8SQryVEOKtMF8Pl5uKorbeqbLqOpXX1Kusul57qutUUeOQpP2Knf2Lnv0Kn73/N7XsXXj/yxZLw3nr3hsb/o9r2ft/XGtjueZms+49bXrZbrW43HNlBoqnvdpD8bQ+d7fO+vfPh7zNzWaRu80qN7tVbjar3G1Wudute69raKZ//4fXYN8/RpvVIne7VR723+/jvve8tZl/RIahhvLB4dxvKGJDGebYr+RqWnDsO39wAbbv/KGuM3Rs5dPR/ru3HOVaj3p9+5dmB9xv35vRwQXbwbf/fj9LM8sevI0Dt73/ydFs+4BNNlm/RZLd9vu3GlZr01Ob5eDrmn4DYpXNqqanlt+/FTnUNyW2vevdf5km67Y1vd3b3S53O/v7AwAAdBS7Kmo1e0WW3v0lU/m7q5UQ4q0rhifq4pRY+Xm6mR2vxZTX1CttR5m27CjTlh3le0/LtGNPTZPlIv09lRjqrU6hPooN8lZMoJeiA70av6B1Y64sHEcUT3u1h+Kppt6hnXtq5L6vXLJbGwsnWlYAAAAAHU2dw6mvN+zQ24vTlZq5Sz7uNo3pEaFB8YFKSQxW90i/NjEheWVtvYrKalVYXqOifT9ltSoqr1FhWcPl/N3Vyi2taryPp5tVXcL91DXCT8nhvuoU6q3EUB8lBPvIy91m4qNBR3M0fQsTnrQRHnYbh3wHAAAAgL3cbFaN7xul8X2j9GvObs38JUM/pxXps7V5khrmK+ofF6hBCUEamBCo2CBvBXm7K8jbrcULqYqa+ibFUWF5rYrK9iuWyn8vliprHYdcR6C3m0J9PRTq666UxCBdFhGvLuG+6hbpp9ggb9k68O6FaFsongAAAAAAbVqf2AD946J+kqS80iqtzNyllZm7tCprl175cZscB8xN6u9pV4ivh4K83RTg1bB7Xv1hDmiz7wA5+899Wn/AXKj1BxwY50AWixTk7a5QX3eF+nqoX2ygwvw8GsulUD8Phfk2XA72cWc6CbQbFE8AAAAAgHYjeu+cRmf3i5bUsCvbhrw92rGnWrsqalVSUaeSihqVVDacFpbXyGrZb35Ri0UeblZ5H3gUN9t+c5Tum3t070GXrPtdtlks8vO0NxRKfg2lUtjeMqkt7PoHHG8UTwAAAACAdsvb3a7BicFmxwA6LOpWAAAAAAAAtAiKJwAAAAAAALQIiicAAAAAAAC0CIonAAAAAAAAtAiKJwAAAAAAALQIiicAAAAAAAC0CIonAAAAAAAAtAiKJwAAAAAAALQIiicAAAAAAAC0CIthGIbZIVpaaGioEhMTzY5xXBQWFiosLMzsGHBRvD5wOLw+cDi8PnA4vD5wOLw+cCS8RnA4vD7atoyMDBUVFR12mQ5RPLUnKSkpSk1NNTsGXBSvDxwOrw8cDq8PHA6vDxwOrw8cCa8RHA6vj/aPXe0AAAAAAADQIiieAAAAAAAA0CIontqYadOmmR0BLozXBw6H1wcOh9cHDofXBw6H1weOhNcIDofXR/vHHE8AAAAAAABoEYx4AgAAAAAAQIugeAIAAAAAAECLoHhqIxYsWKBu3bopOTlZTz31lNlxYLLs7GydfPLJ6tmzp3r16qV//etfkqSSkhKddtpp6tKli0477TTt2r
XL5KQwk8Ph0IABA3TWWWdJktLT0zV06FAlJyfrkksuUW1trckJYabS0lJdeOGF6t69u3r06KFffvmF9xA0+uc//6levXqpd+/euvTSS1VdXc17SAd21VVXKTw8XL179268rrn3C8MwdMsttyg5OVl9+/bVqlWrzIqNVnKo18ddd92l7t27q2/fvjrvvPNUWlraeNuTTz6p5ORkdevWTV999ZUJidHaDvUa2ee5556TxWJRUVGRJN5D2iuKpzbA4XDopptu0vz587Vx40bNnj1bGzduNDsWTGS32/Xcc89p48aNWrp0qV5++WVt3LhRTz31lE455RSlpaXplFNOoaTs4P71r3+pR48ejZfvuece3X777dq6dauCgoL05ptvmpgOZrv11lt1xhlnaNOmTVq7dq169OjBewgkSbm5uXrxxReVmpqq9evXy+FwaM6cObyHdGBTp07VggULmlzX3PvF/PnzlZaWprS0NE2fPl033HCDGZHRig71+jjttNO0fv16rVu3Tl27dtWTTz4pSdq4caPmzJmjDRs2aMGCBbrxxhvlcDjMiI1WdKjXiNTwZfrXX3+t+Pj4xut4D2mfKJ7agOXLlys5OVlJSUlyd3fXxIkTNXfuXLNjwURRUVEaOHCgJMnPz089evRQbm6u5s6dqylTpkiSpkyZok8//dTElDBTTk6OvvjiC11zzTWSGr49+v7773XhhRdK4vXR0e3evVs//fSTrr76akmSu7u7AgMDeQ9Bo/r6elVVVam+vl6VlZWKioriPaQDGzlypIKDg5tc19z7xdy5c3XFFVfIYrFo2LBhKi0tVX5+fmtHRis61Ovj9NNPl91ulyQNGzZMOTk5khpeHxMnTpSHh4c6deqk5ORkLV++vNUzo3Ud6jUiSbfffrueeeYZWSyWxut4D2mfKJ7agNzcXMXFxTVejo2NVW5uromJ4EoyMjK0evVqDR06VDt27FBUVJQkKTIyUjt27DA5Hcxy22236ZlnnpHV2vA2X1xcrMDAwMYPgbyPdGzp6ekKCwvTlVdeqQEDBuiaa65RRUUF7yGQJMXExOj//u//FB8fr6ioKAUEBGjQoEG8h6CJ5t4v+NyKA7311lsaN26cJF4f+N3cuXMVExOjfv36Nbme10j7RPEEtGHl5eW64IIL9MILL8jf37/JbRaLpcm3B+g4Pv/8c4WHh2vQoEFmR4GLqq+v16pVq3TDDTdo9erV8vHxOWi3Ot5DOq5du3Zp7ty5Sk9PV15enioqKg65iwSwD+8XaM7jjz8uu92uSZMmmR0FLqSyslJPPPGEHnnkEbOjoJVQPLUBMTExys7Obryck5OjmJgYExPBFdTV1emCCy7QpEmTdP7550uSIiIiGoei5ufnKzw83MyIMMnixYs1b948JSYmauLEifr+++916623qrS0VPX19ZJ4H+noYmNjFRsbq6FDh0qSLrzwQq1atYr3EEiSvv32W3Xq1ElhYWFyc3PT+eefr8WLF/Megiaae7/gcyv2mTFjhj7//HO99957jcUkrw9I0rZt25Senq5+/fopMTFROTk5GjhwoAoKCniNtFMUT23A4MGDlZaWpvT0dNXW1mrOnDmaMGGC2bFgIsMwdPXVV6tHjx664447Gq+fMGGCZs6cKUmaOXOmzjnnHLMiwkRPPvmkcnJylJGRoTlz5mjMmDF67733dPLJJ+vjjz+WxOujo4uMjFRcXJw2b94sSfruu+/Us2dP3kMgSYqPj9fSpUtVWVkpwzAaXx+8h2B/zb1fTJgwQe+8844Mw9DSpUsVEBDQuEseOo4FCxbomWee0bx58+Tt7d14/YQJEzRnzhzV1NQoPT1daWlpGjJkiIlJYYY+ffpo586dysjIUEZGhmJjY7Vq1SpFRkbyHtJeGWgTvvjiC6NLly5GUlKS8dhjj5kdByZbtGiRIcno06eP0a9fP6Nfv37GF198YRQVFRljxowxkpOTjVNOOcUoLi42OypM9sMPPxjjx4
83DMMwtm3bZgwePNjo3LmzceGFFxrV1dUmp4OZVq9ebQwaNMjo06ePcc455xglJSW8h6DRgw8+aHTr1s3o1auXMXnyZKO6upr3kA5s4sSJRmRkpGG3242YmBjjjTfeaPb9wul0GjfeeKORlJRk9O7d21ixYoXJ6dHSDvX66Ny5sxEbG9v4OfW6665rXP6xxx4zkpKSjK5duxpffvmlicnRWg71GtlfQkKCUVhYaBgG7yHtlcUwDMPs8gsAAAAAAADtD7vaAQAAAAAAoEVQPAEAAAAAAKBFUDwBAAAAAACgRVA8AQAAAAAAoEVQPAEAAAAAAKBFUDwBAAAAAACgRVA8AQAAAAAAoEVQPAEAAAAAAKBFUDwBAAAAAACgRVA8AQAAAAAAoEVQPAEAAAAAAKBFUDwBAAAAAACgRVA8AQAAAAAAoEVQPAEAAAAAAKBFUDwBAAAAAACgRVA8AQAAAAAAoEVQPAEAAAAAAKBFUDwBAAAAAACgRVA8AQAAtLJFixapW7duZseQJL3yyiuKiIiQr6+viouL5evrq+3bt0uSpk6dqgceeOC4b3PHjh0aOXKk/Pz8dOeddx739QMAANdB8QQAAFqVr69v44/VapWXl1fj5ffee++4bOPDDz/UiBEj5O3trdGjRx+XdR4ti8WirVu3HnaZk046SZs3b26lRM2rq6vTHXfcoa+//lrl5eUKCQlReXm5kpKSWnS706dPV2hoqPbs2aPnnnvuoNsvuOACXXvttU2uO++883TzzTe3aC4AAHD82c0OAAAAOpby8vLG84mJiXrjjTd06qmnHtdtBAcH67bbbtOmTZv0/fffH9d1/1n19fWy21vnI9iRtrVjxw5VV1erV69erZJnn8zMTPXs2VMWi+WQt7/88svq1auXLrvsMp188sn64IMPtGrVKr377rutmhMAAPx5jHgCAAAuoaamRrfddpuio6MVHR2t2267TTU1NZKkhQsXKjY2Vk888YRCQ0OVmJh42NFRp556qi6++GJFR0cfcbv71v3MM88oPDxcUVFR+vTTT/Xll1+qa9euCg4O1hNPPNG4/PLlyzV8+HAFBgYqKipKN998s2prayVJI0eOlCT169dPvr6++uCDDxrX//TTTysyMlJXXnll43WStG3bNgUHB2vVqlWSpLy8PIWFhWnhwoWHzJuYmKgnn3xSPXv2VFBQkK688kpVV1c3eSz7b6u553XLli2Nu/sFBgZqzJgxkg4/Yuvzzz9X//79FRgYqBEjRmjdunXNPq9LlizR4MGDFRAQoMGDB2vJkiWSGnbfmzlzpp555hn5+vrq22+/Pei+kZGReu6553TttdcqKytLt9xyi1577TX5+vo2uz0AAOCaKJ4AAIBLePzxx7V06VKtWbNGa9eu1fLly/XYY4813l5QUKCioiLl5uZq5syZmjZt2nHbXa2goEDV1dXKzc3VI488omuvvVazZs3SypUrtWjRIj366KNKT0+XJNlsNv3zn/9UUVGRfvnlF3333Xf6z3/+I0n66aefJElr165VeXm5Lrnkksb1l5SUKDMzU9OnT2+y7c6dO+vpp5/W5MmTVVlZqSuvvFJTpkw57C6C7733nr766itt27ZNW7ZsOeh52n9bzT2vXbt21YYNGyRJpaWlRxwZtnr1al111VV67bXXVFxcrOuuu04TJkxoLAf3V1JSovHjx+uWW25RcXGx7rjjDo0fP17FxcWaMWOGJk2apLvvvlvl5eXNjnabOnWqOnfurIEDB+qMM87QGWeccdh8AADANVE8AQAAl/Dee+/pwQcfVHh4uMLCwvTQQw8dtGvVo48+Kg8PD40aNUrjx4/Xhx9+eFy27ebmpvvvv19ubm6aOHGiioqKdOutt8rPz0+9evVSz549tXbtWknSoEGDNGzYMNntdiUmJuq6667Tjz/+eNj1W61W/f3vf5eHh4e8vLwOuv3aa69VcnKyhg4dqvz8fD3++OOHXd/NN9+suLg4BQcH6/
7779fs2bOb3dbRPK9HY/r06bruuus0dOhQ2Ww2TZkyRR4eHlq6dOlBy37xxRfq0qWLLr/8ctntdl166aXq3r27Pvvss2Pa5kknnaTi4mJNnjz5mPMCAADXQPEEAABcQl5enhISEhovJyQkKC8vr/FyUFCQfHx8mr39zwgJCZHNZpOkxmIoIiKi8XYvL6/Guam2bNmis846S5GRkfL399d9992noqKiw64/LCxMnp6eh13m2muv1fr16/WXv/xFHh4eh102Li6u8fyBz8OB2zrS83q0MjMz9dxzzykwMLDxJzs7+5DrOnCb+7abm5t71NtLS0vTP/7xD91444268847VVdXd8yZAQCA+SieAACAS4iOjlZmZmbj5aysrCZzNO3atUsVFRXN3t5abrjhBnXv3l1paWnas2ePnnjiCRmGcdj7NDeJ9j7l5eW67bbbdPXVV+vhhx9WSUnJYZfPzs5uPH/g83Dgto70vB6tuLg43X///SotLW38qays1KWXXnrQsgduc992Y2JijmpbhmHommuu0W233aZ///vf8vHx0dNPP33MmQEAgPkongAAgEu49NJL9dhjj6mwsFBFRUV65JFHDtrF6qGHHlJtba0WLVqkzz//XBdddNEh1+VwOFRdXa36+no5nU5VV1cftxEzZWVl8vf3l6+vrzZt2qRXXnmlye0RERHavn37Ma3z1ltvVUpKit544w2NHz9e119//WGXf/nll5WTk6OSkhI9/vjjjXNJHcrRPK9H49prr9Wrr76qZcuWyTAMVVRU6IsvvlBZWdlBy5555pnasmWL3n//fdXX1+uDDz7Qxo0bddZZZx3Vtl555RUVFRXpvvvuk9Vq1ZtvvqlnnnlGmzZtOubcAADAXBRPAADAJTzwwANKSUlR37591adPHw0cOFAPPPBA4+2RkZEKCgpSdHS0Jk2apFdffVXdu3c/5LreffddeXl56YYbbtCiRYvk5eWla6+99rjk/Mc//qH3339ffn5+uvbaaw8qfR5++GFNmTJFgYGBRzUH1dy5c7VgwYLGAuv555/XqlWrDnvUvssuu0ynn366kpKS1Llz5ybP04GO9LwerZSUFL3++uu6+eabFRQUpOTkZM2YMeOQy4aEhOjzzz/Xc889p5CQED3zzDP6/PPPFRoaesTtZGVl6b777tObb74pd3d3SVLPnj1155136tprrz3i6DIAAOBaLAZ/vQEAgItbuHChJk+erJycHLOjmC4xMVFvvPFGs0eDAwAAcCWMeAIAAAAAAECLoHgCAAAAAABAi2BXOwAAAAAAALQIRjwBAAAAAACgRdjNDtAaQkNDlZiYaHYMAAAAAACAdiMjI0NFRUWHXaZDFE+JiYlKTU01OwYAAAAAAEC7kZKScsRlWnRXuwULFqhbt25KTk7WU089ddDtzz//vHr27Km+ffvqlFNOUWZmpiRpzZo1Gj58uHr16qW+ffvqgw8+aLzP1KlT1alTJ/Xv31/9+/fXmjVrWvIhAAAAAAAA4A9qsRFPDodDN910k7755hvFxsZq8ODBmjBhgnr27Nm4zIABA5Samipvb2+98soruvvuu/XBBx/I29tb77zzjrp06aK8vDwNGjRIY8eOVWBgoCTp2Wef1YUXXthS0QEAAAAAAHActNiIp+XLlys5OVlJSUlyd3fXxIkTNXfu3CbLnHzyyfL29pYkDRs2TDk5OZKkrl27qkuXLpKk6OhohYeHq7CwsKWiAgAAAAAAoAW0WPGUm5uruLi4xsuxsbHKzc1tdvk333xT48aNO+j65cuXq7a2Vp07d2687v7771ffvn11++23q6am5pDrmz59ulJSUpSSkkJpBQAAAAAAYIIWnePpaM2aNUupqam66667mlyfn5+vyy+/XG+//bas1oaoTz75pDZt2qQVK1aopKRETz/99CHXOW3aNKWmpio1NVVhYWEt/hgAAAAAAADQVIsVTzExMcrOzm68nJOTo5iYmIOW+/bbb/X444
9r3rx58vDwaLx+z549Gj9+vB5//HENGzas8fqoqChZLBZ5eHjoyiuv1PLly1vqIQAAAAAAAOBPaLHiafDgwUpLS1N6erpqa2s1Z84cTZgwockyq1ev1nXXXad58+YpPDy88fra2lqdd955uuKKKw6aRDw/P1+SZBiGPv30U/Xu3bulHgIAAAAAAAD+hBY7qp3dbtdLL72ksWPHyuFw6KqrrlKvXr304IMPKiUlRRMmTNBdd92l8vJyXXTRRZKk+Ph4zZs3Tx9++KF++uknFRcXa8aMGZKkGTNmqH///po0aZIKCwtlGIb69++vV199taUeAgAAAAAAAP4Ei2EYhtkhWlpKSopSU1PNjgEAAAAALqO6zqFPVuVowfoCDUsK0SWD4xTq63HkOwLAXkfTt7TYiCcAAAAAgOvZXVmnd5dmaMaSDBWV1yo2yEuL0or0r2/TdGafSF0+PEED44NksVjMjgqgHaB4AgAAAIAW5nQaqql3qrrOoao6h7zcbArycW/VDHmlVXrr53TNXp6lilqHRncL0/WjOmtop2BtK6zQrKWZ+mRljj5dk6eeUf66fHiCzukfLW93/tsI4I9jVzsAAAAAOE62F5br9g/WaE91vapqHaqud6iq1qGaemeT5WxWi8b2itCVJ3RSSkLLji7aXFCm137apnlr8mRImtAvWtNGJqlHlP9By1bU1OvTNbl695dMbSook5+nXRcOitUVwxPVKdSnxTICaJvY1Q4AAAAAWtG8tXlal7tbZ/WNlpebVV5uNnm62eThZtt73ipPN5vSiyo0Z3mWvvy1QL2i/XXlCZ10dr8oedhtfzpDbb1T6/N2a0V6iX7eWqRFaUXycrPp8uEJuvrETooN8m72vj4edk0amqDLhsQrNXOX3vklU7OWZur9ZVn66raRSqR8AnCMGPEEAAAAAMfJRa8uUU29U/NuPvGIy1bW1ut/q3M1Y3GG0naWK9TXXZcNidfkYQkK9/c86m1W1NRrdVaplmeUaEV6iVZn71J1XcMIq6RQH507IEaXD0v4w7v2ZRVXasxzC3XF8EQ9eHbPP7QOAO0TI54AAAAAoJWU7y2Arh2ZdFTLe7v/Prpo8dZizViSrn//sFX/WbhN4/tGaUz3cNXWOxvnhqqpd6qmzqHqvZer6xzaXFCm9Xl75HAaslqkntH+unRIvIYkBislMVhhfn/+KHXxId4a3zdKH6Vm647Tu8rXg/9GAjh6vGMAAAAAwHGwPL1Y9U5DJyaHHtP9LBaLTuwSqhO7hCqjqEIzf8nQR6k5mrsm7xDLSh72ht31POxWJQT76PpRSRqcGKxBCUHy83Q7Xg+niakjEjV3TZ7+uypHVwxPbJFtAGifKJ4AAAAA4DhYlFYkD7tVgxKC/vA6EkN99NDZvfR/p3dTXmnV3vmhrPKwN8wP5W6ztuhE5M0ZEB+kfnGBmrEkQ5OHJshqbf0MANomq9kBAAAAAKA9WLy1SEM6BcvT7c9PEO7jYVeXCD/FBXsr3M9TAV5u8rDbTCmd9rlyRKK2F1Zo0dYi0zIAaHsongAAAADgT9q5p1pbdpTrhGPcza4tObNPlML8PDRjcbrZUQC0IRRPAAAAAPAnLd7WMAroWOd3akvc7VZNGhqvHzYXKr2owuw4ANoIiicAAAAA+JN+TitWoLebekb5mx2lRV02NF5uNotmLskwOwqANoLiCQAAAAD+BMMwtHhrkU7oHNruJ90O9/PUWX2j9VFqtsqq68yOA6ANoHgCAAAAgD9hW2GFCvZUt+v5nfY3dUSiKmod+nhljtlRALQBFE8AAAAA8Cf8nFYoSTqpS8convrFBWpAfKBmLsmQ02mYHQeAi6N4AgAAAIA/4eetxYoP9lZcsLfZUVrN1BGJyiiu1I9bCs2OAsDFUTwBAAAAwB9U73Bq6fbiDrOb3T7jekcp3M9DbzPJOIAjoHgCAAAAgD9obc5uldfU68QOVjy5262aPCxBP20p1Nad5WbHAeDCKJ4AAA
AA4A9avLVIFos0vHOI2VFa3aVD4uVus+qdXzLMjgLAhVE8AQAAAMAf9PPWIvWK9lewj7vZUVpdmJ+HzuoXpY9X5mhPdZ3ZcQC4KIonAAAAAPgDKmrqtTprV4eb32l/V47opMpahz5KzTE7CgAXRfEEAAAAAH/A8vQS1TmMDje/0/76xAZoUEKQZi7JkMNpmB0HgAuieAIAAACAP+DnrUVyt1s1ODHY7CimmjoiUVkllVq4eafZUQC4oBYtnhYsWKBu3bopOTlZTz311EG3P//88+rZs6f69u2rU045RZmZmY23zZw5U126dFGXLl00c+bMxutXrlypPn36KDk5WbfccosMg1YdAAAAQOtbvLVIgxOD5OlmMzuKqc7oHakIfw/NWJJhdhQALqjFiieHw6GbbrpJ8+fP18aNGzV79mxt3LixyTIDBgxQamqq1q1bpwsvvFB33323JKmkpER///vftWzZMi1fvlx///vftWvXLknSDTfcoNdff11paWlKS0vTggULWuohAAAAAMAh7Syr1qaCsg49v9M+bjarLhgYq8Vbi1RV6zA7DgAX02LF0/Lly5WcnKykpCS5u7tr4sSJmjt3bpNlTj75ZHl7e0uShg0bppychgnpvvrqK5122mkKDg5WUFCQTjvtNC1YsED5+fnas2ePhg0bJovFoiuuuEKffvppSz0EAAAAADikX7YVS1KHnt9pfwPig+Q0pA15u82OAsDFtFjxlJubq7i4uMbLsbGxys3NbXb5N998U+PGjTvsfXNzcxUbG3tU65w+fbpSUlKUkpKiwsLCP/twAAAAAKDRz2lFCvByU6/oALOjuIS+sQ3Pw7ociicATdnNDiBJs2bNUmpqqn788cfjts5p06Zp2rRpkqSUlJTjtl4Af16dw6mi8hrt3FOjwrIa7SxrOC0sr1ZljUN1TkP1DqfqHIYcTqfqnYbqHE7VOww5DEPRAV7qHO6rzmE+6hzmq85hvvJy79hzKwAAgNZjGIYWby3SiM4hslktZsdxCRH+norw99C6nFKzowBwMS1WPMXExCg7O7vxck5OjmJiYg5a7ttvv9Xjjz+uH3/8UR4eHo33XbhwYZP7jh49WjExMY274x1unQBaV2VtvX7cXKiy6nqV1zT8VBxwWl5Tr10VdSosr1FJRe0h1xPk7SZvd7vcbBbZbVbZrRa52ayy2xpOPdysssiiDXm7NX99vvY/Ym9MYEMZlRzmqy4RvhqWFKLEEG9ZLHwYBAAAx9f2ogrl7a7WjSezm93++sYGal0uI54ANNVixdPgwYOVlpam9PR0xcTEaM6cOXr//febLLN69Wpdd911WrBggcLDwxuvHzt2rO67777GCcW//vprPfnkkwoODpa/v7+WLl2qoUOH6p133tFf/vKXlnoIAI6CYRi67t2VWpRW1OR6D7tVvh52+XjY5bv3Jz7EWymJQQrz81C4n+feUw+F+Xko1NdD7vaj3/u3us6hzOJKbSss17ad5dpaWK5theWanV6iqrqGSS1jAr10UpdQndglVCd0DlWQj/txfewAAKBjWry14XPPSV0onvbXLzZA32zcoT3VdfL3dDM7DgAX0WLFk91u10svvaSxY8fK4XDoqquuUq9evfTggw8qJSVFEyZM0F133aXy8nJddNFFkqT4+HjNmzdPwcHB+tvf/qbBgwdLkh588EEFBwdLkv7zn/9o6tSpqqqq0rhx4xrnhQJgjm9/26lFaUX6v9O76twBMY1lk5utxaaQkyR5utnULdJP3SL9mlzvdBrKLKnUz1uL9HNaob74NV9zVmTLYpH6xAToxOSGImpQQpA87OyeBwAAjt3PaUWKDfJSfLC32VFcSp/YQEnS+pzdGsGk6wD2shiGYRx5sbYtJSVFqampZscA2p2aeofG/vMn2W1Wzb/1pBYvm/6IeodTa3N26+e0Iv28tVCrskrlcBoK8HLTTSd31hXDE+XpRgEFAACOTr3DqQGPfqPxfaL01AV9zY
7jUnZV1GrAo9/o3nHddf2ozmbHAdAKjqZvcYnJxQG0TTOXZCijuFIzrhzskqWTJNltVg1KCNKghCDdemoXlVXXaen2Es1amqknvtykmUsydefpXXVu/xhZmRwUAAAcwa+5u1VWXa8TGNFzkCAfd8UFezHBOIAmXPN/igBcXmFZjV78bqvGdA/X6G7hR76Di/DzdNNpPSM086ohev+aoQr2cdcdH67V+H//rJ+2FJodDwAAuLh98zuN6BxichLX1Dc2UOtymGAcwO8ongD8Ic99vVnVdQ7dP76H2VH+sBHJoZp70wn618T+Kq+p0xVvLdfkN5ZpPUdjAQAAzfh5a5F6RvkrxNfD7CguqW9MgHJ2Vam4vMbsKABcBMUTgGO2Pne3PkjN1pQRieoc5mt2nD/FarXonP4x+vaOUXrwrJ7akLdbZ/37Z902Z7WySyrNjgcAAFxIZW29VmWW6kSOZtesvnsnGP+VL/IA7EXxBOCYGIahRz7fqCBvd91yShez4xw3Hnabrjqxk368+2TdOLqz5q8v0CnP/6hnFmxSeU292fEAAIAL+GlLkWodTo3qGmZ2FJfVO8ZfFovY3Q5AI4onAMfky18LtDy9RHee3lUBXm5mxznu/D3ddPcZ3bXwrtE6q2+U/rNwm0Y/u1AfrsiWw9nuDwIKAAAOY8H6fAV6u2lIp2Czo7gsP083JYX6MME4gEYUTwCOWnWdQ098+Zu6R/pp4uB4s+O0qKgALz1/cX99etMJig/20t2frNOEl37Wsu3FZkcDAAAmqK136rvfduq0HhEuezRfV9GPCcYB7Id3TABH7Y1F25VbWqUHz+4pm9VidpxW0T8uUJ/cMEIvXjpAuypqdcn0pbph1kplFTP/EwAAHcnibUUqq6nXGb0jzY7i8vrEBmhnWY0KdlebHQWAC6B4AnBUduyp1n8WbtPYXhEa0bljTahpsVg0oV+0vrtztO48rasWbi7Uqc//qKfmb9Ke6jqz4wEAgFaw4NcC+XrYmVj8KOybYJzd7QBIFE8AjtLTCzap3mHo/jN7mh3FNF7uNv3llC764f9G66x+UXr1x20a8vi3um3Oai1KK2QOKAAA2ql6h1Pf/LZDY7qHy8NuMzuOy+sZ5S+b1cLudgAkSXazAwBwfauzdum/q3J1w+jOig/xNjuO6SIDPPX8xf111QmdNHt5lj5bm6dP1+Qp0t9T5w6I0QUDY9Qlws/smAAA4DhZnlGikopadrM7Sl7uNnWN8NO6XIonABRPAI7AMAw98vlGhfl56KaTk82O41J6xwTo8fP66G9n9dT3m3bqk5U5en3Rdr364zb1iQnQBQNjNKF/jIJ93M2OCgAA/oQF6wvk6WbV6G5hZkdpM/rGBOirjQUyDEMWS8eYGxTAobGrHYDD+nrjDq3OKtVdY7vJ14Ou+lA83Ww6s0+U3pw6WMvuO0V/O6unnIahhz/bqCGPf6u7PlqrnF1MRg4AQFvkdBr6akOBRnUNk7c7n4WOVt+4AJVW1im7pMrsKABMxjsngMP6duMOBXi56YKBsWZHaRNCfT109YmddPWJnbSpYI/mLM/W+8uzNHdNni4bGq+bxyQr1NfD7JgAAOAorc4u1Y49Nexmd4z6xgRKktblljJVA9DBMeIJQLMMw9CSbcUanhQim5Uh0seqe6S/Hp7QSwv/b7TOHxijd5dmauQzP+gfX23maHgAALQRC9bny81m0ZjuEWZHaVO6RfrJ3WZlgnEAFE8AmpdZXKnc0iqdkBxidpQ2LTrQS09d0Fff3D5SY7qH66Uftuqkp3/Qqz9uU1Wtw+x4AACgGYZhaMGGAp2QHKoALzez47Qp7narekT7a11OqdlRAJiM4glAsxZvK5IkjUgONTlJ+5AU5quXLhuoz/9yogbEB+qp+Zs06tkf9O7STNU5nGbHAwAAB9iQt0fZJVUax252f0jfmACtz90jp9MwOwoAE1E8AWjWkq3FivT3VFKoj9lR2pXeMQGaceUQfX
jdcCWEeOtvn67XWS/+rF+2FZsdDQAA7GfB+gJZLdKpPdjN7o/oGxug8pp6bS+qMDsKABNRPAE4JKfT0JJtRRqRHMIhcFvIkE7B+vC64Zp++SBV1Nbr0teX6i+zVyt/N0d/AQDAFcxfn6+hnUIUwoFB/pC+sYGSxO52QAdH8QTgkH4r2KNdlXU6oTO72bUki8Wi03tF6ts7Rum2U7vo6w0FOuW5H/XKwm2qqWf+JwAAzLJ1Z5m2FVZoXB92s/ujksN95eVmY4JxoIOjeAJwSEu2Nuz2dQLzO7UKTzebbju1q769Y5ROTA7V0ws2adwLi/TjlkKzowEA0CHN/7VAknR6T4qnP8pmtah3DBOMAx0dxROAQ1q8rUidw3wUGeBpdpQOJS7YW9OvSNGMKwfLkDTlreWa9k6qsksqzY4GAECHMn99gQbGB/JZ6E/qGxuoDXl7OJAK0IG1aPG0YMECdevWTcnJyXrqqacOuv2nn37SwIEDZbfb9fHHHzde/8MPP6h///6NP56envr0008lSVOnTlWnTp0ab1uzZk1LPgSgQ6qtd2rZ9hJGO5lodLdwLbjtJN01tpsWpRXp1Od/1NuL0zkqDAAArSCruFIb8/doXO8os6O0eX1jA1RT71TajnKzowAwSYsVTw6HQzfddJPmz5+vjRs3avbs2dq4cWOTZeLj4zVjxgxddtllTa4/+eSTtWbNGq1Zs0bff/+9vL29dfrppzfe/uyzzzbe3r9//5Z6CECHtSa7VFV1Do1gfidTedhtuunkZH135yiN6Byiv3+2UZe/tUx5pUw+DgBAS1qwIV+SdEZvdrP7s5hgHECLFU/Lly9XcnKykpKS5O7urokTJ2ru3LlNlklMTFTfvn1ltTYf4+OPP9a4cePk7e3dUlEBHGDx1iJZLdLwpBCzo0BSdKCX3po6WE+c10ers0o19oWf9L/VOTIMRj8BANAS5q8vUK9of8UF83+QPysxxFt+nnaty2WCcaCjarHiKTc3V3FxcY2XY2NjlZube8zrmTNnji699NIm191///3q27evbr/9dtXU1BzyftOnT1dKSopSUlJUWMjkvMCxWLKtSL1jAhTg7WZ2FOxlsVh02dB4zb/1JHWN8NPtH6zVTe+vUklFrdnRAABoVwp2V2t1VqnGMdrpuLBYLOobG8CIJ6ADc+nJxfPz8/Xrr79q7Nixjdc9+eST2rRpk1asWKGSkhI9/fTTh7zvtGnTlJqaqtTUVIWFhbVWZKDNq6ip1+qsUnazc1EJIT768LrhuvuMbvpm4w6NfeEnfb9ph9mxAABoN77a0HA0O3azO376xgZqU36ZquscZkcBYIIWK55iYmKUnZ3deDknJ0cxMTHHtI4PP/xQ5513ntzcfh91ERUVJYvFIg8PD1155ZVavnz5ccsMQFqeUaJ6p6ETktnNzlXZrBbdODpZc286USE+7rpqRqr++t9fVVFTb3Y0AADavPnr85Uc7qvkcD+zo7QbfWMCVO80tKmgzOwoAEzQYsXT4MGDlZaWpvT0dNXW1mrOnDmaMGHCMa1j9uzZB+1ml5/fMNGfYRj69NNP1bt37+OWGYC0ZGuR3G1WpSQEmx0FR9Az2l9zbz5B141K0pwVWRr/4iJt3ckHOgAA/qji8hotTy9hN7vjrG9coCQmGAc6qhYrnux2u1566SWNHTtWPXr00MUXX6xevXrpwQcf1Lx58yRJK1asUGxsrD766CNdd9116tWrV+P9MzIylJ2drVGjRjVZ76RJk9SnTx/16dNHRUVFeuCBB1rqIQAd0uKtxRqYECgvd5vZUXAUPOw2/XVcD825dpjKaxw67+UlWrh5p9mxAABok77ZuENOQxrbi+LpeIoO8FSIj7vW5TDBONARWYwOcFiklJQUpaammh0DcHklFbUa+Og3+r/Tu+rmMV3MjoNjlFtapWtmpmpzwR49ML6nrjwhURaLxexYAAC0GVPeWq7tReX66a6T+Rt6nF359nLlllbp69
tHHXlhAG3G0fQtLj25OIDW9cu2YknSiGQmFm+LYgK99PH1w3Vqjwg98vlG3fe/9apzOM2OBQBAm7BzT7UWby3Smb2jKJ1aQJ/YQG3dWc6clEAHRPEEoNHibUXy87Crb0yA2VHwB/l42PXq5EG6cXRnzV6epcvfXKZdFbVmxwIAwOW9tyxL9U5DE4fEmx2lXeoXGyCnIW3I22N2FACtjOIJQKPFW4s0NClYdhtvDW2Z1WrR3Wd01/MX99OqzFKd+5/F2rqz3OxYAAC4rNp6p95fnqXR3cLUKdTH7DjtUp/Yhi82mWAc6HjsR7NQbm6uMjMzVV//+7DIkSNHtlgoAK0vZ1elMosrNWV4otlRcJycPzBWCSHeuu7dlTrvP4v18mUDNbJrmNmxAABwOfPX56uwrEZTRySaHaXdCvfzVFSAp9YywTjQ4RyxeLrnnnv0wQcfqGfPnrLZGo5yZbFYKJ6AdmbJ1ob5nU5gfqd2ZVBCsD696QRdMzNVU99ersfP66NL2YUAAIAm3l6coU6hPhrZhS9oWtLA+CAtTy+WYRjMowV0IEcsnj799FNt3rxZHh4erZEHgEkWbytSqK+Hukb4mh0Fx1lskLc+uWGEbnhvlf726Xr1iQlQb+bxAgBAkrQ2u1Rrskv10Nk9ZbVShrSkk7qE6otf87VlR7m6RfqZHQdAKzniRC5JSUmqq6trjSwATGIYhpZsK9aIziF8+9RO+XjY9eLE/gr2cdcdH65RdZ3D7EgAALiEmUsy5ONu04WDYs2O0u7t2+X/py2FJicB0JqaHfH0l7/8RRaLRd7e3urfv79OOeWUJqOeXnzxxVYJCKDlpe0sV2FZjU5IDjE7ClpQoLe7nr6wr658e4X++e0W/XVcD7MjAQBgqqLyGn2+Ll8Th8TJz9PN7DjtXnSgl5LDffVTWqGuHZlkdhwAraTZ4iklJUWSNGjQIE2YMKHVAgFofYu3FkmSRnRmfqf27uRu4bp0SJym/7Rdp/WIUEpisNmRAAAwzexlWap1OHUFB1dpNSO7hGnWskxV1Trk5W4zOw6AVtBs8TRlyhRJUkVFhTw9PRsnFnc4HKqpqWmddABaxeKtxUoI8VZcsLfZUdAK7h/fUz9vLdKdH63Vl7ecJB+PozrAKQAA7Uqdw6lZyzJ1UpdQJYczx2VrGdk1VG8tTtey9GKN7hZudhwAreCIczydcsopqqqqarxcVVWlU089tUVDAWg99Q6nlm0vZrRTB+LrYdezF/ZTVkmlnpq/yew4AACY4qsNBdqxp0ZTRySaHaVDGdopRO52q37aUmR2FACt5IjFU3V1tXx9f/8GwNfXV5WVlS0aCkDr+TV3t8pq6pnfqYMZlhSiq07opHeXZmpRGhN8AgA6nhmLMxQf7M2om1bm5W7T0E7B+onPH0CHccTiycfHR6tWrWq8vHLlSnl5ebVoKACtZ8m2YknS8CSKp47mrrHdlBzuq7s/XqfdVRy9FADQcazP3a3UzF26YniCbFaO6NvaRnYJ09ad5corrTrywgDavCMWTy+88IIuuuginXTSSTrxxBN1ySWX6N///ndrZAPQCn5OK1KPKH+F+HoceWG0K55uNj13UT/tLKvR3z/bYHYcAABazcwlGfJys+milDizo3RII7uGSRKjroEO4ogzyvbt21ebNm3S5s2bJUndunWT0+ls8WAAWl51nUMrs3bpimEJZkeBSfrFBeqm0Z314vdbNbZXpMb2ijQ7EgAALaqkolZz1+bpwkGxCvByMztOh9Q1wleR/p76aUuRLhkcb3YcAC3siCOehg8fLjc3N/Xu3Vu9e/eWm5ubhg8f3hrZALSgPdV1uvn9VaqtdzK3QQd385gu6hXtr/v++6uKyzlqKQCgfZuzIku19U4mFTeRxWLRSV1C9fPWIjmchtlxALSwZoungoICrVy5UlVVVVq9erVWrVqlVatWaeHChUwuDrRxaTvKdO5Li7Vwc6EePrsnE4t3cO52q56/uL/Kqu
t1///WyzD4AAgAaJ/qHU7N+iVTIzqHqGuEn9lxOrSRXcO0u6pOa3NKzY4CoIU1u6vdV199pRkzZignJ0d33HFH4/V+fn564oknWiUcgOPvy1/z9X8frZW3u13vXztMQzoFmx0JLqBbpJ/uOL2rnpq/Se8vz9Kkoex+CQBof779bYfydlfroQm9zI7S4Z2YHCqLRfppS6EGxgeZHQdAC2q2eJoyZYqmTJmiTz75RBdccEFrZgLQAhxOQ//4erNeWbhNA+ID9cqkQYoM8DQ7FlzItSclafHWIt3/v/XKK63Snad1k5Uj/QAA2pG3F2coJtBLp/aIMDtKhxfk466+MQH6aUuhbju1q9lxALSgI04ufsEFF+iLL77Qhg0bVF1d3Xj9gw8+2KLBABw/uypqdcuc1VqUVqTLhsbrobN7ysNuMzsWXIzNatGbUwbroXnr9fIP27RlR7n+eUl/+Xoc8U8FAAAu77f8PVqWXqJ7x3WXjS9WXMLIrmF6+Yet2l1ZpwBvJnoH2qsjTi5+/fXX64MPPtC///1vGYahjz76SJmZma2RDcBxsCFvt85+6Wct216ipy/ooyfO60PphGa526164rw+/9/efYc3VfZvAL8zuvemk1K66KItKZS9l2gZIpQ9RRF+IvgqKorr9RV9HYioiDIVKFOKCIiCbOiglFVGC90tpYPS3aTJ+f1RqPIyWqDhNO39ua5eIScnJ3fKISTfPM/3wXvP+GH/xet49ttjyCxiXz8iItJ9q4+mwUAuxWiFq9hR6JYe3nbQCMDRKwViRyEiLaq38HTs2DGsXbsWVlZWePfdd3H8+HFcvnz5SWQjoscUnZiNZ787BrVGwKYXO3O5WmoQiUSCyV3bYM2Ujsi9WYmIpUdw4mqh2LGIiIgeWXxaETadzERkmCusTPTFjkO3BLtawsxAjkOX88WOQkRaVG/hycjICABgbGyMnJwc6OnpITc3V+vBiOjxZBdXYu7GRAQ5W+LX/+uGYFdLsSORjunmZYvo2d1gZaKP8T/GYH1MhtiRiIiIHlqFsgb/2nwazpZGeG2Qr9hx6B/0ZFJ08bTBocv5XFWXqBmrt/D09NNPo7i4GK+99hpCQ0Ph7u6OMWPGNOjge/bsgY+PDzw9PbFo0aK7bj906BBCQ0Mhl8uxZcuWO26TyWQIDg5GcHAwIiIi6ranpqaiU6dO8PT0xOjRo6FUKhuUhail2ZGYA40AfD6qPWxNDcSOQzqqja0Jts/qiq6etnjrl7N4N/ocVGqN2LGIiIga7NM9l5BWWIFPRwaxb2ET1MPbDjk3q3Alv0zsKESkJfUWnt555x1YWlri2WefRXp6Oi5evIgPP/yw3gOr1WrMmjULu3fvRlJSEjZs2ICkpKQ79nFzc8Pq1asxduzYu+5vZGSExMREJCYmYseOHXXb58+fj7lz5yIlJQVWVlZYsWJFQ54nUYsTnZiNDq2t4GptLHYU0nHmhnpYOTkMz3dvgzXH0zFpZSyS80r5zSQRETV5x68UYvWxNEzu4o4ubW3FjkP30MPLDgBw8DL7PD1IZlEFXt10GtGJ2ahUqsWOQ/RQ6i35q9Vq/Pbbb0hLS0NNTU3d9nnz5j3wfrGxsfD09ISHhwcAIDIyEtHR0fDz86vbx93dHQAgldZb/wIACIKA/fv3Y/369QCASZMm4b333sPMmTMbdH+iluLitRJcvFaKD4b6ix2FmgmZVIIFQ/zg7WCGBb+cQ/8vD8HD1gT9/R0w0L8Vgl0sIeUKQURE1ISUVdfgtS2n4W5jjNcH+Ygdh+7D1doYHrYmOJycj2nd2ogdp8n6en8ytiZkYWtCFkz0ZXgq0BEjQl3QqY0134NRk1dv4emZZ56BoaEhAgMDG1wgAoDs7Gy4uv69YoSLiwtiYmIafP+qqiooFArI5XK88cYbGDZsGAoLC2FpaQm5XF53zOzs7Hvef/ny5Vi+fDkAID+fzeqoZYlOzIFMKsGQQEexo1Az85zCFd297PBH0j
XsTcrDisOp+P7gVdiZGaC/nwMG+Dmgc1sbrpxIRESi+8+uC8gursTmFzrDWJ9T7JqyHt52iIrLQJVKDUM9vof4X/ml1dh+Kgfjw93wdJATtiVkYdfZa9h8MgvOlkYYHuKM4aHOaGtnKnZUonuq9xU4KysLZ86ceRJZ7pCeng5nZ2dcvXoVffr0QWBgICwsLBp8/xkzZmDGjBkAAIVCoa2YRE2ORiNgR2IOenjZwoa9nUgLWlkYYkJnd0zo7I6bFSr8dek69iZdw/ZT2VgfkwEzAzm6ednCwdwQxvoymBjIay/15TA2uHWpL4OeXIoqlRpVKjUqlGpUKtWoVNVeVihrt5sayNHF0xbtXSwglzX8yw8iImrZDl3Ox/qYDDzfvQ0U7tZix6F69PC2xepjaYhPu4FuXpwS+b9+OpEOlUaDad080MbWBOEeNng/IgB7k65hW0I2vj2QgqV/paC9qyWmdHHHsBBnsSMT3aHewtPgwYOxd+9eDBgw4KEO7OzsjMzMzLrrWVlZcHZu+D+A2/t6eHigV69eOHXqFJ599lkUFxejpqYGcrn8oY9J1BLEp99AdnElXhvIIeWkfRbGehgW4oxhIc6oUqlx7EoB9p7Pw9ErBTiaUoBypRpqzaP1gjKQS6FUa/D5H5dhZihH17a26O5tix5eduxdRkRE91VSpcL8rWfQ1s4Erw7g+yFdEO5hA32ZFIeS81l4+h9VKjV+PpGOfu0c0MbWpG67kb4MQ4OdMTTYGddLqhCdmINN8Zl4ZWMiWlkYItzDRsTURHeqt/AUHh6O4cOHQ6PRQE9PD4IgQCKRoKSk5IH3CwsLQ3JyMlJTU+Hs7IyoqKi63kz1uXHjBoyNjWFgYICCggIcPXoUr7/+OiQSCXr37o0tW7YgMjISa9aswdChQxv2TIlaiOjEbBjpydDfz0HsKNTCGOrJ0MfXAX18/z73BEGAUq1BRbUa5coaVCjVKK+uvVTWaGCoJ4ORvgzG+jIY3fqzkV7tj1QqwY1yJY5dKcTh5HwcupyPPeevAQDcbYzR3csO3b1s0dXTFiZcpYiIiG758Nck5JVUYdtLXTltS0cY68uhcLfCocv5eOupdmLHaVK2JWSjqFyJ6Q/of2Vvbojne3hgfHhr9PviIBZGn8NvL3eHHkeLUxNR7zv1efPm4fjx4wgMDIRE0vCmZXK5HEuXLsXAgQOhVqsxdepU+Pv7Y+HChVAoFIiIiEBcXByGDx+OGzdu4Ndff8W7776L8+fP48KFC3jhhRcglUqh0Wjwxhtv1DUl/+STTxAZGYm3334bISEhmDZt2qM/e6JmRlmjwW9nc9Hfz4EfxKlJkEgkMJDLYCCXwcpE/6Hvb2WijyFBjhgS5AhBEHC1oByHL+fjcHIBtiZk4acT6bAy1sOs3p4YH96aHzCIiFq4fRfysPlkFmb1botgV0ux49BD6OFth0W7LyKvpAoO5oZix2kSNBoBK45cRaCzBTq2qX/KqJG+DO9F+OP5tfFYdTQVM3q0fQIpieonEepZD7tHjx44cODAQzUWb2oUCgXi4+PFjkGkdX8m5WH62nisnKy4Y9QJUXOkrNEgLq0Iyw5eweHkAjhZGOKVft4YEerMflBERC1QcYUS/b88BBsTfUTP7sqFLnRMUk4JnlpyGP8dGYTnFK7136EF+OvidUxZHYevIoMxNLjhLWamrY7D8auF2PdqTzhaGGkxIVHD6i31Dom43WNp8ODBMDD4u1HxvHnzHj8hETWq6NM5sDLWQ3cvO7GjEGmdvlyKrp61U+2OpRTgk98v4fWtZ/D9oSv41wAfDApo9VAjdYmISLe9u+M8bpQrsXpKGItOOqidoxnszAxwKLmAhadbfjxyFa3MDfHUQ65U/V6EP/p9cRAf7kzCt+M6aCkdUcPV+5VwmzZt0LdvXyiVSpSWltb9EFHTUlZdgz+SrmFIkCPnc1OL08XTFttf6oJl4ztAIpFg5roEDPvmKI6mFIgdjYiInoAtJ7
MQnZiD/+vjBX+nhq+ETU2HRCJBdy9bHEnOf+SFSZqTpJwSHE0pxOSu7g/93t7V2hize3ti19lrOHg5X0sJiRqu3hFP77777pPIQUSPae/5a6hSaTDsIYbhEjUnEokEgwJaob+fA7YlZGHxn8kY92MMunra4I1B7RDowg8iRETNTVl1DT78NQkb4zMR5m6Fl3qzp40u6+lth20J2TiXfRPtW3iPrhVHUmGsL8OYMLdHuv+Mnh7Ydiob70afw55XerAPJomKwyKImonoxBy4WBmhQ2srsaMQiUomleA5hSv2/6snFj7thwu5pXhm6RHM25iI3JuVYscjIqJGcjK9CE99dRibT2bipV5tsW56OEd967iunrYA0OJH6eSVVGHH6WyMUrjCwljvkY5hIJfhg6H+SCuswPcHrzZyQqKHw1dmomagoKwaR1IKENHeiT1tiG4xkMswtVsbHHytF2b2aoudZ3PR+7MD+HzvJZRX14gdj4iIHpFKrcFnv1/Cc8uOQyMI2PhCZ7w+yBf6cn600XW2pgbo6G6NbQlZqGcNrGZt7fE01GgETO3a5rGO093LDkOCHPHNgRSkF5Y3Ujqih8dXZ6JmYOfpHKg1AoaFcJod0f8yM9TD/EG+2DevJwb4tcLX+1PQ67MD2BiXwR4SREQ6JuV6GUZ8ewxL/0rBiFAX7J7THWHu9S8zT7pjTCdXpBVW4PjVQrGjiKJCWYN1MRkY6NcKbjbGj328d4b4QU8qwXs7zrfoYh6J65EKTx988EFj5yCixxB9OgftHM3h7WAmdhSiJsvV2hhLxoRg20td4GplhPlbz2LIksM4nNyyh/MTEekCQRCw5lgahiw5jKwbFVg2PhSfPdceZoaPNg2Jmq7BAY6wMNLD+pgMsaOIYmtCNoorVJje/fFGO93WysIQc/t7469L+diblNcoxyR6WPU2F7+XH3/8EQsXLmzsLET0CNILy3EqoxhvDPYVOwqRTgh1s8LWmV2w6+w1LNpzARNWxKKHtx3aOZoBAnD7u0BBECD847qhnhRDAp3g52QuVnQiohalSqVGemEFUgvKsT42A4cu56Ontx3+OzII9uaGYscjLTHUk2FEqDN+PpGOwrJq2JgaiB3pidFoBKw8kor2rpaN2rd1Uhd3bDmZhfd3nEd3L1sY6z9SGYDokd33jDM3v/cba0EQUFnJ5qxETcWOxBwAQER7J5GTEOkOiUSCIUGO6OdnjzXH0vD9was4cbUQkrrbAQkkty5rVddo8M1fVxDiZonxnVpjSJAjV4ghohbtZoUKJVUqaAQBao0AjVD7WUEj4Nb12gK+RFL7I5VIbv3Uvg5LJbULQqjUAjJvVCA1vxypBeVIKyzH1fxy5NysxO2ZQYZ6Unw41B/jw1uzn2ULMLajG1YdTcPWhCzM6NFyVircf/E6UgvK8fWYkEY9z/VkUnw4LADPLTuOr/enYP4gfmFNT9Z9C0+WlpaIi4uDg4PDXbe5urpqNRQRNYwgCNiemI2ObazhZGkkdhwinWMgl2FGj7YNelN7s0KFLQlZWBeTjlc3n8YHO5MwsoMLxnVyg4ed6RNIS0Qkjhq1BlcLynEhtwQXr5XWXuaW4lpJVaM/lpmhHB62Jghzt0IbW1e42xrDw9YUHnYmMDHgKI2WwsvBDGHuVtgQm4nnu3u0mGLjD4evwtnSCIMDWjX6scPcrTGygwt+OHQVz4Y6w9OeLTroybnvq/fEiRORnp5+z8LT2LFjtRqKiBrmfE4JruSXY1o3D7GjEDV7FsZ6mNatDaZ2dceJq0X4OSYda46lYcWRVHRpa4Px4a3R38+BS3kTkc5T1miw+1wuDicX4EJuCZKvl0FZowEA6MkkaGtnis5tbeDbygw2pgaQ3h7NJJX8/edbl7cLBrWjn2pHQ2mEf46OEiCBBC5WRmhjawJrE/0WU2SgBxvT0Q3zNp3G8auF6NLWVuw4Wnc26yZiUovw9pB2kGvpvcSbg33xR1IeRi47Dt9WZn
C3MYGbjTHcbUzQ2sYYrW1MYNqAAq9KrYFKreGUPWowidACWtsrFArEx8eLHYOo0X30WxJWH0tD3IJ+sDTWFzsOUYtzvbQKm+OzsD4mA9nFlXC2NMIPExXsA0VEOqm4Qol1MRlYezwNeSXVsDXVRztH81s/ZvBtZY62dqbQl7PATtpXpVKj03/2oYe3Hb4eEyJ2HK17JeoU/rxwHcfe7ANzLTbNj7laiI3xmUgvrEB6YTkKypR33G5raoDWNsYw1JOiUqlGhVKNKlXtZaVKjUqlGjW3VgUeEeKM/4wIZOuBFq4h9RaWKIl0lFojYMfpHPT0tmfRiUgk9maGmNXbEy/2bIuDl69jwS/n8NyyY1g6LhS9fezFjkdE1CBX88uw8mgqtp7MRqVKje5etvjk2SD08LKDVMrRRySOO5uM+zXrJuNZNyqw80wuJnVx12rRCQA6edigk4dN3fXSKhXSCyuQUVSBtMJypBfUXlYq1TDWl8PG1ABGejIY68tgpC+r+3NBmRKrj6UhJb8M30/oAEcLtv2g+2PhiUhHxVwtRF5JNd55mk3FicQmk0rQx9cBv7xkgamr4zB9TTw+GOqPcZ1aix2NiOieBEHA8auFWHE4FfsuXoe+TIphIU6Y2q0NfFtx1CY1DWNaSJPxT/dcglwmwfTubZ74Y5sZ6iHA2QIBzhYPfd8ubW0wd2Minvn6KL6fEIoOra21kJCaA46TJdJR0Yk5MNGXoV+7u/uwEZE4WlkYYtOLndHDyxYLfjmHj3dfgEbT7Ge0E5GOOZJcgKeWHMHYH2KQmFmMOX29cPSNPvh0ZHsWnahJ8XYwg6J1bZPx5tohJiHjBnaczsGM7h46N2pogH8r/DKrK0wMZIhcfgJRsRliR6ImqkGFp4SEBCxZsgRff/01EhIStJ2JiOpRXl2DXWdzMSiAy7kTNTWmBnL8MFGB8eFu+P7gVfzfhlOoUqnFjkVEBLVGwJd/XMaElTGoUqnxybOBOPpGH8zt7w07s+Y7jYl029hObkgtKMfxq4ViR2l0giDg3zuTYGdmgBd66uaILm8HM0TP6opwDxu8se0sFkafg0qtETsWNTH1Fp4++OADTJo0CYWFhSgoKMCUKVPw73//+0lkI6L7+PV0DkqrazCmo6vYUYjoHuQyKT4cGoAFT7XDrnO5GPvDCRSWVYsdi4hasMKyakxeFYuv9iVjeIgzdr3cHaPD3PgFFjV5TwU6wtxQjg2xmWJHaXS/nc1FQkYxXhvgA5MGrCbXVFka62PV5DDM6OGBtcfTMWFFDN/30B3qXdXOx8cHp0+fhqGhIQCgsrISwcHBuHTp0hMJ2Bi4qh01NxFLj6BapcGeV7pzyWGiJm732Vy8sjERDuaGWDUlDG3tTMWOREQtzMn0IsxadwpFFUq8H+GPyDBXvn8gnfLejvNYF5OOE2/2bTZNxqtUavT74iDMDPWw8/+6QdZMGvn/cioL87eehZ2pAZZP7AB/p4fvHUW6pSH1lnpHPDk5OaGqqqruenV1NZydnR8/HRE9kjNZxTiTdRPjwt34ppFIBwwOdMSGGeEor67BiG+P4dDl/Gbbp4KImhZBELDiSCpGf38C+nIpts3sgjEd+f6BdM/YTm5QqQVsTcgSO0qjWX0sDVk3KvH2kHbNpugEAMNDXLD5hc5QawSM/O44dp3NFTsSNQH1Fp4sLCzg7++PyZMnY8qUKQgICIClpSVefvllvPzyy08iIxH9w/qYDBjpyTAshAVgIl0R6maFX17qCltTfUxcGYuIpUexOT6TvZ+ISGtKq1R4aV0CPtyZhN6+9vj1/7o90qpVRE1Bc2syXlBWjW/2p6BfO3t09bQVO06ja+9qiR3/1xW+jmZ4aV0Cvt6X3Cz+3ujR1TuRdPjw4Rg+fHjd9V69emkzDxE9QEmVCjtO5yCivRPMDfXEjkNED8HNxhi//l83bE3IxtpjaXhtyxn8Z9cFRHZ0w/jw1nC21K2VbIio6bqQW4KX1iUgo6
gCbw72xYweHhzlRDpvTEc3vLr5NI5fLUSXttot1lwvrcLus9fw25lcmBrWLhrSmKOSFv95GZUqNd58ql2jHbOpsTczxIbnw/HmtrP4/I/LSL5ehk9HBrGvXAtVb+Fp0qRJUCqVuHz5MoDank96evzASySG6FPZqFCqMS7cTewoRPQIjPXlmBDeGuM7ueH4lUKsPpaG7w9ewfcHr6C/nwMmdXFHZw8bfkAkokciCAI2x2fhnehzsDDSw4bnw9GxjbXYsYgaxZAgR7z/63lsiM3USuGpqFyJ3edysfN0LmJSC6ERABcrI2SlVSIqLgPjOrVulMe5nFeK9TEZmBDeutn3fTTUk+GLUe3haW+K//5+CelFFfhhQgfYmxuKHY2esHoLTwcOHMCkSZPg7u4OQRCQmZmJNWvWoEePHk8iHxHdIggC1sVkIMDZHEEulmLHIaLHIJFI0MXTFl08bZFZVIF1MRmIisvA7+fz4O1gis4eNjDUl8FQLoOhngxGelIY6sn+8SOFIAAqtQYqtYAaTe2lSq1Bza1tGkGAmaEcVsb6sDLRh5WxHqyM9WFhpAe5rN6Z9kSkY8qqa/D2L2exPTEHXdraYHFkMOzN+OGOmg9DPRlGhLpgfUwGCsuqG6XJ+M0KFX4/fw2/nsnBsSuFUGsEeNiZYHYfLzwd5Agve1NELj+BT/dcwuAAR1ib6D/2Y/5n1wWYGMgxp5/3Yx9LF0gkEszq7Ym2dqaYuzERQ785ih8mKjj1t4Wpd1W7Dh06YP369fDx8QEAXL58GWPGjMHJkyfrPfiePXswZ84cqNVqTJ8+HW+88cYdtx86dAivvPIKzpw5g6ioKIwcORIAkJiYiJkzZ6KkpAQymQwLFizA6NGjAQCTJ0/GwYMHYWFRe6KuXr0awcHBD8zBVe2oOTiZfgPPfncMH48IxJiOHPFE1NxUqdTYcToH606kI7WgHFU1GihrNFp5LHNDOaxM9OFkYYTIjq4YEujIYhSRDjufcxOz159CemE5XunnjVm9PZtVs2Ki2y5dK8XAxYfw1lO+mNGj7SMfp6y6Bp/vvYSfT6RDpRbgZm2Mp4Mc8XSQE9o5mt0x8vjStVI8teQwRilc8PGIoMfKf/ByPiatjMWCp9rh+R4ej3UsXXQu+yaeXxuP4goVvhzdHoMCHMWORI2gIfWWekc8qVSquqITAHh7e0OlUtX74Gq1GrNmzcIff/wBFxcXhIWFISIiAn5+fnX7uLm5YfXq1fjss8/uuK+xsTHWrl0LLy8v5OTkoEOHDhg4cCAsLS0BAP/973/rilRELcW6mHSYGsgR0d5J7ChEpAWGejKMUrhilMK1bptaI6C6Ro0qlQaVKjWq/vEjkUigL5NCLpNALpX+/WdZ7XaJRIKSShWKK1S4UaGs/SlX4kaFCsUVShRVqHAu+ybmRCXi0z2XMK1bG4wOc4WJQb1vDbQip7gSao0AY30ZTAzkMJBLOeWQqB6CIODnE+n48LcLsDLWw/rnwxHuYSN2LCKt8Wllhg63mow/3/3Repftv5iHt385h9ySKkSGuWJMRzcEOlvc91g+rcwwpYs7VhxNxegwNwS7Wj5S9hq1Bh/9lgQ3a2NM7NI40/Z0TYCzBaJnd8WMtSfx4s8J+NeA2kI5/79v/up9d6lQKDB9+nSMHz8eALBu3TooFIp6DxwbGwtPT094eNRWciMjIxEdHX1H4cnd3R0AIJXe+S2rt/ffww6dnJxgb2+P/Pz8usITUUtTXKHEzjO5GKVwEe1DIRE9eTKpBMb6chg/4sh+CyM9uD6gvYtGI2DfxetYfugKPtiZhMV/Xsb48NaY3NX9iUzRySmuxK+ncxCdmIOk3JI7bqt97jKY6MthbFB7aWOqj8+fa98o0yuIdF1JlQpvbD2DXWevoae3Hb4YxX8b1DKMvdVk/Ms/LmNCZ3fYmTXsvL9eWoX3f03Cb2dy4e1gii1ju6BDa6sG3XdOPy9En87Bwuhz+OWlro80onBTfB
Yu55Xhu3GhMJC33Abb9maGiJoRjvlbz+CzvbVNx/8zPJCfcZq5ev92v/vuO3zzzTdYsmQJAKB79+6YNWtWvQfOzs6Gq+vf39q6uLggJibmoQPGxsZCqVSibdu/h1IuWLAAH3zwAfr27YtFixbBwODuF5vly5dj+fLlAID8/PyHflyipmRrQjaUNRqM7dgyvx0hIu2QSiXo7+eA/n4OSMi4geUHr+K7g1fw4+FUDA9xxvM9POBp37iNT2+UK7HrXC6iE3MQm1oEoHbZ5beHtIOlsT7Kq2tQrqxBRbX6jsuSqhocuJSPXeeuYUI4XwupZTudWYzZGxKQU1yFNwb7YkZ3D0g5tY5aiCFBjvjlVDaW7E/BtweuoI+vPUYpXNHLx+6e08Y1GgEb4zPx8a4LqKrR4F8DvDGjR1voyxs+xdzMUA9vD2mHOVGJj9RovLRKhS/+uIQwdysMCmj1UPdtjgz1ZFg8OhjeDmb4bO8lnM4sxleRIWj/iKPJqOmrt/C0bNkyzJs3D/Pmzavb9tVXX2HOnDlaDQYAubm5mDBhAtasWVM3Kurjjz9Gq1atoFQqMWPGDHzyySdYuHDhXfedMWMGZsyYAQANGqFF1FTVNhVPR4ibJfyczMWOQ0TNVKibFZZN6IDUgnL8ePgqtpzMwsb4TATeav6pUtf2nKqu0dT++dZ1ZY0GhnoyOJgbwMHcEK3MDeFgYQgHMwO0sjCEg7khbE0NkJBxAzsSc3Dwcj5qNALa2png1f7eeKa9E9xtTerNJwgCen12APsv5LHwRC2WIAhYeTQNi3ZfgL2ZITa90LnBIzaImgtDPRl+nt4JKddLsSk+C9sSsrA3KQ92ZgZ4NtQFzylc6laLS7lehrd+OYvY1CJ0amONj0cEwuMRV5KLaO+E9TEZj9Ro/LsDV1BQpsSKSWGcVnbL7abjoW5WeHVTIp797hhe6eeFmb3Yo645qrfwtGbNmruKTKtXr6638OTs7IzMzMy661lZWXB2dm5wsJKSEgwZMgQfffQRwsPD67Y7OtY2IDMwMMCUKVPu6g9F1NycuFqEq/nl+Oy59mJHIaIWoI2tCT4aHoi5/b3x0/F0nEy/AT2ZBPpyKfRkUujLpTC4/edb1yuUalwvrcK1m1WISS3C9dIqqNR3r13iaGGIad3aICLYCX6O5g/15lsikaCvrwN+jklHhbIGxvockk8tS5VKjTe2nsH2xBz093PAf0cGwfJR5+ESNQOe9mZ466l2eG2gD/66eB2b4rPww+GrWHbwChStrRDgbIH1MRkw0pfh02eD8JzC5bGKPhKJBB8MDcBTSw7jv79fbFCjcUEQsOpoGr4/dBXDQ5w5ouceOre1we45PbBg+1l8tvcyDl0uwBej28PFyljsaNSI7vuubcOGDVi/fj1SU1MRERFRt720tBTW1g9oGHFLWFgYkpOTkZqaCmdnZ0RFRWH9+vUNCqVUKjF8+HBMnDjxribiubm5cHR0hCAI2L59OwICAhp0TCJdtT42A+aGcjwdxFUfiOjJsTU1wNz+j7bUs0YjoKhCiWs3q3C9tAp5JdVoY2uCju7WjzUdqG87e6w8mopjKYXo5+fwyMch0jX5pdV44ad4JGQUsxkv0f/Qk0kxwL8VBvi3wvWSKmw7lY1NcZlYfSwNTwc54t1n/BvcB6o+/2w0PkrhihC3+484rFSq8ea2v4vFHwz1b5QMzZGFsR6+HhOCPr72WBh9HoMXH8a/hwdgaHDDB65Q0yYRBOHuryQBpKenIzU1FW+++SYWLVpUt93MzAxBQUGQy+v/pnHXrl145ZVXoFarMXXqVCxYsAALFy6EQqFAREQE4uLiMHz4cNy4cQOGhoZo1aoVzp8/j59//hlTpkyBv//f/zhXr16N4OBg9OnTB/n5+RAEAcHBwVi2bBlMTR88XLIhy/sRNUUFZdXo/PE+jA9vjXef4X9WRNSyKWs0CP3wDzzT3vGxl7Qm0hVJOSV4fm08Csur8eWoYAwO5B
dRRPURBAHFFSpYPcR0uIYqrVKh7+cH4WBuiO2z7t1oPLOoAi/8dBIXrpVgXr/aYjH7sDVMZlEFXtmYiJPpNzA02AkfDguAuaGe2LHoARpSb7lv4ak5YeGJdNWyg1ewaPdF/DmvBzztzcSOQ0QkupfWnUR82g3EvNWXIz6o2fsjKQ9zok7B3FAPP05SIOBWzzUiEld0YjbmRCXio+EBdzUaP3Q5Hy9HnYJGI+CryBD09rUXKaXuqlFr8O2BK/hqXzJamRticWQwwtzrn3VF4mhIvaXhrfyJ6InSaASsj8lAxzbWLDoREd3Sx9cB10urcT6nROwoRFojCAKWHbyCGT/Fw8veFDtmd2XRiagJiWjvhE5trPHpnksoKlcCqP13+92BK5i8KhYOZobYMbsbi06PSC6T4uW+XtjyYmfIZRJELj+BVUdT0QLGzDRbLDwRNVFHUgqQUVSBcZ3cxI5CRNRk9Paxg0QC7LtwXewoRFpRXaPGvzafwaLdFzEk0BEbX+gMe3NDsWMR0T/cbjReVl2D//5+EeXVNZi1PgGf7LmIwYGO2PZSlwat2EoPFuJmhZ3/1w19fO3x/q9J+NfmM6hSqcWORY+gQUvCKJVKXL58GQDg4+MDPT3OsSTStvUxGbA20ceggFZiRyEiajJsTA0Q4mqJfRfzMKefl9hxiBpVYVk1XvjpJOLTb2BuP2+83JdNxImaqn82Gj9+pRAZRRV46ylfPN/dg/9uG5GZoR6+H98BS/YnY/GfyUi5XoplEzrA0cJI7Gj0EOod8XTgwAF4eXlh1qxZeOmll+Dt7Y1Dhw49iWxELVZeSRX+uJCH5zq4wEAuEzsOEVGT0redA85k3cT1kiqxoxA1mmMpBYhYehTncm7im7GhmNPPix9eiZq4Of28YG9mgJuVKvw0rRNm9GjLf7daIJVK8Eo/b3w/oQNSrpfhma+PIj6tSOxY9BDqLTy9+uqr2Lt3Lw4ePIhDhw7h999/x9y5c59ENqIWKyo2E2qNgDEdOc2OiOh/9bnVM+OvS5xuR7qvrLoGC345i7E/xkBfLsWmFzpjSBBXriPSBWaGevh1djfse7UXunraih2n2Rvo3wrbZ3WFqYEMY344gXUx6WJHogaqt/CkUqng4+NTd93b2xsqlUqroYhasms3q/DD4avo62vPueFERPfg28oMzpZG7PNEOu9YSgEGfnkI62MzML1bG+ye0x1BLpZixyKih2BvbghrE32xY7QYXg5miJ7dDV3a2mLBL+fw5razUNZoxI5F9ai3x5NCocD06dMxfvx4AMC6deugUCi0HoyopXpvx3mo1BosfMZP7ChERE2SRCJBH197bE3IQpVKDUM9Tkkm3VJWXYOPd13AupgMeNiaYMuLndGhNZcKJyJqCAsjPaycHIbP9l7Cdweu4HJeKb4bHwp7My7E0FTVW3j67rvv8M0332DJkiUAgO7du2PWrFlaD0bUEv2RlIc956/h9UE+aG3D0U5ERPfTp509fjqRjhNXC9HLh8tVk+44llKA17acQc7NSkzv1gb/GujD4ikR0UOSSSWYP8gX/k7meG3zGTz11RF8Pqo9enrbPdLxBEHAqcxiSCUSOJgbwNbUAHqyeieINYhaIyCvpAqZRRXIvFGJjKIKyKUSvNy35SySUm/hadmyZZg3bx7mzZtXt+2rr77CnDlztBqMqKUpq67Bwuhz8G1lhue7e4gdh4ioSevsYQMjPRn2X7zOwhPpBI5yIiJqfE8HOcHT3hQvbziFSStjMb1bG7w2yOehFmhKKyjHO9HncDi5oG6bRALYmOjD3swQ9uYGcDAzhIO5AWxMDSCVSgBBgHBrX0G4fVn7hwqVGlk3KmsLTUUVyC6uhEot3HHsIGeLFlV4kgi3fzv3ERoaioSEhDu2hYSE4NSpU1oN1pgUCgXi4+PFjkH0QO//eh6rj6Vh68wuCHWzEjsOEVGTN31NPC7kluDI/N5cRYiatH0X8rAw+jxyblZiWleOci
IiamxVKjX+s+sC1h5Ph5+jOZaMCYGnvekD71Ndo8ayA1fxzYEUGMikmNvfG61tjJFXUo28kipcL63G9ZIq5JVW4XpJNQrKqqF5YPXkb9Ym+nC1MoKLtTFcrYzham1069IYTpaGzWrl8obUW+474mnDhg1Yv349UlNTERERUbe9tLQU1tb8doaoMZ3JKsaaY2kY36k1i05ERA3Ur509/ryQh8t5ZfBpZSZ2HKK7XLtZhfd/PY/d567B28GUo5yIiLTEUE+GD4YGoLuXHV7fchrPfH0E7z7jh9Fhrvf8cupYSgHe3n4OVwvK8XSQI9552g8O5g/uEVWj1qC4UlU3wkkiAW4f+fZjSADoy6UwMah3clmLct/fRpcuXeDo6IiCggK8+uqrddvNzMwQFBT0RMIRtQQ1ag3e2HoWtqYGeG2QT/13ICIiAEBv39opdn9eyGPhiZoUtUbAzyfS8d/fL0Gl1uD1QT6Y3s0D+vLG6RdCRET31t/PAXte6YF5mxLxxrazOHg5Hx+PCISlce3Kg/ml1fjPrgv45VQ23KyNsWZqxwb3hZLLpLA1NdBm/GbrvoWn1q1bo3Xr1jh+/PiTzEPU4qw6moak3BJ8Oy4U5oZ6YschItIZDuaGCHS2wP6L1zGrt6fYcYgAAOdzbuKtbWdxOusmunvZ4t/DArhgCBHRE+RgboifpnbCD4ev4r+/X0JiZjE+H9UeqQXl+GT3RVSq1Pi/Pp6Y1duT056fEI7/IhJRZlEFvvjjMvr62mNwQCux4xAR6Zw+vvZYsj8ZReVKWJvoix2HWrDy6hos/vMyVh5Ng5WxHr6KDEZEeyf2HyMiEoFUKsELPduic1sbvLzhFMb+EAMACPewxr+HBdbb/4kaFwtPRCIRBAELo89BIgE+GBbAN6ZERI+gXzsHfLUvGQcuXceIUBex41ALdTSlAK9vOYPs4kqM6eiGNwb5wsKYo5iJiMQW5GKJ317ujqV/pcDbwRTDgp35uUsE9U40/+qrrxq0jYgezm9nc/HXpXzM6+8NZ0sjseMQEekkfydz2JsZYN/F62JHoRZqc3wmJq2MhaGeFFte7IyPRwSy6ERE1ISYGMgxf5Avhoe4sOgkknoLT2vWrLlr2+rVq7WRhajFuFmpwvu/JiHA2RyTu7iLHYeISGdJpRL08bXHoUv5UNZoxI5DLYggCFi6PxmvbTmDTh7W2D6rKxTuXLGOiIjof913qt2GDRuwfv16pKamIiIiom57SUkJrK35nyrR4/hkz0UUllVj1eQwyGVc4YaI6HH08bVHVFwm4tOK0MXTVuw41AKoNbXT5dfFZGBYsBM+HdmeK9YRERHdx30LT126dIGjoyMKCgrw6quv1m03MzNDUFDQEwlH1BydTL+B9TEZmNatDQKcLcSOQ0Sk87p52UJfLsW+i9dZeCKtq1Sq8XLUKfyRlIcXe7bF6wN9IJVy6gYREdH93Lfw1Lp1a7Ru3Rp//vknjIyMIJVKcfnyZVy8eBGBgYFPMiNRs7Ls4BXYmupjXn9vsaMQETULxvpydGlrg/0Xr+Odp/3EjkPNWFG5EtPWxCExsxjvR/hjEqfLExER1aveMcE9evRAVVUVsrOzMWDAAPz000+YPHnyE4hG1PxcL6nC/ovXMbKDK0wMuKgkEVFj6etrj9SCclzJLxM7CjVTmUUVGPndMZzPKcF340JZdCIiImqgegtPgiDA2NgY27Ztw0svvYTNmzfj/PnzTyIbUbOzJSELao2A0WGuYkchImpWevvaAwD2X+DqdtT4zmXfxPBvj6GwXIl10zthUICj2JGIiIh0RoMKT8ePH8e6deswZMgQAIBarW7Qwffs2QMfHx94enpi0aJFd91+6NAhhIaGQi6XY8uWLXfctmbNGnh5ecHLy+uOlfVOnjyJwMBAeHp64uWXX4YgCA3KQiQ2QRCwKS4THdtYo42tidhxiIiaFRcrY/i2MsO+i3liR6FmRBAE/HYmF6O/Pw4DuRRbZ3ZGGF
euIyIieij1Fp4WL16Mjz/+GMOHD4e/vz+uXr2K3r1713tgtVqNWbNmYffu3UhKSsKGDRuQlJR0xz5ubm5YvXo1xo4de8f2oqIivP/++4iJiUFsbCzef/993LhxAwAwc+ZM/PDDD0hOTkZycjL27NnzMM+XSDQxqUVIK6xAJEc7ERFpRb92DohLu4Htp7LFjkLNQF5JFV746SRmrU+Ap70ptr3UBZ72ZmLHIiIi0jn1Npnp2bMnevbsWXfdw8MDS5YsqffAsbGx8PT0hIeHBwAgMjIS0dHR8PP7u+mnu7s7AEAqvbP+9fvvv6N///6wtq79Rql///7Ys2cPevXqhZKSEoSHhwMAJk6ciO3bt2Pw4MH15iES28a4TJgZyjGYw/OJiLTi+e4eiE0rwisbE3HhWgleH+gLGVcbo4ckCAI2xWfi379dgLJGgzcH+2JatzaQy+r9vpaIiIju4b6Fp1deeQWLFy/GM888A4nk7jdtO3bseOCBs7Oz4er698gOFxcXxMTENCjUve6bnZ2N7OxsuLi43LX9XpYvX47ly5cDAPLz8xv0uETacrNShV1nc/GcwgVG+jKx4xARNUsWxnr4eVonvP/reXx/8CouXyvFV2NCYG6oJ3Y00hHpheV4c9tZHLtSiE5trLHo2SBOjyciInpM9y08TZgwAQDwr3/964mFaUwzZszAjBkzAAAKhULkNNTS7UjMRnWNBpFhbmJHISJq1vTlUnw0PBC+juZ4f8d5DP/mKH6cFMbiAT2QWiNg1dFUfLb3EuRSKT4aHoAxYW6QcsQcERHRY7tv4alDhw4Aaqfa3R4xZGdn1+ADOzs7IzMzs+56VlYWnJ2dG3zfAwcO3HHfXr16wdnZGVlZWY90TCIxRcVlwt/JHAHOFmJHISJqESaEt0ZbOxPMWpeAoUuP4Jtxoeju1fD3MdRyXLxWgvlbz+J0ZjH6+trj38MD4GhhJHYsIiKiZuOBk9Xfe+892NrawsfHB97e3rCzs8MHH3zQoAOHhYUhOTkZqampUCqViIqKQkRERIPuO3DgQOzduxc3btzAjRs3sHfvXgwcOBCOjo4wNzfHiRMnIAgC1q5di6FDhzbomERiOZd9E+dzSjCaTcWJiJ6oLm1tsWN2NzhaGGHSylisOJLK1XAJGo2Ac9k38d2BKxiz/ASeXnIEmUUVWDImBD9OUrDoRERE1MjuO+Lpiy++wNGjRxEXF4c2bdoAAK5evYqZM2fiyy+/xNy5cx98YLkcS5cuxcCBA6FWqzF16lT4+/tj4cKFUCgUiIiIQFxcHIYPH44bN27g119/xbvvvovz58/D2toa77zzDsLCwgAACxcurGs0/u2332Ly5MmorKzE4MGD2VicmryNcZkwkEsxtD1H5xERPWmu1sbY+lIXzNuYiA93JuFibgn+PTwABnL222tJ8kurcSQlH4cuF+Bwcj4KypQAAN9WZpje3QMzenjA2kRf5JRERETNk0S4z1d/ISEh+OOPP2Bra3vH9vz8fAwYMACnTp16IgEbg0KhQHx8vNgxqAWqVKrR8T9/ol87B3w5OljsOERELZZGI2Dxn5exZH8K3KyNoXC3QoCTBQJdLODnaA4Tg3oX+iUdIggCzmbfxJ5z13DgUj6ScksAANYm+ujmaYse3nbo7mULB3NDkZMSERHptobUW+77LkulUt1VdAJq+zypVKrHT0fUAuw+l4vSqhqMUnCaHRGRmKRSCeYN8IG/swWiYjNw6HIBtiXUrowrkQBtbE0Q6GyBACcLBDhbwM/JHBZGXA1Pl9wuNv12Nhe7zuYis6gSMqkEHVpb4bWBPujuZYsAJws2DCciInrC7lt40te//3DjB91GRH/bGJcJdxtjhHtYix2FiIgADPRvhYH+rQAAeSVVOJd9E2ezb+JcdgliU4sQnZhTt6+zpRH8nMzRztEcfo5m8HO0gIuVEQsXTci9ik1yqQRdPW3xf729MMDfAZbGfN9KREQkpvsWnk6fPg1zc/
O7tguCgKqqKq2GImoOruaXISa1CK8P8oFEwg8pRERNjYO5IRzMDdG3nUPdtvzSapzLuYmknBJcyK392XchD5pbjQlMDeRo52iGdo7mGBzgiHAPa77G/8OF3BL8dek6qlUa1Gg0qFELUKprL2s0GqjUAmrUGqgFQILa0WZSiQQSAPjHn6USCe71a/3nthq1gBOphSw2ERERNXH3LTyp1eonmYOo2dkUnwWZVIKRoS5iRyEiogayMzNAbx979Paxr9tWqVTjUl5pXSEqKacEW09mYe3xdLR3tcTMnm0xwM+hxY6EUqk12Hs+D2uOpyE2tahuu0wqgZ5MAj2pFHKZBHKZFHpSCfTkUkglEgiCAAGAIAAaQcDtrqMaQYDmHi1I79WVtJ2jOf6vjxcG+LHYRERE1FSxkyaRFqjUGmxNyEJvH3vYs3EpEZFOM9KXIdjVEsGulnXbqlRqbDmZheWHruLFn0/Cw84EL/TwwLAQ5xazYl5BWTU2xGRgXUwGrpVUwdXaCAueaodnO7jAyliPI8GIiIgIAAtPRFrx18XryC+tRmQYm4oTETVHhnoyjA9vjcgwV+w+dw3LDl7B/K1n8cUflzGtWxuM6egGM8Pm2Zw8MbMYa4+lYeeZXCjVGnT3ssW/hwWgt689ZC101BcRERHdHwtPRFqwMS4T9mYG6OVjJ3YUIiLSIrlMimfaO+HpIEccTi7AsoNX8J9dF/H1/hRM7Nwas3p7wli/ebzdyiupwsyfTyIhoxgm+jKM6eiKCZ3d4WlvKnY0IiIiasKaxzshoibk2s0q/HXpOl7s2RZymVTsOERE9ARIJBL08LZDD287nM4sxrKDV/DtgSuIS7uBVZPDYGKg22+5NBoB/9p8GhdyS/F+hD9GhDo32xFdRERE1Lj4qZiokW1NyIJGAEYpOM2OiKglau9qie/Gd8BXkSGITyvClNVxqFDWiB3rsaw6lobDyQV4++l2mNTFnUUnIiIiajAWnogakUYjYGNcJjp72MDd1kTsOEREJKKI9k5YfLv4tEp3i08Xr5Xgkz0X0dfXHmM7uokdh4iIiHQMC09EjaS8ugaf/H4RGUUVGM2m4kREhNri05ejgxGXVoSpOjjyqUqlxitRiTA3lOOTkUFcqY6IiIgemm43HCBqAtQaAVsTsvDZ75dwvbQaQ4OdMDiwldixiIioiRga7AwAmLsxEdNWx2Pl5DAY6ctETtUwn++9hIvXSrFysgK2pgZixyEiIiIdxMIT0WM4dqUA/955AUm5JQi+1dOjQ2srsWMREVETMzTYGYIAzNuUiKmr43Si+HQ0pQA/HE7F+HA39PF1EDsOERER6SgWnogewdX8Mny8+yL+SMqDs6URlowJwTNBjpyCQERE9zUsxBkCBLy66TSmrYnDiklNt/h0s0KFVzedhoedCRY85Sd2HCIiItJhLDwRPYTiCiWW7EvB2uNpMNST4fVBPpjatQ0M9ZrmBwciImpahoe4AADmbTqN6Wvj8OPEpld8EgQBb20/i4Kyamyb2KXJ5SMiIiLdwsITUQMdSynAS+sTUFKpwugwN8zr7w07M/a7ICKihzM8xAWCALy6+TSeXxuPVVPCoCdrOuu9bE/Mxm9ncvHaQB8EuViKHYeIiIh0HAtPRA2wKT4Tb207Cw87E0TNCIdvK3OxIxERkQ4bEeqCKpUGb/1yFnvP52FIkKPYkQAAmUUVWLj9PBStrfBiz7ZixyEiIqJmoOl8vUbUBAmCgM9+v4TXt5xB57Y22DKzC4tORETUKEaHucLZ0ggbYjPEjgKgdpXWVzedhgDgy9HBkEnZt5CIiIgeHwtPRPdRpVJjTlQilv6VgsgwV6ycHAZzQz2xYxERUTMhk0oQGeaKIykFSC8sFzsOvj90BbFpRXg/wh+u1sZixyEiIqJmgoUnonsoKldi/I8x2HE6B/MH+eLjEYFNqv8GERE1D88pXCGVABvjMkXNcb2kCl/+cRlPBbbCiFBnUbMQERFR88JP0k
T/42p+GUZ8exRnsm/im7GhmNmrLSQSTjcgIqLG18rCEH18HbApPgsqtUa0HL+cyoZKLeDVAT78P4+IiIgaFQtP1KSVVKnw18XriE8rQsr1MhSVK6HWCFp7vNjUIoz47hhKqmqw4fnwJtPslYiImq+xnVxRUFaNfRfyRHl8QRCw+WQWQt0s0dbOVJQMRERE1HxxVTtqkgRBwPbEbHz020UUlFXfcZtEAlgY6cHaWB+WxnqwNtGHpbE+LI30YGWiDwsjPVga68HK+M4/y6QSVKnUqFSpUalUo0qlQaVKjapbP6kF5fh0zyW4WBth9eSOcLNhfwsiItK+nt72cLQwxIbYTAwKePJfeCRmFiPlehk+HhH4xB+biIiImj+tFp727NmDOXPmQK1WY/r06XjjjTfuuL26uhoTJ07EyZMnYWNjg40bN8Ld3R3r1q3Df//737r9zpw5g4SEBAQHB6NXr17Izc2FkZERAGDv3r2wt7fX5tOgJ+xyXine2X4OMalFaO9qic9HtYcEwI0KJW6UK1FUoUJxhRJF5UrcqFAip7gK53NKUFyhQqVK/ViP3amNNb6f0AGWxvqN82SIiIjqIZNKMErhiiX7k5FZVPHEG3tvPpkFQz0pnuYoXyIiItICrRWe1Go1Zs2ahT/++AMuLi4ICwtDREQE/Pz86vZZsWIFrKyskJKSgqioKMyfPx8bN27EuHHjMG7cOADA2bNnMWzYMAQHB9fdb926dVAoFNqKTiIpr67BV/uSsfJIKkwN5fjP8EBEhrlC+hDLOVep1LhZqULxreLUjQoVblbWXmoEAUZ6MhjpyWB468dIXwZDuRRG+jIY68vgYWv6UI9HRETUGEaFueLr/cnYFJ+JVwf4PLHHrVKp8evpHAwOcIQZV24lIiIiLdBa4Sk2Nhaenp7w8PAAAERGRiI6OvqOwlN0dDTee+89AMDIkSMxe/ZsCIJwR1PLDRs2IDIyUlsxqQkQBAG7zl7DhzuTcK2kCqMVrpg/2BfWJg8/6uh2QcnB3FALSYmIiLTD2dIIPb3tsCk+E3P6ekH+hFZS/f38NZRW1WBkB5cn8nhERETU8mjtXU12djZcXV3rrru4uCA7O/u++8jlclhYWKCwsPCOfTZu3IgxY8bcsW3KlCkIDg7Ghx9+CEG4d6Pp5cuXQ6FQQKFQID8/vzGeEmnBlfwyTFwZi1nrE2Bjqo9tL3XBJyODHqnoREREpMvGdHRDXkk1/rr05N63bDmZBWdLI3T2sHlij0lEREQtS5Ne1S4mJgbGxsYICAio27Zu3TqcPXsWhw8fxuHDh/HTTz/d874zZsxAfHw84uPjYWdn96Qi00M4fqUQgxcfRmJmMd6P8MeO2d0Q6mYldiwiIiJR9PG1h72ZATbEZjyRx8sursSRlAI828GF08yJiIhIa7RWeHJ2dkZmZmbd9aysLDg7O993n5qaGty8eRM2Nn9/4xYVFXXXaKfbxzAzM8PYsWMRGxurradAWlRSpcKrmxLhbGWEfa/2xKQu7pDxTS8REbVgcpkUo8NcceDSdeQUV2r98badzIIgAM9xmh0RERFpkdYKT2FhYUhOTkZqaiqUSiWioqIQERFxxz4RERFYs2YNAGDLli3o06dPXX8njUaDTZs23dHfqaamBgUFBQAAlUqFnTt33jEainTHe9HnkVdajS9GtYe9GfsxERERAcAohSsEAJviM+vd93EIgoAtCVkI97B+4qvoERERUcuitcKTXC7H0qVLMXDgQLRr1w6jRo2Cv78/Fi5ciB07dgAApk2bhsLCQnh6euKLL77AokWL6u5/6NAhuLq61jUnB4Dq6moMHDgQQUFBCA4OhrOzM55//nltPQXSkt1nc7HtVDZm9fZECKfWERER1XG1NkZ3LztsjMuEWnPvPpaNIS7tBtILK/BcB9f6dyYiIiJ6DBLhft25mxGFQoH4+HixYxCA6yVVGLj4EFytjbF1ZhfoPaFVe4iIiHTFnnO5ePHnBKycrE
AfXwetPMZrm09j19lcxL3dD8b6WlvkmIiIiJq5htRb+KmfnhhBEDB/6xlUKNX4YlQwi05ERET30LedA2xNDbAhVjvT7cqra/Db2VwMCXJk0YmIiIi0jp/86YnZEJuJvy7l443BvvC0NxU7DhERUZOkJ5PiOYUL9l+8jms3qxr9+LvO5qJCqcZzCk6zIyIiIu1j4YmeiLSCcvz7tyR09bTBpM7uYschIiJq0iLDXKHWCNishSbjm09moY2tCRSt2WeRiIiItI+FJ9K6GrUG8zYlQiaV4L8j20MqlYgdiYiIqElrbWOCrp42iIrLhKYRm4ynF5YjNrUIIzu41K0kTERERKRNLDyR1n1/6CoSMorx4dAAOFkaiR2HiIhIJ4zp6Ibs4kocTilotGNuPZkFiQQYHuLcaMckIiIiehAWnkirzmXfxJd/XMaQIEcMDXYSOw4REZHOGODXCjYm+tgQk9Eox9NoBGxNyEY3T1t+EURERERPDAtPpDVVKjXmbkyEtYk+PhoWwCH9RERED0FfLsWoMFf8nnQN0YnZj328Y1cKkV1cyabiRERE9ESx8ERa89nvl5B8vQyfjgyCpbG+2HGIiIh0zpy+Xujobo1XN53G/ot5j3WszSczYW4oxwA/h0ZKR0RERFQ/Fp5IK1Kul2HF0VSM6+SGXj72YschIiLSSYZ6Mvw4SQE/J3PM/DkBJ64WPtJxSqpU2HPuGiKCnWCoJ2vklERERET3x8ITacWaY2nQk0oxt7+32FGIiIh0mpmhHlZP6Qg3a2NMXxOPM1nFD32MnadzUV2jwcgOnGZHRERETxYLT9ToblaqsDUhC8+0d4KtqYHYcYiIiHSetYk+fprWCZbGepi0MhbJeaUNvu+ZrGL8cPgqvOxN0d7FQospiYiIiO7GwhM1us3xmahQqjG5i7vYUYiIiJqNVhaGWDe9E+QyKcaviEFmUcUD988prsS8jYmIWHoUJZUqvP20Hxf6ICIioieOhSdqVGqNgLXH06FobYVAfqtKRETUqFrbmODnaZ1QpdJg/IoYXC+pumuf8uoafL73Enp/dgA7z+ZiZq+2+Ou1XujpbSdCYiIiImrpWHiiRvXXxevIKKrA5K7uYkchIiJqlnxamWH1lDDkl1ZjwopYFFcoAdR++RMVm4Fenx3A1/tTMNC/FfbN64n5g3xhbqgncmoiIiJqqeRiB6DmZfWxNLQyN8RA/1ZiRyEiImq2Qtys8ONEBSavjsPkVXGY3dsTn+29hIvXShHqZonvJ3RAqJuV2DGJiIiIOOKJGk9yXimOpBRgQufW0JPx1CIiItKmLp62WDomBGezb2L62niUVddg6dgQbJ3ZhUUnIiIiajI44okazepjadCXSxEZxqWaiYiInoQB/q2wfEIHZBZVILKjGwz1ZGJHIiIiIroDC0/UKG5WqLAtIRtD2zvBxtRA7DhEREQtRt92DmJHICIiIrovzoeiRrEpPhOVKjWbihMRERERERFRHRae6LGpNQLWHE9DxzbW8HeyEDsOERERERERETURLDzRY9t3IQ9ZNyoxpYu72FGIiIiIiIiIqAlh4Yke2+pjaXCyMER/P/aYICIiIiIiIqK/sfBEj+XStVIcu1KICZ3dIZfxdCIiIiIiIiKiv2m1UrBnzx74+PjA09MTixYtuuv26upqjB49Gp6enujUqRPS0tIAAGlpaTAyMkJwcDCCg4Px4osv1t3n5MmTCAwMhKenJ15++WUIgqDNp0D1WH0sDQZyKSLDXMWOQkRERERERERNjNYKT2q1GrNmzcLu3buRlJSEDRs2ICkp6Y59VqxYASsrK6SkpGDu3LmYP39+3W1t27ZFYmIiEhMTsWzZsrrtM2fOxA8//IDk5GQkJydjz5492noKVI/iCiV+OZWF4SHOsDLRFzsOERERERERETUxWis8xcbGwtPTEx4eHtDX10dkZCSio6Pv2Cc6OhqTJk0CAIwcORL79u174Aim3NxclJSUIDw8HBKJBB
MnTsT27du19RSoHhvjMlGl0mASm4oTERERERER0T1orfCUnZ0NV9e/p1+5uLggOzv7vvvI5XJYWFigsLAQAJCamoqQkBD07NkThw8frtvfxcXlgce8bfny5VAoFFAoFMjPz2/U50ZAjVqDtcfTEe5hjXaO5mLHISIiIiIiIqImSC52gHtxdHRERkYGbGxscPLkSQwbNgznz59/qGPMmDEDM2bMAAAoFAptxGzR/ryQh+ziSrzztJ/YUYiIiIiIiIioidLaiCdnZ2dkZmbWXc/KyoKzs/N996mpqcHNmzdhY2MDAwMD2NjYAAA6dOiAtm3b4vLly3B2dkZWVtYDj0nad+1mFZb+lQJnSyP0a2cvdhwiIiIiIiIiaqK0VngKCwtDcnIyUlNToVQqERUVhYiIiDv2iYiIwJo1awAAW7ZsQZ8+fSCRSJCfnw+1Wg0AuHr1KpKTk+Hh4QFHR0eYm5vjxIkTEAQBa9euxdChQ7X1FOh/VKnUWLo/Gb0/O4DLeWWYP9gXcplWF0YkIiIiIiIiIh2mtal2crkcS5cuxcCBA6FWqzF16lT4+/tj4cKFUCgUiIiIwLRp0zBhwgR4enrC2toaUVFRAIBDhw5h4cKF0NPTg1QqxbJly2BtbQ0A+PbbbzF58mRUVlZi8ODBGDx4sLaeAt0iCAJ+P5+Hj3YlIbOoEoP8W+Gtp9rBzcZY7GhERERERERE1IRJhActI9dMKBQKxMfHix1DJyXnleL9X5NwJKUA3g6mePcZf3T1tBU7FhERERERERGJrCH1libZXJzuVlyhxK+nc2BragAbUwPYmOrD1tQA5oZySCSSRn+8mxUqfPnnZfx0Ih2mBnK8H+GPcZ3cOLWOiIiIiIiIiBqMhScdcbWgHO9E372yn55MAhuTvwtR1ib6MDWQw8RADlMD2a1Led02EwM59GQSVCjVqFSqUa6sQUW1GhXKGpQray/Lqmqw43QOblaqMLaTG+b194G1ib4Iz5qIiIiIiIiIdBkLTzqivYslYhf0RWGZEoVlShSUVaOgrBqF5UoUlNZeFpZV40p+Gcqra1BWXQOV+uFnUcqkEhjry9DexRJvPdUOfk7mWng2RERERERERNQSsPCkI2RSCezNDGFvZtjg+1TXqFFera4rRP2zIGWiL4OxgRwm+jIY6ctgoi+HsYEM+jKpVqbuEREREREREVHLw8JTM2Ygl8FALuM0OSIiIiIiIiISBTtFExERERERERGRVrDwREREREREREREWsHCExERERERERERaQULT0REREREREREpBUsPBERERERERERkVaw8ERERERERERERFrBwhMREREREREREWkFC09ERERERERERKQVLDwREREREREREZFWSARBEMQOoW22trZwd3cXO0ajyM/Ph52dndgxiB4Jz1/SZTx/SdfxHCZdxvOXdB3PYdJlDzp/09LSUFBQ8MD7t4jCU3OiUCgQHx8vdgyiR8Lzl3QZz1/SdTyHSZfx/CVdx3OYdNnjnr+cakdERERERERERFrBwhMREREREREREWkFC086ZsaMGWJHIHpkPH9Jl/H8JV3Hc5h0Gc9f0nU8h0mXPe75yx5PRERERERERESkFRzxREREREREREREWsHCExERERERERERaQULTzpiz5498PHxgaenJxYtWiR2HKIHyszMRO/eveHn5wd/f3989dVXAICioiL0798fXl5e6N+/P27cuCFyUqIHU6vVCAkJwdNPPw0ASE1NRadOneDp6YnRo0dDqVSKnJDo3oqLizFy5Ej4+vqiXbt2OH78OF+DSad8+eWX8Pf3R0BAAMaMGYOqqiq+BlOTNXXqVNjb2yMgIKBu2/1ecwVBwMsvvwxPT08EBQUhISFBrNhEde51Dr/22mvw9fVFUFAQhg8fjuLi4rrbPv74Y3h6esLHxwe///57vcdn4UkHqNVqzJo1C7t370ZSUhI2bNiApKQksWMR3ZdcLsfnn3+OpKQknDhxAt988w
2SkpKwaNEi9O3bF8nJyejbty+LqNTkffXVV2jXrl3d9fnz52Pu3LlISUmBlZUVVqxYIWI6ovubM2cOBg0ahIsXL+L06dNo164dX4NJZ2RnZ2PJkiWIj4/HuXPnoFarERUVxddgarImT56MPXv23LHtfq+5u3fvRnJyMpKTk7F8+XLMnDlTjMhEd7jXOdy/f3+cO3cOZ86cgbe3Nz7++GMAQFJSEqKionD+/Hns2bMHL730EtRq9QOPz8KTDoiNjYWnpyc8PDygr6+PyMhIREdHix2L6L4cHR0RGhoKADAzM0O7du2QnZ2N6OhoTJo0CQAwadIkbN++XcSURA+WlZWF3377DdOnTwdQ+w3l/v37MXLkSAA8h6npunnzJg4dOoRp06YBAPT19WFpacnXYNIpNTU1qKysRE1NDSoqKuDo6MjXYGqyevToAWtr6zu23e81Nzo6GhMnToREIkF4eDiKi4uRm5v7pCMT3eFe5/CAAQMgl8sBAOHh4cjKygJQew5HRkbCwMAAbdq0gaenJ2JjYx94fBaedEB2djZcXV3rrru4uCA7O1vEREQNl5aWhlOnTqFTp07Iy8uDo6MjAKBVq1bIy8sTOR3R/b3yyiv49NNPIZXW/ldZWFgIS0vLuv+A+VpMTVVqairs7OwwZcoUhISEYPr06SgvL+drMOkMZ2dn/Otf/4KbmxscHR1hYWGBDh068DWYdMr9XnP52Y500cqVKzF48GAAj3YOs/BERFpTVlaGZ599FosXL4a5ufkdt0kkEkgkEpGSET3Yzp07YW9vjw4dOogdheih1dTUICEhATNnzsSpU6dgYmJy17Q6vgZTU3bjxg1ER0cjNTUVOTk5KC8vv2sKCJEu4Wsu6bKPPvoIcrkc48aNe+RjsPCkA5ydnZGZmVl3PSsrC87OziImIqqfSqXCs88+i3HjxmHEiBEAAAcHh7qhxLm5ubC3txczItF9HT16FDt27IC7uzsiIyOxf/9+zJkzB8XFxaipqQHA12JqulxcXODi4oJOnToBAEaOHImEhAS+BpPO+PPPP9GmTRvY2dlBT08PI0aMwNGjR/kaTDrlfq+5/GxHumT16tXYuXMn1q1bV1c8fZRzmIUnHRAWFobk5GSkpqZCqVQiKioKERERYsciui9BEDBt2jS0a9cO8+bNq9seERGBNWvWAADWrFmDoUOHihWR6IE+/vhjZGVlIS0tDVFRUejTpw/WrVuH3r17Y8uWLQB4DlPT1apVK7i6uuLSpUsAgH379sHPz4+vwaQz3NzccOLECVRUVEAQhLpzmK/BpEvu95obERGBtWvXQhAEnDhxAhYWFnVT8oiakj179uDTTz/Fjh07YGxsXLc9IiICUVFRqK6uRmpqKpKTk9GxY8cHHksiCIKg7cD0+Hbt2oVXXnkFarUaU6dOxYIFC8SORHRfR44cQffu3REYGFjXH+c///kPOnXqhFGjRiEjIwOtW7fGpk2b7mpiR9TUHDhwAJ999hl27tyJq1evIjIyEkVFRQgJCcHPP/8MAwMDsSMS3SUxMRHTp0+HUqmEh4cHVq1aBY1Gw9dg0hnvvvsuNm7cCLlcjpCQEPz444/Izs7mazA1SWPGjMGBAwdQUFAABwcHvP/++xg2bNg9X3MFQcDs2bOxZ88eGBsbY9WqVVAoFGI/BWrh7nUOf/zxx6iuroaNjQ2A2gbjy5YtA1A7/W7lypWQy+VYvHhxXf+n+2HhiYiIiIiIiIiItIJT7YiIiIiIiIiISCtYeCIiIiIiIiIiIq1g4YmIiIiIiIiIiLSChSciIiIiIiIiItIKFp6IiIiIiIiIiEgrWHgiIiIirfjoo4/g7++PoKAgBAcHIyYmBgCwePFiVFRUPPTxVq9ejZycnLrr06dPR1JSUqNk7dKly0Ptf+DAATz99NON8tj3Ul1djX79+iE4OBgbN26847b//T24u7ujoKBAa1m0KScnByNHjmzw/mq1Gh06dMChQ4fqtg0YMACbN2
/WRjwiIiJqBHKxAxAREVHzc/z4cezcuRMJCQkwMDBAQUEBlEolgNrC0/jx42FsbNzg46nVaqxevRoBAQFwcnICAPz444+NlvfYsWONdqzGcOrUKQBAYmLiXbf97++hKaqpqYFcXv/bTCcnJ2zZsqXBx5XJZPj222/x/PPP4+TJk9iyZQukUimee+65x4lLREREWsQRT0RERNTocnNzYWtrCwMDAwCAra0tnJycsGTJEuTk5KB3797o3bs3AGDmzJlQKBTw9/fHu+++W3cMd3d3zJ8/H6GhodiwYQPi4+Mxbtw4BAcHo7KyEr169UJ8fDwAwNTUFAsWLED79u0RHh6OvLw8AMCVK1cQHh6OwMBAvP322zA1Nb1n3tvbDxw4gF69emHkyJHw9fXFuHHjIAgCAGDPnj3w9fVFaGgotm3bVnff8vJyTJ06FR07dkRISAiio6MBAHPmzMEHH3wAAPj999/Ro0cPaDSaOx63qKgIw4YNQ1BQEMLDw3HmzBlcv34d48ePR1xcHIKDg3HlypW6/bds2XLX7wEAvv76a4SGhiIwMBAXL158YK5/EgQBs2fPho+PD/r164ennnqqrhD0z5FU8fHx6NWr1wOPu3r1akRERKBPnz7o27cvJk6ciO3bt9c91rhx4+7KkJaWhoCAgLr7jxgxAoMGDYKXlxdef/31e/5dderUCZ07d8Z7772Ht956C0uXLr3nfkRERNRECERERESNrLS0VGjfvr3g5eUlzJw5Uzhw4EDdba1btxby8/PrrhcWFgqCIAg1NTVCz549hdOnT9ft98knn9Tt17NnTyEuLu6e1wEIO3bsEARBEF577TXhww8/FARBEIYMGSKsX79eEARB+O677wQTE5N75r29/a+//hLMzc2FzMxMQa1WC+Hh4cLhw4eFyspKwcXFRbh8+bKg0WiE5557ThgyZIggCILw5ptvCj/99JMgCIJw48YNwcvLSygrKxPKy8sFPz8/Yf/+/YK3t7eQkpJy1+POnj1beO+99wRBEIR9+/YJ7du3r8tx+/j/639/D61btxaWLFkiCIIgfPPNN8K0adMemOuftm7dKvTr10+oqakRsrOzBQsLC2Hz5s11x7399xQXFyf07NnzgcddtWqV4OzsXPf3eeDAAWHo0KGCIAhCcXGx4O7uLqhUqjsePzU1VfD39xcEQRBWrVoltGnTRiguLhYqKysFNzc3ISMj456/g8LCQsHY2Fh466237nk7ERERNR0c8URERESNztTUFCdPnsTy5cthZ2eH0aNHY/Xq1ffcd9OmTQgNDUVISAjOnz9/R9+m0aNHN+jx9PX163oudejQAWlpaQBqp/zdnoY1duzYBh2rY8eOcHFxgVQqRXBwMNLS0nDx4kW0adMGXl5ekEgkGD9+fN3+e/fuxaJFixAcHIxevXqhqqoKGRkZMDY2xg8//ID+/ftj9uzZaNu27V2PdeTIEUyYMAEA0KdPHxQWFqKkpKRBOf9pxIgRdz33++X6p0OHDmHMmDGQyWRwcnJCnz596n2sBx23f//+sLa2BgD07NkTycnJyM/Px4YNG/Dss8/WO/2ub9++sLCwgKGhIfz8/JCenn7P/Q4dOgQLCwucO3eu3rxEREQkLvZ4IiIiIq2QyWTo1asXevXqhcDAQKxZswaTJ0++Y5/U1FR89tlniIuLg5WVFSZPnoyqqqq6201MTBr0WHp6epBIJHWPW1NT88i5b08PbOixBEHA1q1b4ePjc9dtZ8+ehY2NzR3NwLXhduZ/5n1QroaQy+V1UwP/+Xdyv+PGxMTc9fc1ceJE/Pzzz4iKisKqVasa/Dz+97n8U3l5OV5//XXs378fU6ZMwa5du/DUU0891HMjIiKiJ4cjnoiIiKjRXbp0CcnJyXXXExMT0bp1awCAmZkZSktLAQAlJSUwMTGBhYUF8vLysHv37vse85/3a6jw8HBs3boVABAVFfWwT6OOr68v0tLS6votbdiwoe62gQMH4uuvv6
7rBXW7MXh6ejo+//xznDp1Crt3765b1e+funfvjnXr1gGo7S9la2sLc3PzB2Zp6O/hfrn+qUePHti4cSPUajVyc3Px119/1d3m7u6OkydPAkDd77Chx71t8uTJWLx4MQDAz8+v3swN8cEHH2DUqFHw9fXFt99+i7lz595RGCMiIqKmhYUnIiIianRlZWWYNGkS/Pz8EBQUhKSkJLz33nsAgBkzZmDQoEHo3bs32rdvj5CQEPj6+mLs2LHo2rXrfY85efJkvPjii3c01a7P4sWL8cUXXyAoKAgpKSmwsLB4pOdjaGiI5cuXY8iQIQgNDYW9vX3dbe+88w5UKhWCgoLg7++Pd955B4IgYNq0afjss8/g5OSEFStWYPr06XcVSN577z2cPHkSQUFBeOONN7BmzZp6szT093CvXP9r+PDh8PLygp+fHyZOnIjOnTvX3fbuu+9izpw5UCgUkMlkD3Xc2xwcHNCuXTtMmTKl3ufVEOfPn8cvv/yCBQsWAABCQkIwcOBAfPLJJ41yfCIiImp8EuH211VEREREzUxFRQWMjIwgkUgQFRWFDRs23HN1N6o1efJkPP300xg5cmSjHK+iogKBgYFISEh45KIfERER6Tb2eCIiIqJm6+TJk5g9ezYEQYClpSVWrlwpdqQW488//8S0adMwd+5cFp2IiIhaMI54IiIiIiIiIiIirWCPJyIiIiIiIiIi0goWnoiIiIiIiIiISCtYeCIiIiIiIiIiIq1g4YmIiIiIiIiIiLSChSciIiIiIiIiItKK/wcXVvsFS0BwogAAAABJRU5ErkJggg==", "text/plain": [ - "
" + "([array([0.02910451])], [array([[7, 0]], dtype=int64)])" ] }, + "execution_count": 6, "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index of the 20-th query best matches : [[195 26]]\n" - ] + "output_type": "execute_result" } ], "source": [ - "from aeon.similarity_search import SeriesSearch\n", - "\n", - "query_length = 35\n", - "estimator = SeriesSearch(distance=\"euclidean\").fit(X_train) # X_test is a 3D array\n", - "mp, ip = estimator.predict(X_test, query_length) # X_test is a 2D array\n", - "plot_matrix_profile(X_test, mp, 0)\n", - "print(f\"Index of the 20-th query best matches : {ip[20]}\")" - ] - }, - { - "cell_type": "markdown", - "id": "0dca5122", - "metadata": {}, - "source": [ - "Notice that we find the same best match for the 20-ith query, which was the query that we used for `QuerySearch` !\n", - "\n", - "`SeriesSearch` returns two lists, `mp` and `ip`, which respectively contain the distances to the best matches of all queries of size `query_length` in `X_test` (the `i-th` query being `X_test[:, i : i + query_length]`) and the indexes of these best matches in `X_train` in the `(ix_case, ix_timepoint)` format, such as `X_train[ix_case, :, ix_timepoint : ix_timepoint + query_length]` will be the matching subsquence.\n", + "from aeon.similarity_search.series import StompMotif\n", "\n", - "Most of the options (`k`, `threshold`, `inverse_distance`, etc.) from `QuerySearch` are also available for `SeriesSearch`." 
+ "motif = StompMotif(length=length).fit(series_fit)\n", + "motif.predict(series_predict)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ff23faf5-2941-441a-8c4c-0cf66eaca121", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -521,7 +342,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.12.0" } }, "nbformat": 4, From f51d66a6efb78c7bc1b261af6fdc1f1b5e13128e Mon Sep 17 00:00:00 2001 From: baraline Date: Sun, 19 Jan 2025 13:34:51 +0100 Subject: [PATCH 16/18] Update documentation and add default test params --- aeon/similarity_search/series/_commons.py | 26 ++- .../similarity_search/series/motifs/_stomp.py | 49 +++++- .../series/neighbors/_dummy.py | 26 +++ .../series/neighbors/_mass.py | 26 +++ .../similarity_search/similarity_search.ipynb | 157 +++++++++++++++++- 5 files changed, 261 insertions(+), 23 deletions(-) diff --git a/aeon/similarity_search/series/_commons.py b/aeon/similarity_search/series/_commons.py index 4e62e5aacb..8b309bb6b2 100644 --- a/aeon/similarity_search/series/_commons.py +++ b/aeon/similarity_search/series/_commons.py @@ -146,20 +146,32 @@ def _extract_top_k_from_dist_profile( # Could add aggregation function as parameter instead of just max -def _extract_top_k_motifs(MP, IP, k): +def _extract_top_k_motifs(MP, IP, k, allow_trivial_matches, exclusion_size): criterion = np.zeros(len(MP)) for i in range(len(MP)): - criterion[i] = max(MP[i]) - idx = np.argsort(criterion) - return [MP[i] for i in idx[:k]], [IP[i] for i in idx[:k]] + if len(MP[i]) > 0: + criterion[i] = max(MP[i]) + else: + criterion[i] = np.inf + idx, _ = _extract_top_k_from_dist_profile( + criterion, k, np.inf, allow_trivial_matches, exclusion_size + ) + return [MP[i] for i in idx], [IP[i] for i in idx] -def _extract_top_r_motifs(MP, IP, k): + +def _extract_top_r_motifs(MP, IP, k, allow_trivial_matches, exclusion_size): criterion = np.zeros(len(MP)) for i in 
range(len(MP)): criterion[i] = len(MP[i]) - idx = np.argsort(criterion)[::-1] - return [MP[i] for i in idx[:k]], [IP[i] for i in idx[:k]] + idx, _ = _extract_top_k_from_dist_profile( + _inverse_distance_profile(criterion), + k, + np.inf, + allow_trivial_matches, + exclusion_size, + ) + return [MP[i] for i in idx], [IP[i] for i in idx] @njit(cache=True, fastmath=True) diff --git a/aeon/similarity_search/series/motifs/_stomp.py b/aeon/similarity_search/series/motifs/_stomp.py index fbd459b890..c912cdfacd 100644 --- a/aeon/similarity_search/series/motifs/_stomp.py +++ b/aeon/similarity_search/series/motifs/_stomp.py @@ -87,7 +87,7 @@ def _fit( y=None, ): if self.normalize: - self.X_means_, X_stds_ = sliding_mean_std_one_series(X, self.length, 1) + self.X_means_, self.X_stds_ = sliding_mean_std_one_series(X, self.length, 1) return self def predict( @@ -164,9 +164,13 @@ def predict( inverse_distance=inverse_distance, ) if motif_extraction_method == "k_motifs": - return _extract_top_k_motifs(MP, IP, k) + return _extract_top_k_motifs( + MP, IP, k, allow_trivial_matches, self.length // exclusion_factor + ) elif motif_extraction_method == "r_motifs": - return _extract_top_r_motifs(MP, IP, k) + return _extract_top_r_motifs( + MP, IP, k, allow_trivial_matches, self.length // exclusion_factor + ) def compute_matrix_profile( self, @@ -225,9 +229,12 @@ def compute_matrix_profile( is_self_mp = False if self.normalize: X_means, X_stds = sliding_mean_std_one_series(X, self.length, 1) - X_dotX = get_ith_products(X, self.X_, self.length, 0) exclusion_size = self.length // exclusion_factor + + if motif_size == np.inf: + motif_size = X.shape[1] - self.length + 1 + if self.normalize: MP, IP = _stomp_normalized( self.X_, @@ -260,6 +267,32 @@ def compute_matrix_profile( ) return MP, IP + @classmethod + def _get_test_params(cls, parameter_set: str = "default"): + """Return testing parameter settings for the estimator. 
+ + Parameters + ---------- + parameter_set : str, default="default" + Name of the set of test parameters to return, for use in tests. If no + special parameters are defined for a value, will return `"default"` set. + There are currently no reserved values for transformers. + + Returns + ------- + params : dict or list of dict, default = {} + Parameters to create testing instances of the class + Each dict are parameters to construct an "interesting" test instance, i.e., + `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. + """ + if parameter_set == "default": + params = {"length": 3} + else: + raise NotImplementedError( + f"The parameter set {parameter_set} is not yet implemented" + ) + return params + @njit(cache=True, fastmath=True) def _stomp_normalized( @@ -330,7 +363,7 @@ def _stomp_normalized( variable size. """ n_queries = X_A.shape[1] - L + 1 - _max_timestamp = X_B.shape[1] - L + _max_timestamp = X_B.shape[1] - L + 1 MP = List() IP = List() @@ -347,7 +380,7 @@ def _stomp_normalized( dist_profile = _inverse_distance_profile(dist_profile) if is_self_mp: - ub = min(i_q + exclusion_size, _max_timestamp) + ub = min(i_q + exclusion_size, _max_timestamp + 1) lb = max(0, i_q - exclusion_size) dist_profile[lb:ub] = np.inf @@ -425,7 +458,7 @@ def _stomp( variable size. 
""" n_queries = X_A.shape[1] - L + 1 - _max_timestamp = X_B.shape[1] - L + _max_timestamp = X_B.shape[1] - L + 1 MP = List() IP = List() @@ -440,7 +473,7 @@ def _stomp( dist_profile = _inverse_distance_profile(dist_profile) if is_self_mp: - ub = min(i_q + exclusion_size, _max_timestamp) + ub = min(i_q + exclusion_size, _max_timestamp + 1) lb = max(0, i_q - exclusion_size) dist_profile[lb:ub] = np.inf diff --git a/aeon/similarity_search/series/neighbors/_dummy.py b/aeon/similarity_search/series/neighbors/_dummy.py index 7b4d4d89da..a3120f98dd 100644 --- a/aeon/similarity_search/series/neighbors/_dummy.py +++ b/aeon/similarity_search/series/neighbors/_dummy.py @@ -132,6 +132,32 @@ def compute_distance_profile(self, X: np.ndarray): X = z_normalise_series_2d(X) return _naive_squared_distance_profile(self.X_subs, X) + @classmethod + def _get_test_params(cls, parameter_set: str = "default"): + """Return testing parameter settings for the estimator. + + Parameters + ---------- + parameter_set : str, default="default" + Name of the set of test parameters to return, for use in tests. If no + special parameters are defined for a value, will return `"default"` set. + There are currently no reserved values for transformers. + + Returns + ------- + params : dict or list of dict, default = {} + Parameters to create testing instances of the class + Each dict are parameters to construct an "interesting" test instance, i.e., + `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. 
+ """ + if parameter_set == "default": + params = {"length": 3} + else: + raise NotImplementedError( + f"The parameter set {parameter_set} is not yet implemented" + ) + return params + @njit(cache=True, fastmath=True, parallel=True) def _naive_squared_distance_profile( diff --git a/aeon/similarity_search/series/neighbors/_mass.py b/aeon/similarity_search/series/neighbors/_mass.py index 9565407fc8..befc8b33fd 100644 --- a/aeon/similarity_search/series/neighbors/_mass.py +++ b/aeon/similarity_search/series/neighbors/_mass.py @@ -165,6 +165,32 @@ def compute_distance_profile(self, X: np.ndarray): return distance_profile + @classmethod + def _get_test_params(cls, parameter_set: str = "default"): + """Return testing parameter settings for the estimator. + + Parameters + ---------- + parameter_set : str, default="default" + Name of the set of test parameters to return, for use in tests. If no + special parameters are defined for a value, will return `"default"` set. + There are currently no reserved values for transformers. + + Returns + ------- + params : dict or list of dict, default = {} + Parameters to create testing instances of the class + Each dict are parameters to construct an "interesting" test instance, i.e., + `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. + """ + if parameter_set == "default": + params = {"length": 3} + else: + raise NotImplementedError( + f"The parameter set {parameter_set} is not yet implemented" + ) + return params + @njit(cache=True, fastmath=True) def _squared_distance_profile(QT, T, Q): diff --git a/examples/similarity_search/similarity_search.ipynb b/examples/similarity_search/similarity_search.ipynb index 803398d551..53e5a9bcc5 100644 --- a/examples/similarity_search/similarity_search.ipynb +++ b/examples/similarity_search/similarity_search.ipynb @@ -275,9 +275,9 @@ "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ - "
" + "
" ] }, "metadata": {}, @@ -288,7 +288,7 @@ "distance_profile = snn.compute_distance_profile(\n", " series_predict[:, starting_timestep_predict : starting_timestep_predict + length],\n", ")\n", - "plt.figure(figsize=(5, 3))\n", + "plt.figure(figsize=(7, 2))\n", "plt.plot(distance_profile)\n", "plt.show()" ] @@ -301,19 +301,104 @@ "### 1.2 Motif search with STOMP" ] }, + { + "attachments": { + "f492cb89-5bf3-4641-8be2-a77805f20b88.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "id": "6aecb58e-9de9-4264-959e-4180ab3fa27a", + "metadata": {}, + "source": [ + "When doing motif search, it's important to define the type of motif you want to extract from a series. We'll use the figure and definitions given by [1] :\n", + "\n", + "![image.png](attachment:f492cb89-5bf3-4641-8be2-a77805f20b88.png)\n", + "\n", + "For now, the `StompMotif` estimators supports only \"Pair motifs\", \"k-Motiflets\", and \"k-motifs\". Note that the naming \"k-motifs\" is a bit confusing, it extract motifs based on a range parameter and not by number of closests neihbors. To choose the type of motifs you want to extract, you will have to use the parameters of the `predict` method :\n", + "\n", + "- for **\"Pair Motifs\"** : This is the default configuration\n", + "\n", + "- for **\"k-Motiflets\"** : ```{\"motif_size\": k}```\n", + "\n", + "- for **\"k-motifs\"** : ```{\"motif_size\": np.inf, \"dist_threshold\": r, \"motif_extraction_method\": \"r_motifs\"}```\n", + "\n", + "These configuration will extract the best motif only, if you want to extract more than one motifs, you can use the `k` parameter to extract the `top-k` motifs. \n", + "\n", + "**The term `k` of `top-k` motifs, while also used in `k-Motiflets`, is not the same. We use `motif_size` as the `k` in `k-Motiflets`. This is to avoid \"extraction the `top-k` `k-motiflets`\", which can lead to confusions. 
Rather, we extract the `top-k` `motif_size-motiflets`**.\n", + "\n", + "The `top-k` using `motif_extraction_method=\"r_motifs\"` will be the motif with the highest cardinality (i.e. the more matches in range `r`), while for `motif_extraction_method=\"k_motifs\"`,which is the default value, the best motifs will be those who minimize the maximum pairwise distance." + ] + }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 11, "id": "ff23faf5-2941-441a-8c4c-0cf66eaca121", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "([array([0.02910451])], [array([[7, 0]], dtype=int64)])" + "([array([2.16605047]),\n", + " array([3.23155459]),\n", + " array([8.15076681]),\n", + " array([8.15076681]),\n", + " array([26.42906254])],\n", + " [array([[13, 30]], dtype=int64),\n", + " array([[31, 13]], dtype=int64),\n", + " array([[108, 77]], dtype=int64),\n", + " array([[ 77, 108]], dtype=int64),\n", + " array([[59, 76]], dtype=int64)])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from aeon.similarity_search.series import StompMotif\n", + "\n", + "motif = StompMotif(length=length, normalize=True).fit(series_fit)\n", + "motif.predict(\n", + " k=5,\n", + " motif_size=1,\n", + " motif_extraction_method=\"k_motifs\",\n", + " allow_trivial_matches=False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "d16036a3-f5b9-41d2-ae23-a1bcf0737c93", + "metadata": {}, + "source": [ + "\n", + "Note that we also support giving another series in `predict`, which will use this series to search for the motifs matching subsequences in the series given during `fit`. For those familiar with the matrix profile notations, this is the case of using `MP(A,B)`, while not using a series in `predict` is doing a self matrix profile `MP(A,A)`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "59117ea7-2cbf-49d6-829a-792805b4aaf7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "([array([0.01197907]),\n", + " array([0.0622802]),\n", + " array([0.14565364]),\n", + " array([0.70546699]),\n", + " array([1.19303001])],\n", + " [array([[83, 78]], dtype=int64),\n", + " array([[50, 49]], dtype=int64),\n", + " array([[32, 30]], dtype=int64),\n", + " array([[9, 4]], dtype=int64),\n", + " array([[101, 95]], dtype=int64)])" ] }, - "execution_count": 6, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -321,9 +406,65 @@ "source": [ "from aeon.similarity_search.series import StompMotif\n", "\n", - "motif = StompMotif(length=length).fit(series_fit)\n", - "motif.predict(series_predict)" + "motif.predict(\n", + " series_predict,\n", + " k=5,\n", + " motif_size=1,\n", + " motif_extraction_method=\"k_motifs\",\n", + " allow_trivial_matches=False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "9190fdf4-db3d-4d51-b2c8-41b88a9f6f74", + "metadata": {}, + "source": [ + "You can also return the matrix profile with the same parameterization as `predict` (minus `motif_extraction_method` parameter) using :" ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "4c36738a-e6a0-4452-aee2-ccbad99d6d8b", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "MP, IP = motif.compute_matrix_profile()\n", + "\n", + "plt.figure(figsize=(7, 2))\n", + "plt.plot([MP[i][0] for i in range(len(MP))])\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "1610adf3-5cb1-466e-9cad-fb248148fd5a", + "metadata": {}, + "source": [ + "## References\n", + "[1] Patrick Schäfer and Ulf Leser. 2022. Motiflets: Simple and Accurate Detection\n", + " of Motifs in Time Series. Proc. VLDB Endow. 16, 4 (December 2022), 725–737." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "989ba9f2-6dd8-4db7-9dfc-783aac5e6fcb", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From 763bdcf2794670ceee7ce375868d66c859ad830e Mon Sep 17 00:00:00 2001 From: baraline Date: Sun, 19 Jan 2025 20:41:47 +0100 Subject: [PATCH 17/18] Fix identifiers and test data shape for all_estimators tests --- aeon/similarity_search/collection/__init__.py | 5 +- aeon/similarity_search/collection/_base.py | 14 +---- .../collection/neighbors/_rp_cosine_lsh.py | 6 ++- aeon/similarity_search/series/_base.py | 9 +--- .../similarity_search/series/motifs/_stomp.py | 1 + .../series/neighbors/_dummy.py | 17 ++++-- .../series/tests/test_commons.py | 4 +- aeon/testing/testing_data.py | 52 +++++-------------- aeon/utils/base/_identifier.py | 2 + aeon/utils/tags/_tags.py | 2 +- .../similarity_search/similarity_search.ipynb | 12 ++--- 11 files changed, 46 insertions(+), 78 deletions(-) diff --git a/aeon/similarity_search/collection/__init__.py b/aeon/similarity_search/collection/__init__.py index ab3a546193..3a08ed22d6 100644 --- a/aeon/similarity_search/collection/__init__.py +++ b/aeon/similarity_search/collection/__init__.py @@ -1,5 +1,8 @@ """Similarity search for time series collection.""" -__all__ = ["BaseCollectionSimilaritySearch"] +__all__ = ["BaseCollectionSimilaritySearch", "RandomProjectionIndexANN"] from aeon.similarity_search.collection._base 
import BaseCollectionSimilaritySearch +from aeon.similarity_search.collection.neighbors._rp_cosine_lsh import ( + RandomProjectionIndexANN, +) diff --git a/aeon/similarity_search/collection/_base.py b/aeon/similarity_search/collection/_base.py index 618a531081..cbbf8de1e9 100644 --- a/aeon/similarity_search/collection/_base.py +++ b/aeon/similarity_search/collection/_base.py @@ -9,7 +9,6 @@ from typing import Union, final import numpy as np -from numba import get_num_threads, set_num_threads from aeon.base import BaseCollectionEstimator from aeon.similarity_search._base import BaseSimilaritySearch @@ -22,16 +21,9 @@ class BaseCollectionSimilaritySearch(BaseCollectionEstimator, BaseSimilaritySear _tags = { "input_data_type": "Collection", "capability:multivariate": True, - "capability:unequal_length": True, - "capability:multithreading": True, - "X_inner_type": ["np-list", "numpy3D"], + "X_inner_type": ["numpy3D"], } - @abstractmethod - def __init__(self, n_jobs=1): - self.n_jobs = n_jobs - super().__init__() - @final def fit( self, @@ -63,11 +55,7 @@ def fit( # Store minimum number of n_timepoints for unequal length collections self.n_channels_ = X[0].shape[0] self.n_cases_ = len(X) - prev_threads = get_num_threads() - set_num_threads(self._n_jobs) self._fit(X, y=y) - set_num_threads(prev_threads) - self.is_fitted = True return self diff --git a/aeon/similarity_search/collection/neighbors/_rp_cosine_lsh.py b/aeon/similarity_search/collection/neighbors/_rp_cosine_lsh.py index a6f3097f78..1fad3f08f7 100644 --- a/aeon/similarity_search/collection/neighbors/_rp_cosine_lsh.py +++ b/aeon/similarity_search/collection/neighbors/_rp_cosine_lsh.py @@ -1,7 +1,7 @@ """Random projection LSH index.""" import numpy as np -from numba import njit, prange +from numba import get_num_threads, njit, prange, set_num_threads from aeon.similarity_search.collection._base import BaseCollectionSimilaritySearch from aeon.utils.numba.general import z_normalise_series_2d, z_normalise_series_3d 
@@ -134,6 +134,8 @@ def _fit(self, X, y=None): self """ + prev_threads = get_num_threads() + set_num_threads(self._n_jobs) rng = np.random.default_rng(self.random_state) if self.normalize: X = z_normalise_series_3d(X) @@ -171,7 +173,7 @@ def _fit(self, X, y=None): self.bool_hashes_value_list_ = np.asarray(list(self.dict_bool_hashes_.values())) self.bool_hashes_key_list_ = np.asarray(list(self.dict_bool_hashes_.keys())) - + set_num_threads(prev_threads) return self def _get_bucket_content(self, key): diff --git a/aeon/similarity_search/series/_base.py b/aeon/similarity_search/series/_base.py index 2e1b6d40e0..6ee1f27270 100644 --- a/aeon/similarity_search/series/_base.py +++ b/aeon/similarity_search/series/_base.py @@ -4,7 +4,6 @@ from typing import Union, final import numpy as np -from numba import get_num_threads, set_num_threads from aeon.base import BaseSeriesEstimator from aeon.similarity_search._base import BaseSimilaritySearch @@ -20,8 +19,7 @@ class BaseSeriesSimilaritySearch(BaseSeriesEstimator, BaseSimilaritySearch): } @abstractmethod - def __init__(self, axis=1, n_jobs=1): - self.n_jobs = n_jobs + def __init__(self, axis=1): super().__init__(axis=axis) @final @@ -56,12 +54,7 @@ def fit( self.n_channels_ = X.shape[0] self.n_timepoints_ = X.shape[1] self.X_ = X - - prev_threads = get_num_threads() - set_num_threads(self._n_jobs) self._fit(X, y=y) - set_num_threads(prev_threads) - self.is_fitted = True return self diff --git a/aeon/similarity_search/series/motifs/_stomp.py b/aeon/similarity_search/series/motifs/_stomp.py index c912cdfacd..43bc76f049 100644 --- a/aeon/similarity_search/series/motifs/_stomp.py +++ b/aeon/similarity_search/series/motifs/_stomp.py @@ -233,6 +233,7 @@ def compute_matrix_profile( exclusion_size = self.length // exclusion_factor if motif_size == np.inf: + # convert inf to a finite value here, as numba does not seem to support the == np.inf comparison
motif_size = X.shape[1] - self.length + 1 if self.normalize: diff --git a/aeon/similarity_search/series/neighbors/_dummy.py b/aeon/similarity_search/series/neighbors/_dummy.py index a3120f98dd..12bb7a1035 100644 --- a/aeon/similarity_search/series/neighbors/_dummy.py +++ b/aeon/similarity_search/series/neighbors/_dummy.py @@ -6,7 +6,7 @@ __all__ = ["DummySNN"] import numpy as np -from numba import njit, prange +from numba import get_num_threads, njit, prange, set_num_threads from aeon.similarity_search.series._base import BaseSeriesSimilaritySearch from aeon.similarity_search.series._commons import ( @@ -23,6 +23,8 @@ class DummySNN(BaseSeriesSimilaritySearch): """Estimator to compute the on profile and distance profile using brute force.""" + _tags = {"capability:multithreading": True} + def __init__( self, length: int, @@ -31,16 +33,21 @@ def __init__( ): self.length = length self.normalize = normalize - super().__init__(n_jobs=n_jobs) + self.n_jobs = n_jobs + super().__init__() def _fit( self, X: np.ndarray, y=None, ): + prev_threads = get_num_threads() + set_num_threads(self._n_jobs) + self.X_subs = get_all_subsequences(self.X_, self.length, 1) if self.normalize: self.X_subs = z_normalise_series_3d(self.X_subs) + set_num_threads(prev_threads) return self def predict( @@ -128,9 +135,13 @@ def compute_distance_profile(self, X: np.ndarray): length of X_. 
""" + prev_threads = get_num_threads() + set_num_threads(self._n_jobs) if self.normalize: X = z_normalise_series_2d(X) - return _naive_squared_distance_profile(self.X_subs, X) + distance_profile = _naive_squared_distance_profile(self.X_subs, X) + set_num_threads(prev_threads) + return distance_profile @classmethod def _get_test_params(cls, parameter_set: str = "default"): diff --git a/aeon/similarity_search/series/tests/test_commons.py b/aeon/similarity_search/series/tests/test_commons.py index 774eee8dee..36e8b6babc 100644 --- a/aeon/similarity_search/series/tests/test_commons.py +++ b/aeon/similarity_search/series/tests/test_commons.py @@ -20,8 +20,8 @@ make_example_2d_numpy_series, ) -K_VALUES = [1, 3, np.inf] -THRESHOLDS = [np.inf, 0.7] +K_VALUES = [1, 3, 5] +THRESHOLDS = [np.inf, 1.5] NN_MATCHES = [False, True] EXCLUSION_SIZE = [3, 5] diff --git a/aeon/testing/testing_data.py b/aeon/testing/testing_data.py index eb134cddda..e6730c9958 100644 --- a/aeon/testing/testing_data.py +++ b/aeon/testing/testing_data.py @@ -10,7 +10,8 @@ from aeon.forecasting import BaseForecaster from aeon.regression import BaseRegressor from aeon.segmentation import BaseSegmenter -from aeon.similarity_search import BaseSimilaritySearch +from aeon.similarity_search.collection import BaseCollectionSimilaritySearch +from aeon.similarity_search.series import BaseSeriesSimilaritySearch from aeon.testing.data_generation import ( make_example_1d_numpy, make_example_2d_dataframe_collection, @@ -219,7 +220,7 @@ }, } -EQUAL_LENGTH_UNIVARIATE_SIMILARITY_SEARCH = { +EQUAL_LENGTH_UNIVARIATE_COLLETION_SIMILARITY_SEARCH = { "numpy3D": { "train": ( make_example_3d_numpy( @@ -401,7 +402,7 @@ }, } -EQUAL_LENGTH_MULTIVARIATE_SIMILARITY_SEARCH = { +EQUAL_LENGTH_MULTIVARIATE_COLLETION_SIMILARITY_SEARCH = { "numpy3D": { "train": ( make_example_3d_numpy( @@ -553,7 +554,7 @@ }, } -UNEQUAL_LENGTH_UNIVARIATE_SIMILARITY_SEARCH = { +UNEQUAL_LENGTH_UNIVARIATE_COLLETION_SIMILARITY_SEARCH = { "np-list": { "train": 
( make_example_3d_numpy_list( @@ -685,30 +686,6 @@ }, } -UNEQUAL_LENGTH_MULTIVARIATE_SIMILARITY_SEARCH = { - "np-list": { - "train": ( - make_example_3d_numpy_list( - n_cases=10, - n_channels=2, - min_n_timepoints=10, - max_n_timepoints=20, - random_state=data_rng.randint(np.iinfo(np.int32).max), - return_y=False, - ), - None, - ), - "test": ( - make_example_2d_numpy_series( - n_timepoints=10, - n_channels=2, - random_state=data_rng.randint(np.iinfo(np.int32).max), - ), - None, - ), - }, -} - X_classification_missing_train, y_classification_missing_train = make_example_3d_numpy( n_cases=10, n_channels=1, @@ -828,7 +805,7 @@ FULL_TEST_DATA_DICT.update( { f"EqualLengthUnivariate-SimilaritySearch-{k}": v - for k, v in EQUAL_LENGTH_UNIVARIATE_SIMILARITY_SEARCH.items() + for k, v in EQUAL_LENGTH_UNIVARIATE_COLLETION_SIMILARITY_SEARCH.items() } ) FULL_TEST_DATA_DICT.update( @@ -846,7 +823,7 @@ FULL_TEST_DATA_DICT.update( { f"EqualLengthMultivariate-SimilaritySearch-{k}": v - for k, v in EQUAL_LENGTH_MULTIVARIATE_SIMILARITY_SEARCH.items() + for k, v in EQUAL_LENGTH_MULTIVARIATE_COLLETION_SIMILARITY_SEARCH.items() } ) FULL_TEST_DATA_DICT.update( @@ -863,8 +840,8 @@ ) FULL_TEST_DATA_DICT.update( { - f"UnequalLengthUnivariate-SimilaritySearch-{k}": v - for k, v in UNEQUAL_LENGTH_UNIVARIATE_SIMILARITY_SEARCH.items() + f"UnequalLengthUnivariate-CollectionSimilaritySearch-{k}": v + for k, v in UNEQUAL_LENGTH_UNIVARIATE_COLLETION_SIMILARITY_SEARCH.items() } ) FULL_TEST_DATA_DICT.update( @@ -879,12 +856,6 @@ for k, v in UNEQUAL_LENGTH_MULTIVARIATE_REGRESSION.items() } ) -FULL_TEST_DATA_DICT.update( - { - f"UnequalLengthMultivariate-SimilaritySearch-{k}": v - for k, v in UNEQUAL_LENGTH_MULTIVARIATE_SIMILARITY_SEARCH.items() - } -) FULL_TEST_DATA_DICT.update( { f"MissingValues-Classification-{k}": v @@ -1017,14 +988,15 @@ def _get_task_for_estimator(estimator): # collection data with continuous target labels elif isinstance(estimator, BaseRegressor): data_label = "Regression" - 
elif isinstance(estimator, BaseSimilaritySearch): - data_label = "SimilaritySearch" + elif isinstance(estimator, BaseCollectionSimilaritySearch): + data_label = "CollectionSimilaritySearch" # series data with no secondary input elif ( isinstance(estimator, BaseAnomalyDetector) or isinstance(estimator, BaseSegmenter) or isinstance(estimator, BaseSeriesTransformer) or isinstance(estimator, BaseForecaster) + or isinstance(estimator, BaseSeriesSimilaritySearch) ): data_label = "None" else: diff --git a/aeon/utils/base/_identifier.py b/aeon/utils/base/_identifier.py index cf2722cfcb..03e8d8beaf 100644 --- a/aeon/utils/base/_identifier.py +++ b/aeon/utils/base/_identifier.py @@ -55,6 +55,8 @@ def get_identifier(estimator): identifiers.remove("collection-estimator") if len(identifiers) > 1 and "transformer" in identifiers: identifiers.remove("transformer") + if len(identifiers) > 1 and "similarity-search" in identifiers: + identifiers.remove("similarity-search") if len(identifiers) > 1: TypeError( diff --git a/aeon/utils/tags/_tags.py b/aeon/utils/tags/_tags.py index e1bacdd5ad..d85ba87caa 100644 --- a/aeon/utils/tags/_tags.py +++ b/aeon/utils/tags/_tags.py @@ -138,7 +138,7 @@ class : identifier for the base class of objects this tag applies to "point belongs to.", }, "requires_y": { - "class": ["transformer", "anomaly-detector", "segmenter"], + "class": ["transformer", "anomaly-detector", "segmenter", "similarity-search"], "type": "bool", "description": "Does this estimator require y to be passed in its methods?", }, diff --git a/examples/similarity_search/similarity_search.ipynb b/examples/similarity_search/similarity_search.ipynb index 53e5a9bcc5..91024292ef 100644 --- a/examples/similarity_search/similarity_search.ipynb +++ b/examples/similarity_search/similarity_search.ipynb @@ -332,7 +332,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, "id": "ff23faf5-2941-441a-8c4c-0cf66eaca121", "metadata": {}, "outputs": [ @@ -351,7 +351,7 @@ " 
array([[59, 76]], dtype=int64)])" ] }, - "execution_count": 11, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -363,8 +363,6 @@ "motif.predict(\n", " k=5,\n", " motif_size=1,\n", - " motif_extraction_method=\"k_motifs\",\n", - " allow_trivial_matches=False,\n", ")" ] }, @@ -379,7 +377,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 7, "id": "59117ea7-2cbf-49d6-829a-792805b4aaf7", "metadata": {}, "outputs": [ @@ -398,7 +396,7 @@ " array([[101, 95]], dtype=int64)])" ] }, - "execution_count": 12, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -410,8 +408,6 @@ " series_predict,\n", " k=5,\n", " motif_size=1,\n", - " motif_extraction_method=\"k_motifs\",\n", - " allow_trivial_matches=False,\n", ")" ] }, From 85c71741af20e7587971b4c7b8a4a06939474a24 Mon Sep 17 00:00:00 2001 From: baraline Date: Mon, 20 Jan 2025 08:50:40 +0100 Subject: [PATCH 18/18] Fix missing params --- .../collection/neighbors/_rp_cosine_lsh.py | 3 ++- .../collection/tests/test_base.py | 16 ++++------------ .../series/tests/test_commons.py | 4 ++-- 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/aeon/similarity_search/collection/neighbors/_rp_cosine_lsh.py b/aeon/similarity_search/collection/neighbors/_rp_cosine_lsh.py index 1fad3f08f7..61142d6f83 100644 --- a/aeon/similarity_search/collection/neighbors/_rp_cosine_lsh.py +++ b/aeon/similarity_search/collection/neighbors/_rp_cosine_lsh.py @@ -116,7 +116,8 @@ def __init__( self.use_discrete_vectors = use_discrete_vectors self.random_state = random_state self.normalize = normalize - super().__init__(n_jobs=n_jobs) + self.n_jobs = n_jobs + super().__init__() def _fit(self, X, y=None): """ diff --git a/aeon/similarity_search/collection/tests/test_base.py b/aeon/similarity_search/collection/tests/test_base.py index 9180a071cf..c1efaa30f0 100644 --- a/aeon/similarity_search/collection/tests/test_base.py +++ 
b/aeon/similarity_search/collection/tests/test_base.py @@ -11,7 +11,6 @@ make_example_1d_numpy, make_example_2d_numpy_series, make_example_3d_numpy, - make_example_3d_numpy_list, ) @@ -21,8 +20,6 @@ def test_input_shape_fit_predict_collection(): # dummy data to pass to fit when testing predict/predict_proba X_3D_uni = make_example_3d_numpy(n_channels=1, return_y=False) X_3D_multi = make_example_3d_numpy(n_channels=2, return_y=False) - X_3D_uni_list = make_example_3d_numpy_list(n_channels=1, return_y=False) - X_3D_multi_list = make_example_3d_numpy_list(n_channels=2, return_y=False) X_2D_uni = make_example_2d_numpy_series(n_channels=1) X_2D_multi = make_example_2d_numpy_series(n_channels=2) X_1D = make_example_1d_numpy() @@ -31,8 +28,6 @@ def test_input_shape_fit_predict_collection(): valid_inputs_fit = [ X_3D_uni, X_3D_multi, - X_3D_multi_list, - X_3D_uni_list, X_2D_uni, X_2D_multi, ] @@ -53,10 +48,7 @@ def test_input_shape_fit_predict_collection(): estimator_uni.predict(X_2D_multi) with pytest.raises(ValueError): estimator_multi.predict(X_2D_uni) - - for _input in [X_3D_uni, X_3D_uni_list]: - with pytest.raises(TypeError): - estimator_uni.predict(_input) - for _input in [X_3D_multi, X_3D_multi_list]: - with pytest.raises(TypeError): - estimator_multi.predict(_input) + with pytest.raises(TypeError): + estimator_uni.predict(X_3D_uni) + with pytest.raises(TypeError): + estimator_multi.predict(X_3D_multi) diff --git a/aeon/similarity_search/series/tests/test_commons.py b/aeon/similarity_search/series/tests/test_commons.py index 36e8b6babc..6f2c816193 100644 --- a/aeon/similarity_search/series/tests/test_commons.py +++ b/aeon/similarity_search/series/tests/test_commons.py @@ -109,7 +109,7 @@ def test__extract_top_k_motifs(): [0, 7], ] ) - MP_k, IP_k = _extract_top_k_motifs(MP, IP, 2) + MP_k, IP_k = _extract_top_k_motifs(MP, IP, 2, True, 0) assert_(len(MP_k) == 2) assert_(MP_k[0] == [0.6, 0.7]) assert_(IP_k[0] == [0, 7]) @@ -135,7 +135,7 @@ def 
test__extract_top_r_motifs(): [0, 7], ] ) - MP_k, IP_k = _extract_top_r_motifs(MP, IP, 2) + MP_k, IP_k = _extract_top_r_motifs(MP, IP, 2, True, 0) assert_(len(MP_k) == 2) assert_(MP_k[0] == [1.0, 1.5, 2.0, 1.5]) assert_(IP_k[0] == [1, 2, 3, 4])