From fbb1866c6c35e7bb6c0e168e727bc28fb4b560d1 Mon Sep 17 00:00:00 2001 From: Aryan Pola <98093778+aryanpola@users.noreply.github.com> Date: Thu, 26 Sep 2024 10:38:09 +0530 Subject: [PATCH] [DOC] Similarity docstring (#2056) * added docstrings in series_search * Added docstrings to more files in similarity_search * Changes in squared_distance_profile * Pre-commit fixes * requested changes made --- aeon/similarity_search/_commons.py | 44 +++++++ .../squared_distance_profile.py | 111 +++++++++++++++++- .../matrix_profiles/naive_matrix_profile.py | 5 + .../matrix_profiles/stomp.py | 78 ++++++++++++ aeon/similarity_search/series_search.py | 12 +- 5 files changed, 247 insertions(+), 3 deletions(-) diff --git a/aeon/similarity_search/_commons.py b/aeon/similarity_search/_commons.py index e31b75b88c..22ecbbeaea 100644 --- a/aeon/similarity_search/_commons.py +++ b/aeon/similarity_search/_commons.py @@ -13,6 +13,12 @@ def fft_sliding_dot_product(X, q): """ Use FFT convolution to calculate the sliding window dot product. + This function applies the Fast Fourier Transform (FFT) to efficiently compute + the sliding dot product between the input time series `X` and the query `q`. + The dot product is computed for each channel individually. The sliding window + approach ensures that the dot product is calculated for every possible subsequence + of `X` that matches the length of `q` + Parameters ---------- X : array, shape=(n_channels, n_timepoints) @@ -135,6 +141,44 @@ def extract_top_k_and_threshold_from_distance_profiles_one_series( exclusion_size=None, inverse_distance=False, ): + """ + Extract the top-k smallest values from distance profiles and apply threshold. + + This function processes a distance profile and extracts the top-k smallest + distance values, optionally applying a threshold to exclude distances above + a given value. It also optionally handles exclusion zones to avoid selecting + neighboring timestamps. 
+ + Parameters + ---------- + distance_profiles : np.ndarray, 2D array of shape (n_cases, n_candidates) + Precomputed distance profile. Can be a TypedList if n_candidates varies between + cases. + id_x : int + Identifier of the series or subsequence from which the distance profile + is computed. + k : int + Number of matches to return. + threshold : float + All matches below this threshold will be returned. + exclusion_size : int or None, optional, default=None + Size of the exclusion zone around the current subsequence. This prevents + selecting neighboring subsequences within the specified range, useful for + avoiding trivial matches in time series data. If set to `None`, no + exclusion zone is applied. + inverse_distance : bool, optional + Whether to return the worst matches instead of the best. The default is False. + + Returns + ------- + top_k_dist : np.ndarray + Array of the top-k smallest distance values, potentially excluding values above + the threshold or those within the exclusion zone. + top_k : np.ndarray + Array of shape (k, 2) where each row contains the `id_x` identifier and the + index of the corresponding subsequence (or timestamp) with the top-k smallest + distances. + """ if inverse_distance: # To avoid div by 0 case distance_profiles += 1e-8 diff --git a/aeon/similarity_search/distance_profiles/squared_distance_profile.py b/aeon/similarity_search/distance_profiles/squared_distance_profile.py index 83a6805ff8..c4de7ed836 100644 --- a/aeon/similarity_search/distance_profiles/squared_distance_profile.py +++ b/aeon/similarity_search/distance_profiles/squared_distance_profile.py @@ -114,6 +114,32 @@ def normalized_squared_distance_profile( @njit(cache=True, fastmath=True, parallel=True) def _squared_distance_profile(QX, X, q, mask): + """ + Compute squared distance profiles between query subsequence and time series.
+ + Parameters + ---------- + QX : List of np.ndarray + List of precomputed dot products between queries and time series, with each + element corresponding to a different time series. + Shape of each array is (n_channels, n_timepoints - query_length + 1). + X : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) + The input samples. If X is an unequal length collection, expect a numba + TypedList of 2D arrays of shape (n_channels, n_timepoints). + q : np.ndarray, 2D array of shape (n_channels, query_length) + The query used for similarity search. + mask : np.ndarray, 2D array of shape (n_cases, n_timepoints - query_length + 1) + Boolean mask of the shape of the distance profile indicating for which part + of it the distance should be computed. + + Returns + ------- + distance_profiles : np.ndarray + 3D array of shape (n_cases, n_channels, n_timepoints - query_length + 1) + The distance profile between q and the input time series X independently + for each channel. + + """ distance_profiles = List() query_length = q.shape[1] n_channels = q.shape[0] @@ -137,6 +163,29 @@ def _squared_distance_profile(QX, X, q, mask): @njit(cache=True, fastmath=True) def _squared_dist_profile_one_series(QT, T, Q): + """ + Compute squared distance profile between query subsequence and a single time series. + + This function calculates the squared distance profile for a single time series by + leveraging the dot product of the query and time series as well as precomputed sums + of squares to efficiently compute the squared distances. + + Parameters + ---------- + QT : np.ndarray, 2D array of shape (n_channels, n_timepoints - query_length + 1) + The dot product between the query and the time series. + T : np.ndarray, 2D array of shape (n_channels, series_length) + The series used for similarity search. Note that series_length can be equal to, + greater than or less than n_timepoints.
+ Q : np.ndarray + 2D array of shape (n_channels, query_length) representing the query subsequence. + + Returns + ------- + distance_profile : np.ndarray + 2D array of shape (n_channels, n_timepoints - query_length + 1) + The squared distance profile between the query and the input time series. + """ n_channels, profile_length = QT.shape query_length = Q.shape[1] distance_profile = -2 * QT @@ -159,6 +208,36 @@ def _squared_dist_profile_one_series(QT, T, Q): def _normalized_squared_distance_profile( QX, mask, X_means, X_stds, q_means, q_stds, query_length ): + """ + Compute the normalized squared distance profiles between query subsequence and input time series. + + Parameters + ---------- + QX : List of np.ndarray + List of precomputed dot products between queries and time series, with each element + corresponding to a different time series. + Shape of each array is (n_channels, n_timepoints - query_length + 1). + mask : np.ndarray, 2D array of shape (n_cases, n_timepoints - query_length + 1) + Boolean mask of the shape of the distance profile indicating for which part + of it the distance should be computed. + X_means : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - query_length + 1) # noqa: E501 + Means of each subsequence of X of size query_length. + X_stds : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - query_length + 1) # noqa: E501 + Stds of each subsequence of X of size query_length. + q_means : np.ndarray, 1D array of shape (n_channels) + Means of the query q. + q_stds : np.ndarray, 1D array of shape (n_channels) + Stds of the query q. + query_length : int + The length of the query subsequence used for the distance profile computation. + + Returns + ------- + List of np.ndarray + List of 2D arrays, each of shape (n_channels, n_timepoints - query_length + 1). + Each array contains the normalized squared distance profile between the query subsequence and the corresponding time series.
+ Entries in the array are set to infinity where the mask is False. + """ distance_profiles = List() n_channels = q_means.shape[0] Q_is_constant = q_stds <= AEON_NUMBA_STD_THRESHOLD @@ -189,7 +268,37 @@ def _normalized_squared_distance_profile( def _normalized_squared_dist_profile_one_series( QT, T_means, T_stds, Q_means, Q_stds, query_length, Q_is_constant ): - # Compute znormalized squared euclidean distance + """ + Compute the z-normalized squared Euclidean distance profile for one time series. + + Parameters + ---------- + QT : np.ndarray, 2D array of shape (n_channels, n_timepoints - query_length + 1) + The dot product between the query and the time series. + T_means : np.ndarray, 2D array of shape (n_channels, profile_length) + The mean values of the time series for each channel and position. + + T_stds : np.ndarray, 2D array of shape (n_channels, profile_length) + The standard deviations of the time series for each channel and position. + Q_means : np.ndarray, 1D array of shape (n_channels) + Means of the query. + Q_stds : np.ndarray, 1D array of shape (n_channels) + Stds of the query. + query_length : int + The length of the query subsequence used for the distance profile computation. + Q_is_constant : np.ndarray + 1D array of shape (n_channels,) where each element is a Boolean indicating + whether the query standard deviation for that channel is less than or equal + to AEON_NUMBA_STD_THRESHOLD. + + Returns + ------- + np.ndarray + 2D array of shape (n_channels, n_timepoints - query_length + 1) containing the + z-normalized squared distance profile between the query subsequence and the time + series. Entries are computed based on the z-normalized values, with special + handling for constant values.
+ """ n_channels, profile_length = QT.shape distance_profile = np.full((n_channels, profile_length), np.inf) diff --git a/aeon/similarity_search/matrix_profiles/naive_matrix_profile.py b/aeon/similarity_search/matrix_profiles/naive_matrix_profile.py index 0b49159ec8..04caa908c4 100644 --- a/aeon/similarity_search/matrix_profiles/naive_matrix_profile.py +++ b/aeon/similarity_search/matrix_profiles/naive_matrix_profile.py @@ -30,6 +30,11 @@ def naive_matrix_profile( """ Compute a matrix profile in a naive way, by looping through a query search. + The matrix profile is computed by comparing each subsequence of the input series `T` + against the input samples `X`. The function uses a similarity search approach with + a variety of distance measures and can apply exclusion zones to avoid overlapping + matches. The result includes both the distances and the indices of the best matches. + Parameters ---------- X: np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) diff --git a/aeon/similarity_search/matrix_profiles/stomp.py b/aeon/similarity_search/matrix_profiles/stomp.py index 5bfdfa0959..3524c6fea9 100644 --- a/aeon/similarity_search/matrix_profiles/stomp.py +++ b/aeon/similarity_search/matrix_profiles/stomp.py @@ -402,6 +402,57 @@ def _stomp_normalized( exclusion_size, inverse_distance, ): + """ + Compute the Matrix Profile using the STOMP algorithm with normalized distances. + + X : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints) + The input samples. If X is an unequal length collection, expect a TypedList + of 2D arrays of shape (n_channels, n_timepoints). + T : np.ndarray, 2D array of shape (n_channels, series_length) + The series used for similarity search. Note that series_length can be equal to, + greater than or less than n_timepoints. + L : int + Length of the subsequences used for the distance computation.
+ XdotT : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1) + Precomputed dot products between each time series in X and the query series T. + X_means : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1) + Means of each subsequence of X of size L. Should be a numba TypedList if X is + unequal length. + X_stds : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1) + Stds of each subsequence of X of size L. Should be a numba TypedList if X is + unequal length. + T_means : np.ndarray, 2D array of shape (n_channels, series_length - L + 1) + Means of each subsequence of T of size L. + T_stds : np.ndarray, 2D array of shape (n_channels, series_length - L + 1) + Stds of each subsequence of T of size L. + mask : np.ndarray, 2D array of shape (n_cases, n_timepoints - L + 1) + Boolean mask of the shape of the distance profiles indicating for which part + of it the distance should be computed. In this context, it is the mask for the + first query of size L in T. This mask will be updated during the algorithm. + k : int, default=1 + The number of best matches to return during predict for each subsequence. + threshold : float, default=np.inf + All matches with a distance to the query below this threshold will be returned. + inverse_distance : bool, default=False + If True, the matching will be made on the inverse of the distance, and thus, the + worst matches to the query will be returned instead of the best ones. + exclusion_size : int, optional + The size of the exclusion zone used to prevent returning as top k candidates + the ones that are close to each other (for example i and i+1). + It is used to define a region between + :math:`id_timestamp - exclusion_size` and + :math:`id_timestamp + exclusion_size` which cannot be returned + as best match if :math:`id_timestamp` was already selected. By default, + the value None means that this is not used.
+ + Returns + ------- + tuple of np.ndarray + - MP : array of shape (n_queries,) + Matrix profile distances for each query subsequence. + - IP : array of shape (n_queries,) + Indexes of the top matches for each query subsequence. + """ n_queries = T.shape[1] - L + 1 MP = np.empty(n_queries, dtype=object) IP = np.empty(n_queries, dtype=object) @@ -499,6 +550,33 @@ def _stomp( def _sort_out_tops(top_dists, prev_top_dists, top_indexes, prev_to_indexes, k): + """ + Sort and combine top distance results from previous and current computations. + + Parameters + ---------- + top_dists : np.ndarray + Array of distances from the current computation. Shape should be (n,). + prev_top_dists : np.ndarray + Array of distances from previous computations. Shape should be (n,). + top_indexes : np.ndarray + Array of indexes corresponding to the top distances from current computation. + Shape should be (n,). + prev_to_indexes : np.ndarray + Array of indexes corresponding to the top distances from previous computations. + Shape should be (n,). + k : int + The number of best matches to return during predict for each subsequence. + + Returns + ------- + tuple + A tuple containing two elements: + - A 1D numpy array of sorted distances, of length min(k, + total number of distances). + - A 1D numpy array of indexes corresponding to the sorted distances, + of length min(k, total number of distances). + """ all_dists = np.concatenate((prev_top_dists, top_dists)) all_indexes = np.concatenate((prev_to_indexes, top_indexes)) if k == np.inf: diff --git a/aeon/similarity_search/series_search.py b/aeon/similarity_search/series_search.py index 69867216e2..bcf3d7fe53 100644 --- a/aeon/similarity_search/series_search.py +++ b/aeon/similarity_search/series_search.py @@ -141,7 +141,10 @@ def predict( apply_exclusion_to_result=False, ): """ - Predict function. + Predict method: check the shape of X and call _predict to perform the search.
+ + If the distance profile function is normalized, it stores the means and stds + from X and X_, where X_ is the training data. Parameters ---------- @@ -243,7 +246,12 @@ def _predict( apply_exclusion_to_result, ): """ - Call the matrix profile function. + Private predict method for SeriesSearch. + + This method calculates the matrix profile for a given time series dataset by + comparing all possible subsequences of a specified length against a reference + time series. It handles exclusion zones to prevent nearby matches from being + selected and supports normalization. Parameters ----------