Skip to content

Commit

Permalink
[DOC] Similarity docstring (#2056)
Browse files Browse the repository at this point in the history
* added docstrings in series_search

* Added docstrings to more files in similarity_search

* Changes in squared_distance_profile

* Pre-commit fixes

* requested changes made
  • Loading branch information
aryanpola authored Sep 26, 2024
1 parent 8685600 commit fbb1866
Show file tree
Hide file tree
Showing 5 changed files with 247 additions and 3 deletions.
44 changes: 44 additions & 0 deletions aeon/similarity_search/_commons.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@ def fft_sliding_dot_product(X, q):
"""
Use FFT convolution to calculate the sliding window dot product.
This function applies the Fast Fourier Transform (FFT) to efficiently compute
the sliding dot product between the input time series `X` and the query `q`.
The dot product is computed for each channel individually. The sliding window
approach ensures that the dot product is calculated for every possible subsequence
of `X` that matches the length of `q`.
Parameters
----------
X : array, shape=(n_channels, n_timepoints)
Expand Down Expand Up @@ -135,6 +141,44 @@ def extract_top_k_and_threshold_from_distance_profiles_one_series(
exclusion_size=None,
inverse_distance=False,
):
"""
Extract the top-k smallest values from distance profiles and apply threshold.
This function processes a distance profile and extracts the top-k smallest
distance values, optionally applying a threshold to exclude distances above
a given value. It also optionally handles exclusion zones to avoid selecting
neighboring timestamps.
Parameters
----------
distance_profiles : np.ndarray, 2D array of shape (n_cases, n_candidates)
Precomputed distance profile. Can be a TypedList if n_candidates vary between
cases.
id_x : int
Identifier of the series or subsequence from which the distance profile
is computed.
k : int
Number of matches to return.
threshold : float
All matches below this threshold will be returned
exclusion_size : int or None, optional, default=None
Size of the exclusion zone around the current subsequence. This prevents
selecting neighboring subsequences within the specified range, useful for
avoiding trivial matches in time series data. If set to `None`, no
exclusion zone is applied.
inverse_distance : bool, optional
Whether to return the worst matches instead of the best ones. The default is False.
Returns
-------
top_k_dist : np.ndarray
Array of the top-k smallest distance values, potentially excluding values above
the threshold or those within the exclusion zone.
top_k : np.ndarray
Array of shape (k, 2) where each row contains the `id_x` identifier and the
index of the corresponding subsequence (or timestamp) with the top-k smallest
distances.
"""
if inverse_distance:
# To avoid div by 0 case
distance_profiles += 1e-8
Expand Down
111 changes: 110 additions & 1 deletion aeon/similarity_search/distance_profiles/squared_distance_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,32 @@ def normalized_squared_distance_profile(

@njit(cache=True, fastmath=True, parallel=True)
def _squared_distance_profile(QX, X, q, mask):
"""
Compute squared distance profiles between query subsequence and time series.
Parameters
----------
QX : List of np.ndarray
List of precomputed dot products between queries and time series, with each
element corresponding to a different time series.
Shape of each array is (n_channels, n_timepoints - query_length + 1).
X : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints)
The input samples. If X is an unequal length collection, expect a numba
TypedList of 2D arrays of shape (n_channels, n_timepoints).
q : np.ndarray, 2D array of shape (n_channels, query_length)
The query used for similarity search.
mask : np.ndarray, 2D array of shape (n_cases, n_timepoints - query_length + 1)
Boolean mask of the shape of the distance profile indicating for which part
of it the distance should be computed.
Returns
-------
distance_profiles : np.ndarray
3D array of shape (n_cases, n_channels, n_timepoints - query_length + 1)
The distance profile between q and the input time series X independently
for each channel.
"""
distance_profiles = List()
query_length = q.shape[1]
n_channels = q.shape[0]
Expand All @@ -137,6 +163,29 @@ def _squared_distance_profile(QX, X, q, mask):

@njit(cache=True, fastmath=True)
def _squared_dist_profile_one_series(QT, T, Q):
"""
Compute squared distance profile between query subsequence and a single time series.
This function calculates the squared distance profile for a single time series by
leveraging the dot product of the query and time series as well as precomputed sums
of squares to efficiently compute the squared distances.
Parameters
----------
QT : np.ndarray, 2D array of shape (n_channels, n_timepoints - query_length + 1)
The dot product between the query and the time series.
T : np.ndarray, 2D array of shape (n_channels, series_length)
The series used for similarity search. Note that series_length can be equal to,
greater than, or less than n_timepoints.
Q : np.ndarray
2D array of shape (n_channels, query_length) representing query subsequence.
Returns
-------
distance_profile : np.ndarray
2D array of shape (n_channels, n_timepoints - query_length + 1)
The squared distance profile between the query and the input time series.
"""
n_channels, profile_length = QT.shape
query_length = Q.shape[1]
distance_profile = -2 * QT
Expand All @@ -159,6 +208,36 @@ def _squared_dist_profile_one_series(QT, T, Q):
def _normalized_squared_distance_profile(
QX, mask, X_means, X_stds, q_means, q_stds, query_length
):
"""
Compute the normalized squared distance profiles between query subsequence and input time series.
Parameters
----------
QX : List of np.ndarray
List of precomputed dot products between queries and time series, with each element
corresponding to a different time series.
Shape of each array is (n_channels, n_timepoints - query_length + 1).
mask : np.ndarray, 2D array of shape (n_cases, n_timepoints - query_length + 1)
Boolean mask of the shape of the distance profile indicating for which part
of it the distance should be computed.
X_means : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - query_length + 1) # noqa: E501
Means of each subsequence of X of size query_length.
X_stds : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - query_length + 1) # noqa: E501
Stds of each subsequence of X of size query_length.
q_means : np.ndarray, 1D array of shape (n_channels)
Means of the query q
q_stds : np.ndarray, 1D array of shape (n_channels)
Stds of the query q
query_length : int
The length of the query subsequence used for the distance profile computation.
Returns
-------
List of np.ndarray
List of 2D arrays, each of shape (n_channels, n_timepoints - query_length + 1).
Each array contains the normalized squared distance profile between the query subsequence and the corresponding time series.
Entries in the array are set to infinity where the mask is False.
"""
distance_profiles = List()
n_channels = q_means.shape[0]
Q_is_constant = q_stds <= AEON_NUMBA_STD_THRESHOLD
Expand Down Expand Up @@ -189,7 +268,37 @@ def _normalized_squared_distance_profile(
def _normalized_squared_dist_profile_one_series(
QT, T_means, T_stds, Q_means, Q_stds, query_length, Q_is_constant
):
# Compute znormalized squared euclidean distance
"""
Compute the z-normalized squared Euclidean distance profile for one time series.
Parameters
----------
QT : np.ndarray, 2D array of shape (n_channels, n_timepoints - query_length + 1)
The dot product between the query and the time series.
T_means : np.ndarray, 2D array of shape (n_channels, profile_length)
The mean values of the time series for each channel and position.
T_stds : np.ndarray, 2D array of shape (n_channels, profile_length)
The standard deviations of the time series for each channel and position.
Q_means : np.ndarray, 1D array of shape (n_channels)
Means of the query q
Q_stds : np.ndarray, 1D array of shape (n_channels)
Stds of the query q
query_length : int
The length of the query subsequence used for the distance profile computation.
Q_is_constant : np.ndarray
1D array of shape (n_channels,) where each element is a Boolean indicating
whether the query standard deviation for that channel is less than or equal
to a specified threshold.
Returns
-------
np.ndarray
2D array of shape (n_channels, n_timepoints - query_length + 1) containing the
z-normalized squared distance profile between the query subsequence and the time
series. Entries are computed based on the z-normalized values, with special
handling for constant values.
"""
n_channels, profile_length = QT.shape
distance_profile = np.full((n_channels, profile_length), np.inf)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ def naive_matrix_profile(
"""
Compute a matrix profile in a naive way, by looping through a query search.
The matrix profile is computed by comparing each subsequence of the input series `T`
against the input samples `X`. The function uses a similarity search approach with
a variety of distance measures and can apply exclusion zones to avoid overlapping
matches. The result includes both the distances and the indices of the best matches.
Parameters
----------
X: np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints)
Expand Down
78 changes: 78 additions & 0 deletions aeon/similarity_search/matrix_profiles/stomp.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,57 @@ def _stomp_normalized(
exclusion_size,
inverse_distance,
):
"""
Compute the Matrix Profile using the STOMP algorithm with normalized distances.
Parameters
----------
X : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints)
The input samples. If X is an unequal length collection, expect a TypedList
of 2D arrays of shape (n_channels, n_timepoints)
T : np.ndarray, 2D array of shape (n_channels, series_length)
The series used for similarity search. Note that series_length can be equal to,
greater than, or less than n_timepoints.
L : int
Length of the subsequences used for the distance computation.
XdotT : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1)
Precomputed dot products between each time series in X and the query series T.
X_means : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1)
Means of each subsequence of X of size L. Should be a numba TypedList if X is
unequal length.
X_stds : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1)
Stds of each subsequence of X of size L. Should be a numba TypedList if X is
unequal length.
T_means : np.ndarray, 2D array of shape (n_channels, n_timepoints - L + 1)
Means of each subsequence of T of size L.
T_stds : np.ndarray, 2D array of shape (n_channels, n_timepoints - L + 1)
Stds of each subsequence of T of size L.
mask : np.ndarray, 2D array of shape (n_cases, n_timepoints - L + 1)
Boolean mask of the shape of the distance profiles indicating for which part
of it the distance should be computed. In this context, it is the mask for the
first query of size L in T. This mask will be updated during the algorithm.
k : int, default=1
The number of best matches to return during predict for each subsequence.
threshold : float, default=np.inf
Distance threshold for valid matches: all matches below this threshold are
returned, and candidates with a distance above it are excluded.
inverse_distance : bool, default=False
If True, the matching will be made on the inverse of the distance, and thus, the
worst matches to the query will be returned instead of the best ones.
exclusion_size : int, optional
The size of the exclusion zone used to prevent returning as top k candidates
the ones that are close to each other (for example i and i+1).
It is used to define a region between
:math:`id_timestamp - exclusion_size` and
:math:`id_timestamp + exclusion_size` which cannot be returned
as best match if :math:`id_timestamp` was already selected. By default,
the value None means that this is not used.
Returns
-------
tuple of np.ndarray
- MP : array of shape (n_queries,)
Matrix profile distances for each query subsequence.
- IP : array of shape (n_queries,)
Indexes of the top matches for each query subsequence.
"""
n_queries = T.shape[1] - L + 1
MP = np.empty(n_queries, dtype=object)
IP = np.empty(n_queries, dtype=object)
Expand Down Expand Up @@ -499,6 +550,33 @@ def _stomp(


def _sort_out_tops(top_dists, prev_top_dists, top_indexes, prev_to_indexes, k):
"""
Sort and combine top distance results from previous and current computations.
Parameters
----------
top_dists : np.ndarray
Array of distances from the current computation. Shape should be (n,).
prev_top_dists : np.ndarray
Array of distances from previous computations. Shape should be (n,).
top_indexes : np.ndarray
Array of indexes corresponding to the top distances from current computation.
Shape should be (n,).
prev_to_indexes : np.ndarray
Array of indexes corresponding to the top distances from previous computations.
Shape should be (n,).
k : int
The number of best matches to return during predict for each subsequence.
Returns
-------
tuple
A tuple containing two elements:
- A 1D numpy array of sorted distances, of length min(k,
total number of distances).
- A 1D numpy array of indexes corresponding to the sorted distances,
of length min(k, total number of distances).
"""
all_dists = np.concatenate((prev_top_dists, top_dists))
all_indexes = np.concatenate((prev_to_indexes, top_indexes))
if k == np.inf:
Expand Down
12 changes: 10 additions & 2 deletions aeon/similarity_search/series_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,10 @@ def predict(
apply_exclusion_to_result=False,
):
"""
Predict function.
Predict method: check the shape of X and call _predict to perform the search.
If the distance profile function is normalized, it stores the means and stds
of X and X_, where X_ is the training data.
Parameters
----------
Expand Down Expand Up @@ -243,7 +246,12 @@ def _predict(
apply_exclusion_to_result,
):
"""
Call the matrix profile function.
Private predict method for SeriesSearch.
This method calculates the matrix profile for a given time series dataset by
comparing all possible subsequences of a specified length against a reference
time series. It handles exclusion zones to prevent nearby matches from being
selected and supports normalization.
Parameters
----------
Expand Down

0 comments on commit fbb1866

Please sign in to comment.