[DOC] Similarity docstring (#2056)

* added docstrings in series_search * Added docstrings to more files in similarity_search * Changes in squared_distance_profile * Pre-commit fixes * requested changes made
aeon-toolkit · Sep 26, 2024 · fbb1866 · fbb1866
1 parent 8685600
commit fbb1866
Show file tree

Hide file tree

Showing 5 changed files with 247 additions and 3 deletions.
diff --git a/aeon/similarity_search/_commons.py b/aeon/similarity_search/_commons.py
@@ -13,6 +13,12 @@ def fft_sliding_dot_product(X, q):
     """
     Use FFT convolution to calculate the sliding window dot product.
 
+    This function applies the Fast Fourier Transform (FFT) to efficiently compute
+    the sliding dot product between the input time series `X` and the query `q`.
+    The dot product is computed for each channel individually. The sliding window
+    approach ensures that the dot product is calculated for every possible subsequence
+    of `X` that matches the length of `q`.
+
     Parameters
     ----------
     X : array, shape=(n_channels, n_timepoints)
@@ -135,6 +141,44 @@ def extract_top_k_and_threshold_from_distance_profiles_one_series(
     exclusion_size=None,
     inverse_distance=False,
 ):
+    """
+    Extract the top-k smallest values from distance profiles and apply threshold.
+
+    This function processes a distance profile and extracts the top-k smallest
+    distance values, optionally applying a threshold to exclude distances above
+    a given value. It also optionally handles exclusion zones to avoid selecting
+    neighboring timestamps.
+
+    Parameters
+    ----------
+    distance_profiles : np.ndarray, 2D array of shape (n_cases, n_candidates)
+        Precomputed distance profile. Can be a TypedList if n_candidates vary between
+        cases.
+    id_x : int
+        Identifier of the series or subsequence from which the distance profile
+        is computed.
+    k : int
+        Number of matches to return.
+    threshold : float
+        All matches below this threshold will be returned
+    exclusion_size : int or None, optional, default=None
+        Size of the exclusion zone around the current subsequence. This prevents
+        selecting neighboring subsequences within the specified range, useful for
+        avoiding trivial matches in time series data. If set to `None`, no
+        exclusion zone is applied.
+    inverse_distance : bool, optional
+        Whether to return the worst matches instead of the best ones.
+        The default is False.
+
+    Returns
+    -------
+    top_k_dist : np.ndarray
+        Array of the top-k smallest distance values, potentially excluding values above
+        the threshold or those within the exclusion zone.
+    top_k : np.ndarray
+        Array of shape (k, 2) where each row contains the `id_x` identifier and the
+        index of the corresponding subsequence (or timestamp) with the top-k smallest
+        distances.
+    """
     if inverse_distance:
         # To avoid div by 0 case
         distance_profiles += 1e-8

diff --git a/aeon/similarity_search/distance_profiles/squared_distance_profile.py b/aeon/similarity_search/distance_profiles/squared_distance_profile.py
@@ -114,6 +114,32 @@ def normalized_squared_distance_profile(
 
 @njit(cache=True, fastmath=True, parallel=True)
 def _squared_distance_profile(QX, X, q, mask):
+    """
+    Compute squared distance profiles between query subsequence and time series.
+
+    Parameters
+    ----------
+    QX : List of np.ndarray
+        List of precomputed dot products between queries and time series, with each
+        element corresponding to a different time series.
+        Shape of each array is (n_channels, n_timepoints - query_length + 1).
+    X : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints)
+        The input samples. If X is an unequal length collection, expect a numba
+        TypedList of 2D arrays of shape (n_channels, n_timepoints).
+    q : np.ndarray, 2D array of shape (n_channels, query_length)
+        The query used for similarity search.
+    mask : np.ndarray, 3D array of shape (n_cases, n_timepoints - query_length + 1)
+        Boolean mask of the shape of the distance profile indicating for which part
+        of it the distance should be computed.
+
+    Returns
+    -------
+    distance_profiles : np.ndarray
+        3D array of shape (n_cases, n_channels, n_timepoints - query_length + 1)
+        The distance profile between q and the input time series X independently
+        for each channel.
+
+    """
     distance_profiles = List()
     query_length = q.shape[1]
     n_channels = q.shape[0]
@@ -137,6 +163,29 @@ def _squared_distance_profile(QX, X, q, mask):
 
 @njit(cache=True, fastmath=True)
 def _squared_dist_profile_one_series(QT, T, Q):
+    """
+    Compute squared distance profile between query subsequence and a single time series.
+
+    This function calculates the squared distance profile for a single time series by
+    leveraging the dot product of the query and time series as well as precomputed sums
+    of squares to efficiently compute the squared distances.
+
+    Parameters
+    ----------
+    QT : np.ndarray, 2D array of shape (n_channels, n_timepoints - query_length + 1)
+        The dot product between the query and the time series.
+    T : np.ndarray, 2D array of shape (n_channels, series_length)
+        The series used for similarity search. Note that series_length may be equal
+        to, greater than or less than n_timepoints.
+    Q : np.ndarray
+        2D array of shape (n_channels, query_length) representing query subsequence.
+
+    Returns
+    -------
+    distance_profile : np.ndarray
+        2D array of shape (n_channels, n_timepoints - query_length + 1)
+        The squared distance profile between the query and the input time series.
+    """
     n_channels, profile_length = QT.shape
     query_length = Q.shape[1]
     distance_profile = -2 * QT
@@ -159,6 +208,36 @@ def _squared_dist_profile_one_series(QT, T, Q):
 def _normalized_squared_distance_profile(
     QX, mask, X_means, X_stds, q_means, q_stds, query_length
 ):
+    """
+    Compute the normalized squared distance profiles between query subsequence and input time series.
+
+    Parameters
+    ----------
+    QX : List of np.ndarray
+        List of precomputed dot products between queries and time series, with each element
+        corresponding to a different time series.
+        Shape of each array is (n_channels, n_timepoints - query_length + 1).
+    mask : np.ndarray, 3D array of shape (n_cases, n_timepoints - query_length + 1)
+        Boolean mask of the shape of the distance profile indicating for which part
+        of it the distance should be computed.
+    X_means : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - query_length + 1)  # noqa: E501
+        Means of each subsequence of X of size query_length
+    X_stds : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - query_length + 1)  # noqa: E501
+        Stds of each subsequence of X of size query_length
+    q_means : np.ndarray, 1D array of shape (n_channels)
+        Means of the query q
+    q_stds : np.ndarray, 1D array of shape (n_channels)
+        Stds of the query q
+    query_length : int
+        The length of the query subsequence used for the distance profile computation.
+
+    Returns
+    -------
+    List of np.ndarray
+        List of 2D arrays, each of shape (n_channels, n_timepoints - query_length + 1).
+        Each array contains the normalized squared distance profile between the query subsequence and the corresponding time series.
+        Entries in the array are set to infinity where the mask is False.
+    """
     distance_profiles = List()
     n_channels = q_means.shape[0]
     Q_is_constant = q_stds <= AEON_NUMBA_STD_THRESHOLD
@@ -189,7 +268,37 @@ def _normalized_squared_distance_profile(
 def _normalized_squared_dist_profile_one_series(
     QT, T_means, T_stds, Q_means, Q_stds, query_length, Q_is_constant
 ):
-    # Compute znormalized squared euclidean distance
+    """
+    Compute the z-normalized squared Euclidean distance profile for one time series.
+
+    Parameters
+    ----------
+    QT : np.ndarray, 2D array of shape (n_channels, n_timepoints - query_length + 1)
+        The dot product between the query and the time series.
+    T_means : np.ndarray, 2D array of shape (n_channels, profile_length)
+        The mean values of the time series for each channel and position.
+    T_stds : np.ndarray, 2D array of shape (n_channels, profile_length)
+        The standard deviations of the time series for each channel and position.
+    Q_means : np.ndarray, 1D array of shape (n_channels)
+        Means of the query q
+    Q_stds : np.ndarray, 1D array of shape (n_channels)
+        Stds of the query q
+    query_length : int
+        The length of the query subsequence used for the distance profile computation.
+    Q_is_constant : np.ndarray
+        1D array of shape (n_channels,) where each element is a Boolean indicating
+        whether the query standard deviation for that channel is less than or equal
+        to a specified threshold.
+
+    Returns
+    -------
+    np.ndarray
+        2D array of shape (n_channels, n_timepoints - query_length + 1) containing the
+        z-normalized squared distance profile between the query subsequence and the time
+        series. Entries are computed based on the z-normalized values, with special
+        handling for constant values.
+    """
     n_channels, profile_length = QT.shape
     distance_profile = np.full((n_channels, profile_length), np.inf)
 

diff --git a/aeon/similarity_search/matrix_profiles/naive_matrix_profile.py b/aeon/similarity_search/matrix_profiles/naive_matrix_profile.py
@@ -30,6 +30,11 @@ def naive_matrix_profile(
     """
     Compute a matrix profile in a naive way, by looping through a query search.
 
+    The matrix profile is computed by comparing each subsequence of the input series `T`
+    against the input samples `X`. The function uses a similarity search approach with
+    a variety of distance measures and can apply exclusion zones to avoid overlapping
+    matches. The result includes both the distances and the indices of the best matches.
+
     Parameters
     ----------
     X:  np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints)

diff --git a/aeon/similarity_search/matrix_profiles/stomp.py b/aeon/similarity_search/matrix_profiles/stomp.py
@@ -402,6 +402,57 @@ def _stomp_normalized(
     exclusion_size,
     inverse_distance,
 ):
+    """
+    Compute the Matrix Profile using the STOMP algorithm with normalized distances.
+
+    Parameters
+    ----------
+    X : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints)
+        The input samples. If X is an unequal length collection, expect a TypedList
+        of 2D arrays of shape (n_channels, n_timepoints).
+    T : np.ndarray, 2D array of shape (n_channels, series_length)
+        The series used for similarity search. Note that series_length may be equal
+        to, greater than or less than n_timepoints.
+    L : int
+        Length of the subsequences used for the distance computation.
+    XdotT : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1)
+        Precomputed dot products between each time series in X and the query series T.
+    X_means : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1)
+        Means of each subsequence of X of size L. Should be a numba TypedList if X is
+        unequal length.
+    X_stds : np.ndarray, 3D array of shape (n_cases, n_channels, n_timepoints - L + 1)
+        Stds of each subsequence of X of size L. Should be a numba TypedList if X is
+        unequal length.
+    T_means : np.ndarray, 2D array of shape (n_channels, series_length - L + 1)
+        Means of each subsequence of T of size L.
+    T_stds : np.ndarray, 2D array of shape (n_channels, series_length - L + 1)
+        Stds of each subsequence of T of size L.
+    mask : np.ndarray, 2D array of shape (n_cases, n_timepoints - length + 1)
+        Boolean mask of the shape of the distance profiles indicating for which part
+        of it the distance should be computed. In this context, it is the mask for the
+        first query of size L in T. This mask will be updated during the algorithm.
+    k : int, default=1
+        The number of best matches to return during predict for each subsequence.
+    threshold : float, default=np.inf
+        The distance threshold: all matches below this threshold will be returned.
+    inverse_distance : bool, default=False
+        If True, the matching will be made on the inverse of the distance, and thus, the
+        worst matches to the query will be returned instead of the best ones.
+    exclusion_size : int, optional
+        The size of the exclusion zone used to prevent returning as top k candidates
+        the ones that are close to each other (for example i and i+1).
+        It is used to define a region between
+        :math:`id_timestamp - exclusion_size` and
+        :math:`id_timestamp + exclusion_size` which cannot be returned
+        as best match if :math:`id_timestamp` was already selected. By default,
+        the value None means that this is not used.
+
+    Returns
+    -------
+    tuple of np.ndarray
+        - MP : array of shape (n_queries,)
+          Matrix profile distances for each query subsequence.
+        - IP : array of shape (n_queries,)
+          Indexes of the top matches for each query subsequence.
+    """
     n_queries = T.shape[1] - L + 1
     MP = np.empty(n_queries, dtype=object)
     IP = np.empty(n_queries, dtype=object)
@@ -499,6 +550,33 @@ def _stomp(
 
 
 def _sort_out_tops(top_dists, prev_top_dists, top_indexes, prev_to_indexes, k):
+    """
+    Sort and combine top distance results from previous and current computations.
+
+    Parameters
+    ----------
+    top_dists : np.ndarray
+        Array of distances from the current computation. Shape should be (n,).
+    prev_top_dists : np.ndarray
+        Array of distances from previous computations. Shape should be (n,).
+    top_indexes : np.ndarray
+        Array of indexes corresponding to the top distances from current computation.
+        Shape should be (n,).
+    prev_to_indexes : np.ndarray
+        Array of indexes corresponding to the top distances from previous computations.
+        Shape should be (n,).
+    k : int
+        The number of best matches to return during predict for each subsequence.
+        May be ``np.inf``, which is handled as a special case.
+
+    Returns
+    -------
+    tuple
+        A tuple containing two elements:
+        - A 1D numpy array of sorted distances, of length min(k,
+          total number of distances).
+        - A 1D numpy array of indexes corresponding to the sorted distances,
+          of length min(k, total number of distances).
+    """
     all_dists = np.concatenate((prev_top_dists, top_dists))
     all_indexes = np.concatenate((prev_to_indexes, top_indexes))
     if k == np.inf:

diff --git a/aeon/similarity_search/series_search.py b/aeon/similarity_search/series_search.py
@@ -141,7 +141,10 @@ def predict(
         apply_exclusion_to_result=False,
     ):
         """
-        Predict function.
+        Check the shape of X and call _predict to perform the search.
+
+        If the distance profile function is normalized, it stores the means and stds
+        from X and X_, with X_ the training data.
 
         Parameters
         ----------
@@ -243,7 +246,12 @@ def _predict(
         apply_exclusion_to_result,
     ):
         """
-        Call the matrix profile function.
+        Private predict method for SeriesSearch.
+
+        This method calculates the matrix profile for a given time series dataset by
+        comparing all possible subsequences of a specified length against a reference
+        time series. It handles exclusion zones to prevent nearby matches from being
+        selected and supports normalization.
 
         Parameters
         ----------