Merge branch 'main' of https://github.com/aeon-toolkit/aeon into mm/dependabot

# Conflicts: # sweep.yaml
aeon-toolkit · Nov 13, 2023 · 7e86da2 · 7e86da2
2 parents 6b8d09c + 3482819
commit 7e86da2
Show file tree

Hide file tree

Showing 41 changed files with 926 additions and 659 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -62,13 +62,13 @@ repos:
         args: [ "--convention=numpy" ]
         additional_dependencies: [ toml, tomli ]
 
-# not ready to be enabled yet
-#  - repo: https://github.com/pre-commit/mirrors-mypy
-#    rev: v1.0.1
-#    hooks:
-#      - id: mypy
-#        files: aeon/
-#        additional_dependencies: [ pytest ]
+  # not ready to be enabled yet
+  #  - repo: https://github.com/pre-commit/mirrors-mypy
+  #    rev: v1.0.1
+  #    hooks:
+  #      - id: mypy
+  #        files: aeon/
+  #        additional_dependencies: [ pytest ]
 
   - repo: https://github.com/mgedmin/check-manifest
     rev: "0.49"

diff --git a/aeon/annotation/__init__.py b/aeon/annotation/__init__.py
@@ -1 +1 @@
-"""Implements time series annotation."""
+"""Time series annotation."""
diff --git a/aeon/annotation/eagglo.py b/aeon/annotation/eagglo.py
@@ -110,7 +110,7 @@ def _fit(self, X: pd.DataFrame, y=None):
         X : pd.DataFrame
             Data for anomaly detection (time series).
         y : pd.Series, optional
-            Not used for this unsupervsed method.
+            Not used for this unsupervised method.
 
         Returns
         -------

diff --git a/aeon/annotation/hmm_learn/gaussian.py b/aeon/annotation/hmm_learn/gaussian.py
@@ -19,7 +19,7 @@ class GaussianHMM(BaseHMMLearn):
     ----------
     n_components : int
         Number of states
-    covariance_type : {"sperical", "diag", "full", "tied"}, optional
+    covariance_type : {"spherical", "diag", "full", "tied"}, optional
         The type of covariance parameters to use:
         * "spherical" --- each state uses a single variance value that
             applies to all features.

diff --git a/aeon/anomaly_detection/__init__.py b/aeon/anomaly_detection/__init__.py
@@ -0,0 +1,6 @@
+"""Time Series Anomaly Detection."""
+__all__ = [
+    "STRAY",
+]
+
+from aeon.anomaly_detection._stray import STRAY
diff --git a/aeon/annotation/stray.py → aeon/anomaly_detection/_stray.py b/aeon/annotation/stray.py → aeon/anomaly_detection/_stray.py
@@ -16,8 +16,8 @@
 class STRAY(BaseTransformer):
     """STRAY: robust anomaly detection in data streams with concept drift.
 
-    This is based on STRAY (Search TRace AnomalY) _[1], which is a modification
-    of HDoutliers _[2]. HDoutliers is a powerful algorithm for the detection of
+    This is based on STRAY (Search TRace AnomalY) [1]_, which is a modification
+    of HDoutliers [2]_. HDoutliers is a powerful algorithm for the detection of
     anomalous observations in a dataset, which has (among other advantages) the
     ability to detect clusters of outliers in multi-dimensional data without
     requiring a model of the typical behavior of the system. However, it suffers
@@ -63,7 +63,7 @@ class STRAY(BaseTransformer):
 
     Examples
     --------
-    >>> from aeon.annotation.stray import STRAY
+    >>> from aeon.anomaly_detection import STRAY
     >>> from aeon.datasets import load_airline
     >>> from sklearn.preprocessing import MinMaxScaler
     >>> import numpy as np

diff --git a/aeon/anomaly_detection/tests/__init__.py b/aeon/anomaly_detection/tests/__init__.py
@@ -0,0 +1 @@
+"""Tests for anomaly detection."""
diff --git a/aeon/annotation/tests/test_stray.py → aeon/anomaly_detection/tests/test_stray.py b/aeon/annotation/tests/test_stray.py → aeon/anomaly_detection/tests/test_stray.py
@@ -5,7 +5,7 @@
 import numpy as np
 from sklearn.preprocessing import MinMaxScaler, RobustScaler
 
-from aeon.annotation.stray import STRAY
+from aeon.anomaly_detection import STRAY
 
 
 def test_default_1D():

diff --git a/aeon/benchmarking/tests/test_results_loaders.py b/aeon/benchmarking/tests/test_results_loaders.py
@@ -1,4 +1,5 @@
 """Result loading tests."""
+
 import os
 
 import pytest

diff --git a/aeon/classification/shapelet_based/_rdst.py b/aeon/classification/shapelet_based/_rdst.py
@@ -79,10 +79,6 @@ class RDSTClassifier(BaseClassifier):
         The number of unique classes in the training set.
     fit_time_  : int
         The time (in milliseconds) for ``fit`` to run.
-    n_instances_ : int
-        The number of train cases in the training set.
-    n_dims_ : int
-        The number of dimensions per case in the training set.
     series_length_ : int
         The length of each series in the training set.
     transformed_data_ : list of shape (n_estimators) of ndarray
@@ -120,7 +116,9 @@ class RDSTClassifier(BaseClassifier):
 
     _tags = {
         "capability:multivariate": True,
+        "capability:unequal_length": True,
         "capability:multithreading": True,
+        "X_inner_type": ["np-list", "numpy3D"],
         "non-deterministic": True,  # due to random_state bug in MacOS #324
         "algorithm_type": "shapelet",
     }
@@ -149,10 +147,6 @@ def __init__(
         self.save_transformed_data = save_transformed_data
         self.random_state = random_state
         self.n_jobs = n_jobs
-
-        self.n_instances_ = 0
-        self.n_dims_ = 0
-        self.series_length_ = 0
         self.transformed_data_ = []
 
         self._transformer = None
@@ -180,8 +174,6 @@ def _fit(self, X, y):
         Changes state by creating a fitted model that updates attributes
         ending in "_".
         """
-        self.n_instances_, self.n_dims_, self.series_length_ = X.shape
-
         self._transformer = RandomDilatedShapeletTransform(
             max_shapelets=self.max_shapelets,
             shapelet_lengths=self.shapelet_lengths,
@@ -250,9 +242,9 @@ def _predict_proba(self, X) -> np.ndarray:
         if callable(m):
             return self._estimator.predict_proba(X_t)
         else:
-            dists = np.zeros((X.shape[0], self.n_classes_))
+            dists = np.zeros((len(X), self.n_classes_))
             preds = self._estimator.predict(X_t)
-            for i in range(0, X.shape[0]):
+            for i in range(0, len(X)):
                 dists[i, np.where(self.classes_ == preds[i])] = 1
             return dists
 

diff --git a/aeon/datasets/_data_loaders.py b/aeon/datasets/_data_loaders.py
@@ -103,6 +103,20 @@ def _load_header_info(file):
     return meta_data
 
 
+def _get_channel_strings(line, target, missing):
+    """Split a string with timestamps into separate csv strings."""
+    channel_strings = re.sub(r"\s", "", line)
+    channel_strings = channel_strings.split("):")
+    c = len(channel_strings)
+    if target:
+        c = c - 1
+    for i in range(c):
+        channel_strings[i] = channel_strings[i] + ")"
+        numbers = re.findall(r"\d+\.\d+|" + missing, channel_strings[i])
+        channel_strings[i] = ",".join(numbers)
+    return channel_strings
+
+
 def _load_data(file, meta_data, replace_missing_vals_with="NaN"):
     """Load data from a file with no header.
 
@@ -133,13 +147,20 @@ def _load_data(file, meta_data, replace_missing_vals_with="NaN"):
     current_channels = 0
     series_length = 0
     y_values = []
+    target = False
+    if meta_data["classlabel"] or meta_data["targetlabel"]:
+        target = True
     for line in file:
         line = line.strip().lower()
+        line = line.replace("nan", replace_missing_vals_with)
         line = line.replace("?", replace_missing_vals_with)
-        channels = line.split(":")
+        if meta_data["timestamps"]:
+            channels = _get_channel_strings(line, target, replace_missing_vals_with)
+        else:
+            channels = line.split(":")
         n_cases += 1
         current_channels = len(channels)
-        if meta_data["classlabel"] or meta_data["targetlabel"]:
+        if target:
             current_channels -= 1
         if n_cases == 1:  # Find n_channels and length  from first if not unequal
             n_channels = current_channels
@@ -179,7 +200,7 @@ def _load_data(file, meta_data, replace_missing_vals_with="NaN"):
                 )
             np_case[i] = np.array(data_series)
         data.append(np_case)
-        if meta_data["classlabel"] or meta_data["targetlabel"]:
+        if target:
             y_values.append(channels[n_channels])
     if meta_data["equallength"]:
         data = np.array(data)
@@ -1129,7 +1150,27 @@ def load_forecasting(name, extract_path=None, return_metadata=True):
 
 
 def load_regression(name, split=None, extract_path=None, return_metadata=True):
-    """Download/load forecasting problem from https://forecastingdata.org/.
+    """Download/load regression problem from http://tseregression.org/.
+
+    If you want to load a problem from a local file, specify the
+    location in ``extract_path``. This function assumes the data is stored in format
+    <extract_path>/<name>/<name>_TRAIN.ts and <extract_path>/<name>/<name>_TEST.ts.
+    If you want to load a file directly from a full path, use the function
+    ``load_from_tsfile`` directly. If you do not specify ``extract_path``, or if the
+    problem is not present in ``extract_path`` it will attempt to download the data
+    from http://tseregression.org/.
+
+    The list of problems this function can download from the website is in
+    ``datasets/tser_lists.py``.  This function can load timestamped data, but it does
+    not store the time stamps. The time stamp loading is fragile: it will only work
+    if all data are floats.
+
+    Data is assumed to be in the standard .ts format: each row is a (possibly
+    multivariate) time series. Each dimension is separated by a colon, each value in
+    a series is comma separated. For examples see aeon.datasets.data. ArrowHead
+    is an example of a univariate equal length problem, BasicMotions an equal length
+    multivariate problem.
+
 
     Parameters
     ----------
@@ -1226,18 +1267,24 @@ def load_regression(name, split=None, extract_path=None, return_metadata=True):
 def load_classification(name, split=None, extract_path=None, return_metadata=True):
     """Load a classification dataset.
 
-    Loads a TSC dataset from extract_path, or from timeseriesclassification.com,
-    if not on extract path.
+    If you want to load a problem from a local file, specify the
+    location in ``extract_path``. This function assumes the data is stored in format
+    <extract_path>/<name>/<name>_TRAIN.ts and <extract_path>/<name>/<name>_TEST.ts.
+    If you want to load a file directly from a full path, use the function
+    ``load_from_tsfile`` directly. If you do not specify ``extract_path``, or if the
+    problem is not present in ``extract_path`` it will attempt to download the data
+    from https://timeseriesclassification.com/.
+
+    The list of problems this function can download from the website is in
+    ``datasets/tsc_lists.py``.  This function can load timestamped data, but it does
+    not store the time stamps. The time stamp loading is fragile: it will only work
+    if all data are floats.
 
     Data is assumed to be in the standard .ts format: each row is a (possibly
-    multivariate) time series.
-    Each dimension is separated by a colon, each value in a series is comma
-    separated. For examples see aeon.datasets.data.tsc. ArrowHead is an example of
-    a univariate equal length problem, BasicMotions an equal length multivariate
-    problem.
-
-    Data is stored in extract_path/name/name.ts, extract_path/name/name_TRAIN.ts and
-    extract_path/name/name_TEST.ts.
+    multivariate) time series. Each dimension is separated by a colon, each value in
+    a series is comma separated. For examples see aeon.datasets.data. ArrowHead
+    is an example of a univariate equal length problem, BasicMotions an equal length
+    multivariate problem.
 
     Parameters
     ----------

diff --git a/aeon/datasets/_data_writers.py b/aeon/datasets/_data_writers.py
@@ -203,7 +203,7 @@ def write_results_to_uea_format(
     third_line : str
         summary performance information (see comment below)
     """
-    if len(y_true) != len(y_pred):
+    if y_true is not None and len(y_true) != len(y_pred):
         raise IndexError(
             "The number of predicted values is not the same as the "
             "number of actual class values"
@@ -287,11 +287,7 @@ def _write_header(
     extension=None,
 ):
     if class_labels is not None and regression:
-        raise ValueError(
-            "Cannot have class_labels and targetlabel. If the problem "
-            "is classification, add class_labels. If regression, "
-            "set targetlabel to true."
-        )
+        raise ValueError("Cannot have class_labels true for a regression problem")
     # create path if it does not exist
     dir = f"{str(path)}/"
     try: