Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MNT] Unit testing revamp part 4: clustering #1877

Merged
merged 25 commits into from
Sep 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
ab45778
classification checks in progress
MatthewMiddlehurst Jul 8, 2024
a57186f
Merge branch 'main' of https://github.com/aeon-toolkit/aeon into mm/c…
MatthewMiddlehurst Jul 8, 2024
bf357d1
merge
MatthewMiddlehurst Jul 9, 2024
3c209bc
rework yield checks
MatthewMiddlehurst Jul 11, 2024
0d8440e
rework yield checks to allow for class input
MatthewMiddlehurst Jul 11, 2024
55a2810
fixes
MatthewMiddlehurst Jul 11, 2024
18f1b14
fix
MatthewMiddlehurst Jul 11, 2024
f1fec2c
Merge branch 'main' of https://github.com/aeon-toolkit/aeon into mm/c…
MatthewMiddlehurst Jul 11, 2024
c1688ea
pr testing split
MatthewMiddlehurst Jul 11, 2024
7b8f7c9
Merge branch 'main' of https://github.com/aeon-toolkit/aeon into mm/r…
MatthewMiddlehurst Jul 30, 2024
254bfed
classification fixes
MatthewMiddlehurst Jul 30, 2024
e8ea3c9
regressor tests
MatthewMiddlehurst Jul 31, 2024
e61f8e5
Merge branch 'main' of https://github.com/aeon-toolkit/aeon into mm/r…
MatthewMiddlehurst Jul 31, 2024
833d8fc
exclude tapnet
MatthewMiddlehurst Jul 31, 2024
b437f3c
missing value test data
MatthewMiddlehurst Jul 31, 2024
e1296bd
clusterer tests
MatthewMiddlehurst Jul 31, 2024
f9bdf97
fix
MatthewMiddlehurst Jul 31, 2024
a7817d0
Merge branch 'mm/regression-testing' of https://github.com/aeon-toolk…
MatthewMiddlehurst Jul 31, 2024
e1c8481
Merge branch 'main' of https://github.com/aeon-toolkit/aeon into mm/c…
MatthewMiddlehurst Aug 15, 2024
4ab5a71
dummy random state
MatthewMiddlehurst Aug 15, 2024
47d598e
dummy fix
MatthewMiddlehurst Aug 15, 2024
9c5b1c5
fixes and skips
MatthewMiddlehurst Aug 15, 2024
37215ad
Merge branch 'main' into mm/clustering-testing
MatthewMiddlehurst Sep 4, 2024
b781247
Merge branch 'main' into mm/clustering-testing
TonyBagnall Sep 12, 2024
0d8cb32
Merge branch 'main' into mm/clustering-testing
TonyBagnall Sep 16, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

This file was deleted.

28 changes: 19 additions & 9 deletions aeon/clustering/dummy.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Implements DummyClusterer to be used as Baseline."""

import numpy as np
from sklearn.utils import check_random_state

from aeon.clustering.base import BaseClusterer

Expand All @@ -22,10 +23,17 @@ class DummyClusterer(BaseClusterer):
- "random": Assign clusters randomly.
- "uniform": Distribute clusters uniformly among samples.
- "single_cluster": Assign all samples to a single cluster.

n_clusters : int, default=3
The number of clusters to generate. This is relevant for "random"
and "uniform" strategies.
random_state : int, np.random.RandomState instance or None, default=None
Determines random number generation for centroid initialization.
Only used when `strategy` is "random".
If `int`, random_state is the seed used by the random number generator;
If `np.random.RandomState` instance,
random_state is the random number generator;
If `None`, the random number generator is the `RandomState` instance used
by `np.random`.

Attributes
----------
Expand All @@ -38,19 +46,19 @@ class DummyClusterer(BaseClusterer):
>>> import numpy as np
>>> X = np.array([[1, 2], [3, 4], [5, 6]])
>>> clusterer = DummyClusterer(strategy="uniform", n_clusters=2)
>>> clusterer._fit(X)
>>> clusterer.fit(X)
DummyClusterer(n_clusters=2, strategy='uniform')
>>> clusterer.labels_
array([0, 1, 0])
>>> clusterer._predict(X)
>>> clusterer.predict(X)
array([0, 1, 0])
"""

def __init__(self, strategy="random", n_clusters=3):
super().__init__()
def __init__(self, strategy="random", n_clusters=3, random_state=None):
self.strategy = strategy
self.n_clusters = n_clusters
self.labels_ = None
self.random_state = random_state

super().__init__(n_clusters=n_clusters)

def _fit(self, X, y=None):
"""
Expand All @@ -72,7 +80,8 @@ def _fit(self, X, y=None):
n_samples = X.shape[0]

if self.strategy == "random":
self.labels_ = np.random.randint(0, self.n_clusters, n_samples)
rng = check_random_state(self.random_state)
self.labels_ = rng.randint(self.n_clusters, size=n_samples)
elif self.strategy == "uniform":
self.labels_ = np.tile(
np.arange(self.n_clusters), n_samples // self.n_clusters + 1
Expand Down Expand Up @@ -103,7 +112,8 @@ def _predict(self, X, y=None) -> np.ndarray:
"""
n_samples = X.shape[0]
if self.strategy == "random":
return np.random.randint(0, self.n_clusters, n_samples)
rng = check_random_state(self.random_state)
return rng.randint(self.n_clusters, size=n_samples)
elif self.strategy == "uniform":
return np.tile(
np.arange(self.n_clusters), n_samples // self.n_clusters + 1
Expand Down
2 changes: 1 addition & 1 deletion aeon/clustering/feature_based/_catch22.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ def _predict_proba(self, X) -> np.ndarray:
n_clusters = self.n_clusters
if n_clusters is None:
n_clusters = int(max(preds)) + 1
dists = np.zeros((X.shape[0], n_clusters))
dists = np.zeros((len(X), n_clusters))
for i in range(n_cases):
dists[i, preds[i]] = 1
return dists
Expand Down
43 changes: 0 additions & 43 deletions aeon/clustering/tests/test_all_clusterers.py

This file was deleted.

84 changes: 84 additions & 0 deletions aeon/testing/estimator_checking/_yield_clustering_checks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""Tests for all clusterers."""

from functools import partial

import numpy as np

from aeon.base._base import _clone_estimator
from aeon.clustering.deep_learning import BaseDeepClusterer
from aeon.testing.testing_data import FULL_TEST_DATA_DICT


def _yield_clustering_checks(estimator_class, estimator_instances, datatypes):
    """Yield all clustering checks for an aeon clusterer."""
    # checks that need only the estimator class
    for class_check in (
        check_clusterer_tags_consistent,
        check_clusterer_does_not_override_final_methods,
    ):
        yield partial(class_check, estimator_class=estimator_class)

    # checks that operate on concrete instances
    for idx, inst in enumerate(estimator_instances):
        # deep clusterers get an extra seeding-determinism check;
        # the data type used is irrelevant, so take the first one
        if isinstance(inst, BaseDeepClusterer):
            yield partial(
                check_clustering_random_state_deep_learning,
                estimator=inst,
                datatype=datatypes[idx][0],
            )


def check_clusterer_tags_consistent(estimator_class):
    """Test all estimators capability tags reflect their capabilities."""
    # containers able to hold unequal-length collections
    unequal_capable_types = {"np-list", "df-list", "pd-multivariate", "nested_univ"}

    # if the class claims unequal-length support, at least one of its
    # X_inner_type containers must be able to store unequal-length data
    if estimator_class.get_class_tag("capability:unequal_length"):
        inner = estimator_class.get_class_tag("X_inner_type")
        if isinstance(inner, str):
            assert inner in unequal_capable_types
        else:  # list of type strings
            assert any(t in unequal_capable_types for t in inner)

    # if the class claims multivariate support, it must actually be able to
    # fit/predict on multivariate input
    if estimator_class.get_class_tag("capability:multivariate"):
        X = np.random.random((10, 2, 10))
        instance = estimator_class.create_test_instance(parameter_set="default")
        instance.fit(X)
        instance.predict(X)
        instance.predict_proba(X)


def check_clusterer_does_not_override_final_methods(estimator_class):
    """Test does not override final methods."""
    # fit/predict are final on BaseClusterer; subclasses implement _fit/_predict
    for final_method in ("fit", "predict"):
        assert final_method not in estimator_class.__dict__


def check_clustering_random_state_deep_learning(estimator, datatype):
    """Test Deep Clusterer seeding."""
    seed = 42
    train_X = FULL_TEST_DATA_DICT[datatype]["train"][0]

    # fit two clones with the same seed; their learned weights must match
    first = _clone_estimator(estimator, random_state=seed)
    first.fit(train_X)
    second = _clone_estimator(estimator, random_state=seed)
    second.fit(train_X)

    # skip the input layer, compare the trainable layers pairwise
    first_layers = first.training_model_.layers[1:]
    second_layers = second.training_model_.layers[1:]
    assert len(first_layers) == len(second_layers)

    for layer_a, layer_b in zip(first_layers, second_layers):
        weights_a = layer_a.get_weights()
        weights_b = layer_b.get_weights()
        assert len(weights_a) == len(weights_b)

        for w_a, w_b in zip(weights_a, weights_b):
            np.testing.assert_almost_equal(np.asarray(w_a), np.asarray(w_b), 4)
9 changes: 9 additions & 0 deletions aeon/testing/estimator_checking/_yield_estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,16 @@
from aeon.base._base import _clone_estimator
from aeon.classification import BaseClassifier
from aeon.classification.deep_learning.base import BaseDeepClassifier
from aeon.clustering import BaseClusterer
from aeon.clustering.deep_learning.base import BaseDeepClusterer
from aeon.regression import BaseRegressor
from aeon.regression.deep_learning.base import BaseDeepRegressor
from aeon.testing.estimator_checking._yield_classification_checks import (
_yield_classification_checks,
)
from aeon.testing.estimator_checking._yield_clustering_checks import (
_yield_clustering_checks,
)
from aeon.testing.estimator_checking._yield_regression_checks import (
_yield_regression_checks,
)
Expand Down Expand Up @@ -88,6 +92,11 @@ def _yield_all_aeon_checks(
estimator_class, estimator_instances, datatypes
)

if issubclass(estimator_class, BaseClusterer):
yield from _yield_clustering_checks(
estimator_class, estimator_instances, datatypes
)


def _yield_estimator_checks(estimator_class, estimator_instances, datatypes):
"""Yield all general checks for an aeon estimator."""
Expand Down
2 changes: 1 addition & 1 deletion aeon/testing/test_all_estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def _all_estimators(self):
estimator_types=getattr(self, "estimator_type_filter", None),
return_names=False,
exclude_estimators=EXCLUDE_ESTIMATORS,
exclude_estimator_types=["classifier", "regressor"],
exclude_estimator_types=["classifier", "regressor", "clusterer"],
)

# subsample estimators by OS & python version
Expand Down
2 changes: 2 additions & 0 deletions aeon/testing/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@
# needs investigation
"SASTClassifier": ["check_fit_deterministic"],
"RSASTClassifier": ["check_fit_deterministic"],
"AEFCNClusterer": ["check_fit_updates_state"],
"AEResNetClusterer": ["check_fit_updates_state"],
}

# We use estimator tags in addition to class hierarchies to further distinguish
Expand Down
2 changes: 1 addition & 1 deletion aeon/testing/tests/test_all_estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from aeon.utils.sampling import random_partition

ALL_ESTIMATORS = all_estimators(
estimator_types=["classifier", "regressor"],
estimator_types=["classifier", "regressor", "clusterer"],
return_names=False,
)

Expand Down