From ab457786eafb4c4344d1fb8a345ce950eccc5f5e Mon Sep 17 00:00:00 2001 From: MatthewMiddlehurst Date: Mon, 8 Jul 2024 11:48:17 +0100 Subject: [PATCH 01/15] classification checks in progress --- .pre-commit-config.yaml | 2 +- aeon/base/tests/test_base_collection.py | 17 +- .../tests/test_random_state_deep_learning.py | 60 - .../tests/test_all_classifiers.py | 252 ---- aeon/classification/tests/test_base.py | 23 +- aeon/regression/tests/test_base.py | 11 +- .../estimator_checking/_estimator_checking.py | 84 +- .../_yield_classification_checks.py | 290 +++++ .../_yield_estimator_checks.py | 50 +- .../tests/test_check_estimator.py | 8 + aeon/testing/test_all_estimators.py | 1 + aeon/testing/testing_data.py | 1155 ++++++++++++----- aeon/testing/tests/test_all_estimators.py | 15 + aeon/testing/{ => tests}/test_softdeps.py | 0 aeon/testing/tests/test_testing_data.py | 266 +++- aeon/testing/utils/estimator_checks.py | 16 +- .../tests/test_convert_collection.py | 59 +- .../utils/validation/tests/test_collection.py | 10 +- aeon/utils/validation/tests/test_input.py | 10 +- 19 files changed, 1541 insertions(+), 788 deletions(-) delete mode 100644 aeon/classification/deep_learning/tests/test_random_state_deep_learning.py delete mode 100644 aeon/classification/tests/test_all_classifiers.py create mode 100644 aeon/testing/estimator_checking/_yield_classification_checks.py create mode 100644 aeon/testing/tests/test_all_estimators.py rename aeon/testing/{ => tests}/test_softdeps.py (100%) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 40a0378547..2ae880dcf6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -67,7 +67,7 @@ repos: additional_dependencies: [ isort==5.13.2 ] args: [ "--nbqa-dont-skip-bad-cells", "--profile=black", "--multi-line=3" ] - id: nbqa-black - additional_dependencies: [ black==24.2.0 ] + additional_dependencies: [ black==24.4.2 ] args: [ "--nbqa-dont-skip-bad-cells" ] - id: nbqa-flake8 additional_dependencies: [ flake8==7.0.0 ] diff --git a/aeon/base/tests/test_base_collection.py b/aeon/base/tests/test_base_collection.py index 5d044830b1..11df22fef6 100644 --- a/aeon/base/tests/test_base_collection.py +++ b/aeon/base/tests/test_base_collection.py @@ -4,7 +4,10 @@ import pytest from aeon.base import BaseCollectionEstimator -from aeon.testing.testing_data import EQUAL_LENGTH_UNIVARIATE, UNEQUAL_LENGTH_UNIVARIATE +from aeon.testing.testing_data import ( + EQUAL_LENGTH_UNIVARIATE_COLLECTION, + UNEQUAL_LENGTH_UNIVARIATE_COLLECTION, +) from aeon.utils import COLLECTIONS_DATA_TYPES from aeon.utils.validation import get_type @@ -12,7 +15,7 @@ @pytest.mark.parametrize("data", COLLECTIONS_DATA_TYPES) def test__get_metadata(data): """Test get meta data.""" - X = EQUAL_LENGTH_UNIVARIATE[data] + X = EQUAL_LENGTH_UNIVARIATE_COLLECTION[data] meta = BaseCollectionEstimator._get_metadata(X) assert not meta["multivariate"] assert not meta["missing_values"] @@ -66,7 +69,7 @@ def test__convert_X(internal_type, data): """ cls = BaseCollectionEstimator() # Equal length should default to numpy3D - X = EQUAL_LENGTH_UNIVARIATE[data] + X = EQUAL_LENGTH_UNIVARIATE_COLLECTION[data] cls.metadata_ = cls._check_X(X) X2 = cls._convert_X(X) assert get_type(X2) == cls.get_tag("X_inner_type") @@ -86,11 +89,11 @@ def test__convert_X(internal_type, data): cls.set_tags(**{"X_inner_type": ["nested_univ", internal_type]}) X2 = cls._convert_X(X) assert get_type(X2) == internal_type - if data in UNEQUAL_LENGTH_UNIVARIATE.keys(): - if internal_type in UNEQUAL_LENGTH_UNIVARIATE.keys(): + 
if data in UNEQUAL_LENGTH_UNIVARIATE_COLLECTION.keys(): + if internal_type in UNEQUAL_LENGTH_UNIVARIATE_COLLECTION.keys(): cls.set_tags(**{"capability:unequal_length": True}) cls.set_tags(**{"X_inner_type": ["nested_univ", "np-list", internal_type]}) - X = UNEQUAL_LENGTH_UNIVARIATE[data] + X = UNEQUAL_LENGTH_UNIVARIATE_COLLECTION[data] X2 = cls._convert_X(X) assert get_type(X2) == "np-list" @@ -98,7 +101,7 @@ def test__convert_X(internal_type, data): @pytest.mark.parametrize("data", COLLECTIONS_DATA_TYPES) def test_preprocess_collection(data): """Test the functionality for preprocessing fit.""" - data = EQUAL_LENGTH_UNIVARIATE[data] + data = EQUAL_LENGTH_UNIVARIATE_COLLECTION[data] cls = BaseCollectionEstimator() X = cls._preprocess_collection(data) assert cls._n_jobs == 1 diff --git a/aeon/classification/deep_learning/tests/test_random_state_deep_learning.py b/aeon/classification/deep_learning/tests/test_random_state_deep_learning.py deleted file mode 100644 index e7c1f4c962..0000000000 --- a/aeon/classification/deep_learning/tests/test_random_state_deep_learning.py +++ /dev/null @@ -1,60 +0,0 @@ -"""Unit tests for classifiers deep learning random_state functionality.""" - -import inspect - -import numpy as np -import pytest - -from aeon.classification import deep_learning -from aeon.testing.data_generation import make_example_3d_numpy -from aeon.utils.validation._dependencies import _check_soft_dependencies - -__maintainer__ = ["hadifawaz1999"] - - -@pytest.mark.skipif( - not _check_soft_dependencies(["tensorflow"], severity="none"), - reason="skip test if required soft dependency not available", -) -def test_random_state_deep_learning_cls(): - """Test Deep Classifier seeding.""" - random_state = 42 - - X, y = make_example_3d_numpy(random_state=random_state) - - deep_cls_classes = [ - member[1] for member in inspect.getmembers(deep_learning, inspect.isclass) - ] - - for i in range(len(deep_cls_classes)): - if ( - "BaseDeepClassifier" in str(deep_cls_classes[i]) - or "InceptionTimeClassifier" in str(deep_cls_classes[i]) - or "LITETimeClassifier" in str(deep_cls_classes[i]) - or "TapNetClassifier" in str(deep_cls_classes[i]) - ): - continue - - deep_cls1 = deep_cls_classes[i](random_state=random_state, n_epochs=4) - deep_cls1.fit(X, y) - - layers1 = deep_cls1.training_model_.layers[1:] - - deep_cls2 = deep_cls_classes[i](random_state=random_state, n_epochs=4) - deep_cls2.fit(X, y) - - layers2 = deep_cls2.training_model_.layers[1:] - - assert len(layers1) == len(layers2) - - for i in range(len(layers1)): - weights1 = layers1[i].get_weights() - weights2 = layers2[i].get_weights() - - assert len(weights1) == len(weights2) - - for j in range(len(weights1)): - _weight1 = np.asarray(weights1[j]) - _weight2 = np.asarray(weights2[j]) - - assert np.array_equal(_weight1, _weight2) diff --git a/aeon/classification/tests/test_all_classifiers.py b/aeon/classification/tests/test_all_classifiers.py deleted file mode 100644 index 9219b8c541..0000000000 --- a/aeon/classification/tests/test_all_classifiers.py +++ /dev/null @@ -1,252 +0,0 @@ -"""Unit tests for classifier/regressor input output.""" - -__maintainer__ = [] - -import inspect -from sys import platform - -import numpy as np -from sklearn.utils._testing import set_random_state - -from aeon.datasets import load_basic_motions, load_unit_test -from aeon.testing.expected_results.expected_classifier_outputs import ( - basic_motions_proba, - unit_test_proba, -) -from aeon.testing.test_all_estimators import BaseFixtureGenerator, QuickTester -from 
aeon.testing.utils.estimator_checks import _assert_array_almost_equal -from aeon.testing.utils.scenarios_classification import ClassifierFitPredict -from aeon.utils.validation import get_n_cases - - -class ClassifierFixtureGenerator(BaseFixtureGenerator): - """Fixture generator for classifier tests. - - Fixtures parameterized - ---------------------- - estimator_class: estimator inheriting from BaseObject - ranges over estimator classes not excluded by EXCLUDE_ESTIMATORS, EXCLUDED_TESTS - estimator_instance: instance of estimator inheriting from BaseObject - ranges over estimator classes not excluded by EXCLUDE_ESTIMATORS, EXCLUDED_TESTS - instances are generated by create_test_instance class method - scenario: instance of TestScenario - ranges over all scenarios returned by retrieve_scenarios - """ - - # note: this should be separate from TestAllClassifiers - # additional fixtures, parameters, etc should be added here - # Classifiers should contain the tests only - - estimator_type_filter = "classifier" - - -class TestAllClassifiers(ClassifierFixtureGenerator, QuickTester): - """Module level tests for all aeon classifiers.""" - - def test_classifier_output(self, estimator_instance, scenario): - """Test classifier outputs the correct data types and values. - - Test predict produces a np.array or pd.Series with only values seen in the train - data, and that predict_proba probability estimates add up to one. - """ - n_classes = scenario.get_tag("n_classes") - X = scenario.args["predict"]["X"] - y = scenario.args["fit"]["y"] - n_cases = get_n_cases(X) - - # run fit and predict - y_pred = scenario.run(estimator_instance, method_sequence=["fit", "predict"]) - - # check predict - assert isinstance(y_pred, np.ndarray) - assert y_pred.shape == (n_cases,) - assert np.all(np.isin(np.unique(y_pred), np.unique(y))) - - # check predict proba (all classifiers have predict_proba by default) - y_proba = scenario.run(estimator_instance, method_sequence=["predict_proba"]) - assert isinstance(y_proba, np.ndarray) - assert y_proba.shape == (n_cases, n_classes) - np.testing.assert_almost_equal(y_proba.sum(axis=1), 1, decimal=4) - - def test_classifier_against_expected_results(self, estimator_class): - """Test classifier against stored results.""" - # we only use the first estimator instance for testing - classname = estimator_class.__name__ - - # We cannot guarantee same results on ARM macOS - if platform == "darwin": - return None - - # the test currently fails when numba is disabled. 
See issue #622 - import os - - if classname == "HIVECOTEV2" and os.environ.get("NUMBA_DISABLE_JIT") == "1": - return None - - for data_name, data_dict, data_loader, data_seed in [ - ["UnitTest", unit_test_proba, load_unit_test, 0], - ["BasicMotions", basic_motions_proba, load_basic_motions, 4], - ]: - # retrieve expected predict_proba output, and skip test if not available - if classname in data_dict.keys(): - expected_probas = data_dict[classname] - else: - # skip test if no expected probas are registered - continue - - # we only use the first estimator instance for testing - estimator_instance = estimator_class.create_test_instance( - parameter_set="results_comparison" - ) - # set random seed if possible - set_random_state(estimator_instance, 0) - - # load test data - X_train, y_train = data_loader(split="train") - X_test, _ = data_loader(split="test") - indices = np.random.RandomState(data_seed).choice( - len(y_train), 10, replace=False - ) - - # train classifier and predict probas - estimator_instance.fit(X_train[indices], y_train[indices]) - y_proba = estimator_instance.predict_proba(X_test[indices]) - - # assert probabilities are the same - _assert_array_almost_equal( - y_proba, - expected_probas, - decimal=2, - err_msg=f"Failed to reproduce results for {classname} on {data_name}", - ) - - def test_contracted_classifier(self, estimator_class): - """Test classifiers that can be contracted.""" - if estimator_class.get_class_tag(tag_name="capability:contractable") is True: - # if we have a contracting parameter set use it, else use default - estimator_instance = estimator_class.create_test_instance( - parameter_set="contracting" - ) - - default_params = inspect.signature(estimator_class.__init__).parameters - - # check that the classifier has a time_limit_in_minutes parameter - if default_params.get("time_limit_in_minutes", None) is None: - raise ValueError( - f"Classifier {estimator_class} which sets " - "capability:contractable=True must have a time_limit_in_minutes " - "parameter." - ) - - # check that the default value is to turn off contracting - if default_params.get("time_limit_in_minutes", None).default not in ( - 0, - -1, - None, - ): - raise ValueError( - "time_limit_in_minutes parameter must have a default value of 0, " - "-1 or None, disabling contracting by default." - ) - - # too short of a contract time can lead to test failures - if vars(estimator_instance).get("time_limit_in_minutes", None) < 0.5: - raise ValueError( - "Test parameters for test_contracted_classifier must set " - "time_limit_in_minutes to 0.5 or more. It is recommended to make " - "this larger and add an alternative stopping mechanism " - "(i.e. max ensemble members)." 
- ) - - scenario = ClassifierFitPredict() - - X_new = scenario.args["predict"]["X"] - y_train = scenario.args["fit"]["y"] - X_new_instances = get_n_cases(X_new) - - # run fit and predict - y_pred = scenario.run( - estimator_instance, method_sequence=["fit", "predict"] - ) - - # check predict - assert isinstance(y_pred, np.ndarray) - assert y_pred.shape == (X_new_instances,) - assert np.all(np.isin(np.unique(y_pred), np.unique(y_train))) - else: - # skip test if it can't contract - return None - - def test_classifier_train_estimate(self, estimator_class): - """Test classifiers that can produce train set probability estimates.""" - if estimator_class.get_class_tag(tag_name="capability:train_estimate") is True: - # if we have a train_estimate parameter set use it, else use default - estimator_instance = estimator_class.create_test_instance( - parameter_set="train_estimate" - ) - - if ( - "_fit_predict" not in estimator_class.__dict__ - or "_fit_predict_proba" not in estimator_class.__dict__ - ): - raise ValueError( - f"Classifier {estimator_class} has capability:train_estimate=True " - "and must override the _fit_predict and _fit_predict_proba methods." - ) - - scenario = ClassifierFitPredict() - n_classes = scenario.get_tag("n_classes") - X_train = scenario.args["fit"]["X"] - y_train = scenario.args["fit"]["y"] - X_train_len = get_n_cases(X_train) - - # check the predictions are valid - train_preds = estimator_instance.fit_predict(X_train, y_train) - assert isinstance(train_preds, np.ndarray) - assert train_preds.shape == (X_train_len,) - assert np.all(np.isin(np.unique(train_preds), np.unique(y_train))) - - # check the probabilities are valid - train_proba = estimator_instance.fit_predict_proba(X_train, y_train) - assert isinstance(train_proba, np.ndarray) - assert train_proba.shape == (X_train_len, n_classes) - np.testing.assert_almost_equal(train_proba.sum(axis=1), 1, decimal=4) - else: - # skip test if it can't produce an estimate - return None - - def test_classifier_tags_consistent(self, estimator_class): - """Test the tag X_inner_type is consistent with capability:unequal_length.""" - valid_types = {"np-list", "df-list", "pd-multivariate", "nested_univ"} - unequal = estimator_class.get_class_tag("capability:unequal_length") - if unequal: # one of X_inner_types must be capable of storing unequal length - internal_types = estimator_class.get_class_tag("X_inner_type") - if isinstance(internal_types, str): - assert internal_types in valid_types - else: # must be a list - assert bool(set(internal_types) & valid_types) - # Test can actually fit/predict with multivariate if tag is set - multivariate = estimator_class.get_class_tag("capability:multivariate") - if multivariate: - X = np.random.random((10, 2, 20)) - y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1]) - inst = estimator_class.create_test_instance(parameter_set="default") - inst.fit(X, y) - inst.predict(X) - inst.predict_proba(X) - - def test_does_not_override_final_methods(self, estimator_class): - """Test does not override final methods.""" - final_methods = [ - "fit", - "predict", - "predict_proba", - "fit_predict", - "fit_predict_proba", - ] - for method in final_methods: - if method in estimator_class.__dict__: - raise ValueError( - f"Classifier {estimator_class} overrides the method {method}. " - f"Override _{method} instead." 
- ) diff --git a/aeon/classification/tests/test_base.py b/aeon/classification/tests/test_base.py index 8a27ddef6f..d8e0533adf 100644 --- a/aeon/classification/tests/test_base.py +++ b/aeon/classification/tests/test_base.py @@ -1,7 +1,6 @@ """Unit tests for classifier base class functionality.""" import numpy as np -import numpy.random import pandas as pd import pytest from sklearn.metrics import accuracy_score @@ -12,9 +11,9 @@ MockClassifierPredictProba, ) from aeon.testing.testing_data import ( - EQUAL_LENGTH_MULTIVARIATE, - EQUAL_LENGTH_UNIVARIATE, - UNEQUAL_LENGTH_UNIVARIATE, + EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION, + EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION, + UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION, ) from aeon.utils import COLLECTIONS_DATA_TYPES @@ -132,11 +131,11 @@ def test_check_y(): cls._check_y(y, 10) -@pytest.mark.parametrize("data", UNEQUAL_LENGTH_UNIVARIATE.keys()) +@pytest.mark.parametrize("data", UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION.keys()) def test_unequal_length_input(data): """Test with unequal length failures and passes.""" - X = UNEQUAL_LENGTH_UNIVARIATE[data] - y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) + X = UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[data]["train"][0] + y = UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[data]["train"][1] # Unable to handle unequal length series dummy = MockClassifier() @@ -151,8 +150,8 @@ def test_unequal_length_input(data): @pytest.mark.parametrize("data", COLLECTIONS_DATA_TYPES) def test_univariate_equal_length_input(data): """Test with unequal length failures and passes.""" - X = EQUAL_LENGTH_UNIVARIATE[data] - y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) + X = EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[data]["train"][0] + y = EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[data]["train"][1] # Default capabilities dummy = MockClassifier() @@ -163,11 +162,11 @@ def test_univariate_equal_length_input(data): _assert_fit_and_predict(dummy, X, y) -@pytest.mark.parametrize("data", EQUAL_LENGTH_MULTIVARIATE.keys()) +@pytest.mark.parametrize("data", EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION.keys()) def test_multivariate_equal_length_input(data): """Test with unequal length failures and passes.""" - X = EQUAL_LENGTH_MULTIVARIATE[data] - y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) + X = EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[data]["train"][0] + y = EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[data]["train"][1] # Unable to handle multivariate series dummy = MockClassifier() diff --git a/aeon/regression/tests/test_base.py b/aeon/regression/tests/test_base.py index cf8c7e54d4..88de3ae6eb 100644 --- a/aeon/regression/tests/test_base.py +++ b/aeon/regression/tests/test_base.py @@ -8,7 +8,10 @@ from aeon.datasets import load_covid_3month from aeon.regression.base import BaseRegressor from aeon.regression.dummy import DummyRegressor -from aeon.testing.testing_data import EQUAL_LENGTH_UNIVARIATE, UNEQUAL_LENGTH_UNIVARIATE +from aeon.testing.testing_data import ( + EQUAL_LENGTH_UNIVARIATE_COLLECTION, + UNEQUAL_LENGTH_UNIVARIATE_COLLECTION, +) from aeon.utils import COLLECTIONS_DATA_TYPES @@ -107,9 +110,9 @@ def test__check_y(): @pytest.mark.parametrize("data", COLLECTIONS_DATA_TYPES) def test_unequal_length_input(data): """Test with unequal length failures and passes.""" - if data in UNEQUAL_LENGTH_UNIVARIATE.keys(): + if data in UNEQUAL_LENGTH_UNIVARIATE_COLLECTION.keys(): dummy = _TestRegressor() - X = UNEQUAL_LENGTH_UNIVARIATE[data] + X = UNEQUAL_LENGTH_UNIVARIATE_COLLECTION[data] y = np.random.random(size=10) with pytest.raises(ValueError, match=r"cannot 
handle unequal length series"): dummy.fit(X, y) @@ -121,7 +124,7 @@ def test_unequal_length_input(data): def test_equal_length_input(data): """Test with unequal length failures and passes.""" dummy = _TestRegressor() - X = EQUAL_LENGTH_UNIVARIATE[data] + X = EQUAL_LENGTH_UNIVARIATE_COLLECTION[data] y = np.random.random(size=10) _assert_fit_predict(dummy, X, y) dummy = _TestHandlesAllInput() diff --git a/aeon/testing/estimator_checking/_estimator_checking.py b/aeon/testing/estimator_checking/_estimator_checking.py index da698ddc8a..78edb2f4e0 100644 --- a/aeon/testing/estimator_checking/_estimator_checking.py +++ b/aeon/testing/estimator_checking/_estimator_checking.py @@ -71,24 +71,29 @@ class is passed. def checks_generator(): for est in estimators: - if isclass(est): - if issubclass(est, BaseEstimator): - est = est.create_test_instance(return_first=use_first_parameter_set) - else: + has_dependencies = _check_estimator_deps(est, severity="none") + + if has_dependencies: + if isclass(est): + if issubclass(est, BaseEstimator): + est = est.create_test_instance( + return_first=use_first_parameter_set + ) + else: + raise TypeError( + f"Passed class {est} is not a subclass of BaseEstimator." + ) + elif not isinstance(est, BaseEstimator): raise TypeError( - f"Passed class {est} is not a subclass of BaseEstimator." + f"Passed object {est} is not an instance of BaseEstimator." ) - elif not isinstance(est, BaseEstimator): - raise TypeError( - f"Passed object {est} is not an instance of BaseEstimator." - ) if not isinstance(est, list): est = [est] for e in est: for check in _yield_all_aeon_checks(e): - yield _check_if_xfail(e, check) + yield _check_if_xfail(e, check, has_dependencies) return pytest.mark.parametrize( "estimator, check", @@ -119,7 +124,7 @@ def check_estimator( Parameters ---------- - estimator : aeon BaseEstimator instances or classes + estimator : aeon BaseEstimator instance or class Estimator to run checks on. If estimator is a class, an instance will be created using BaseEstimator.create_test_instance(). raise_exceptions : bool, optional, default=False @@ -193,22 +198,27 @@ class is passed. def checks_generator(): est = estimator - if isclass(est): - if issubclass(est, BaseEstimator): - est = est.create_test_instance(return_first=use_first_parameter_set) - else: + has_dependencies = _check_estimator_deps(est, severity="none") + + if has_dependencies: + if isclass(est): + if issubclass(est, BaseEstimator): + est = est.create_test_instance(return_first=use_first_parameter_set) + else: + raise TypeError( + f"Passed class {est} is not a subclass of BaseEstimator." + ) + elif not isinstance(est, BaseEstimator): raise TypeError( - f"Passed class {est} is not a subclass of BaseEstimator." + f"Passed object {est} is not an instance of BaseEstimator." 
) - elif not isinstance(est, BaseEstimator): - raise TypeError(f"Passed object {est} is not an instance of BaseEstimator.") - if not isinstance(est, list): - est = [est] + if not isinstance(est, list): + est = [est] for e in est: for check in _yield_all_aeon_checks(e): - yield _check_if_skip(e, check) + yield _check_if_skip(e, check, has_dependencies) if not isinstance(checks_to_run, (list, tuple)) and checks_to_run is not None: checks_to_run = [checks_to_run] @@ -276,46 +286,50 @@ def checks_generator(): return results -def _check_if_xfail(estimator, check): +def _check_if_xfail(estimator, check, has_dependencies): """Check if a check should be xfailed.""" import pytest - skip, reason, _ = _should_be_skipped(estimator, check) + skip, reason, _ = _should_be_skipped(estimator, check, has_dependencies) if skip: return pytest.param(estimator, check, marks=pytest.mark.xfail(reason=reason)) return estimator, check -def _check_if_skip(estimator, check): +def _check_if_skip(estimator, check, has_dependencies): """Check if a check should be skipped by raising a SkipTest exception.""" - skip, reason, name = _should_be_skipped(estimator, check) + skip, reason, check_name = _should_be_skipped(estimator, check, has_dependencies) if skip: @wraps(check) def wrapped(*args, **kwargs): - raise SkipTest( - f"Skipping {name} for {estimator.__class__.__name__}: {reason}" + est_name = ( + estimator.__name__ + if isclass(estimator) + else estimator.__class__.__name__ ) + raise SkipTest(f"Skipping {check_name} for {est_name}: {reason}") return estimator, wrapped return estimator, check -def _should_be_skipped(estimator, check): - est_name = estimator.__class__.__name__ +def _should_be_skipped(estimator, check, has_dependencies): + est_name = ( + estimator.__name__ if isclass(estimator) else estimator.__class__.__name__ + ) + check_name = check.func.__name__ if isinstance(check, partial) else check.__name__ # check estimator dependencies - if not _check_estimator_deps(estimator, severity=None): - return True, "Incompatible dependencies or Python version" - - check_name = check.func.__name__ if isinstance(check, partial) else check.__name__ + if not has_dependencies: + return True, "Incompatible dependencies or Python version", check_name # check aeon exclude lists if est_name in EXCLUDE_ESTIMATORS: - return True, "In aeon estimator exclude list" + return True, "In aeon estimator exclude list", check_name elif check_name in EXCLUDED_TESTS.get(est_name, []): - return True, "In aeon test exclude list for estimator" + return True, "In aeon test exclude list for estimator", check_name return False, "", check_name diff --git a/aeon/testing/estimator_checking/_yield_classification_checks.py b/aeon/testing/estimator_checking/_yield_classification_checks.py new file mode 100644 index 0000000000..6f515ba3c6 --- /dev/null +++ b/aeon/testing/estimator_checking/_yield_classification_checks.py @@ -0,0 +1,290 @@ +import inspect +from functools import partial +from sys import platform + +import numpy as np +from sklearn.utils._testing import set_random_state + +from aeon.base._base import _clone_estimator +from aeon.classification.deep_learning import BaseDeepClassifier +from aeon.datasets import load_basic_motions, load_unit_test +from aeon.testing.expected_results.expected_classifier_outputs import ( + basic_motions_proba, + unit_test_proba, +) +from aeon.testing.testing_data import FULL_TEST_DATA_DICT +from aeon.testing.utils.estimator_checks import _assert_array_almost_equal, _get_tag +from aeon.utils.validation import 
get_n_cases
+
+
+def _yield_classification_checks(estimator, datatypes):
+    """Yield all classification checks for an aeon classifier."""
+    # no data needed
+    yield test_classifier_against_expected_results
+    yield test_classifier_tags_consistent
+    yield test_does_not_override_final_methods
+
+    # data type irrelevant
+    if _get_tag(estimator, "capability:contractable"):
+        yield partial(test_contracted_classifier, datatype=datatypes[0])
+
+    if _get_tag(estimator, "capability:train_estimate"):
+        yield partial(test_classifier_train_estimate, datatype=datatypes[0])
+
+    if isinstance(estimator, BaseDeepClassifier):
+        yield partial(test_random_state_deep_learning_cls, datatype=datatypes[0])
+
+    # test all data types
+    for datatype in datatypes:
+        yield partial(test_classifier_output, datatype=datatype)
+
+
+def test_classifier_against_expected_results(estimator):
+    """Test classifier against stored results."""
+    # we only use the first estimator instance for testing
+    class_name = type(estimator).__name__
+
+    # We cannot guarantee same results on ARM macOS
+    if platform == "darwin":
+        return None
+
+    # the test currently fails when numba is disabled. See issue #622
+    import os
+
+    if class_name == "HIVECOTEV2" and os.environ.get("NUMBA_DISABLE_JIT") == "1":
+        return None
+
+    for data_name, data_dict, data_loader, data_seed in [
+        ["UnitTest", unit_test_proba, load_unit_test, 0],
+        ["BasicMotions", basic_motions_proba, load_basic_motions, 4],
+    ]:
+        # retrieve expected predict_proba output, and skip test if not available
+        if class_name in data_dict.keys():
+            expected_probas = data_dict[class_name]
+        else:
+            # skip test if no expected probas are registered
+            continue
+
+        # we only use the first estimator instance for testing
+        estimator_instance = estimator.create_test_instance(
+            parameter_set="results_comparison"
+        )
+        # set random seed if possible
+        set_random_state(estimator_instance, 0)
+
+        # load test data
+        X_train, y_train = data_loader(split="train")
+        X_test, _ = data_loader(split="test")
+        indices = np.random.RandomState(data_seed).choice(
+            len(y_train), 10, replace=False
+        )
+
+        # train classifier and predict probas
+        estimator_instance.fit(X_train[indices], y_train[indices])
+        y_proba = estimator_instance.predict_proba(X_test[indices])
+
+        # assert probabilities are the same
+        _assert_array_almost_equal(
+            y_proba,
+            expected_probas,
+            decimal=2,
+            err_msg=f"Failed to reproduce results for {class_name} on {data_name}",
+        )
+
+
+def test_classifier_tags_consistent(estimator):
+    """Test the tag X_inner_type is consistent with capability:unequal_length."""
+    estimator_class = type(estimator)
+    valid_types = {"np-list", "df-list", "pd-multivariate", "nested_univ"}
+    unequal = estimator_class.get_class_tag("capability:unequal_length")
+    if unequal:  # one of X_inner_types must be capable of storing unequal length
+        internal_types = estimator_class.get_class_tag("X_inner_type")
+        if isinstance(internal_types, str):
+            assert internal_types in valid_types
+        else:  # must be a list
+            assert bool(set(internal_types) & valid_types)
+    # Test can actually fit/predict with multivariate if tag is set
+    multivariate = estimator_class.get_class_tag("capability:multivariate")
+    if multivariate:
+        X = np.random.random((10, 2, 20))
+        y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1])
+        inst = estimator_class.create_test_instance(parameter_set="default")
+        inst.fit(X, y)
+        inst.predict(X)
+        inst.predict_proba(X)
+
+
+def test_does_not_override_final_methods(estimator):
+    """Test does not override final methods."""
+    estimator_class = type(estimator)
+    final_methods = [
+        "fit",
+        "predict",
+        "predict_proba",
+        "fit_predict",
+        "fit_predict_proba",
+    ]
+    for method in final_methods:
+        if method in estimator_class.__dict__:
+            raise ValueError(
+                f"Classifier {estimator_class} overrides the method {method}. "
+                f"Override _{method} instead."
+            )
+
+
+def test_contracted_classifier(estimator, datatype):
+    """Test classifiers that can be contracted."""
+    estimator_class = type(estimator)
+
+    default_params = inspect.signature(estimator_class.__init__).parameters
+
+    # check that the classifier has a time_limit_in_minutes parameter
+    if default_params.get("time_limit_in_minutes", None) is None:
+        raise ValueError(
+            f"Classifier {estimator_class} which sets "
+            "capability:contractable=True must have a time_limit_in_minutes "
+            "parameter."
+        )
+
+    # check that the default value is to turn off contracting
+    if default_params.get("time_limit_in_minutes", None).default not in (
+        0,
+        -1,
+        None,
+    ):
+        raise ValueError(
+            "time_limit_in_minutes parameter must have a default value of 0, "
+            "-1 or None, disabling contracting by default."
+        )
+
+    # too short of a contract time can lead to test failures
+    time_limit = vars(estimator).get("time_limit_in_minutes", None)
+    if time_limit is None or time_limit < 0.5:
+        raise ValueError(
+            "Test parameters for test_contracted_classifier must set "
+            "time_limit_in_minutes to 0.5 or more. It is recommended to make "
+            "this larger and add an alternative stopping mechanism "
+            "(i.e. max ensemble members)."
+        )
+
+    # run fit and predict
+    estimator.fit(
+        FULL_TEST_DATA_DICT[datatype]["train"][0],
+        FULL_TEST_DATA_DICT[datatype]["train"][1],
+    )
+    y_pred = estimator.predict(FULL_TEST_DATA_DICT[datatype]["test"][0])
+
+    # check predict
+    assert isinstance(y_pred, np.ndarray)
+    assert y_pred.shape == (get_n_cases(FULL_TEST_DATA_DICT[datatype]["test"][0]),)
+    assert np.all(
+        np.isin(np.unique(y_pred), np.unique(FULL_TEST_DATA_DICT[datatype]["train"][1]))
+    )
+
+
+def test_classifier_train_estimate(estimator, datatype):
+    """Test classifiers that can produce train set probability estimates."""
+    estimator = _clone_estimator(estimator)
+    estimator_class = type(estimator)
+
+    # check that the train estimate methods are overridden
+    if (
+        "_fit_predict" not in estimator_class.__dict__
+        or "_fit_predict_proba" not in estimator_class.__dict__
+    ):
+        raise ValueError(
+            f"Classifier {estimator_class} has capability:train_estimate=True "
+            "and must override the _fit_predict and _fit_predict_proba methods."
+        )
+
+    unique_labels = np.unique(FULL_TEST_DATA_DICT[datatype]["train"][1])
+
+    # check the predictions are valid
+    train_preds = estimator.fit_predict(
+        FULL_TEST_DATA_DICT[datatype]["train"][0],
+        FULL_TEST_DATA_DICT[datatype]["train"][1],
+    )
+    assert isinstance(train_preds, np.ndarray)
+    assert train_preds.shape == (
+        get_n_cases(FULL_TEST_DATA_DICT[datatype]["train"][0]),
+    )
+    assert np.all(np.isin(np.unique(train_preds), unique_labels))
+
+    # check the probabilities are valid
+    train_proba = estimator.fit_predict_proba(
+        FULL_TEST_DATA_DICT[datatype]["train"][0],
+        FULL_TEST_DATA_DICT[datatype]["train"][1],
+    )
+    assert isinstance(train_proba, np.ndarray)
+    assert train_proba.shape == (
+        get_n_cases(FULL_TEST_DATA_DICT[datatype]["train"][0]),
+        len(unique_labels),
+    )
+    np.testing.assert_almost_equal(train_proba.sum(axis=1), 1, decimal=4)
+
+
+def test_random_state_deep_learning_cls(estimator, datatype):
+    """Test Deep Classifier seeding."""
+    random_state = 42
+
+    deep_cls1 = _clone_estimator(estimator, random_state=random_state)
+    deep_cls1.fit(
+        FULL_TEST_DATA_DICT[datatype]["train"][0],
+        FULL_TEST_DATA_DICT[datatype]["train"][1],
+    )
+
+    layers1 = deep_cls1.training_model_.layers[1:]
+
+    deep_cls2 = _clone_estimator(estimator, random_state=random_state)
+    deep_cls2.fit(
+        FULL_TEST_DATA_DICT[datatype]["train"][0],
+        FULL_TEST_DATA_DICT[datatype]["train"][1],
+    )
+
+    layers2 = deep_cls2.training_model_.layers[1:]
+
+    assert len(layers1) == len(layers2)
+
+    for i in range(len(layers1)):
+        weights1 = layers1[i].get_weights()
+        weights2 = layers2[i].get_weights()
+
+        assert len(weights1) == len(weights2)
+
+        for j in range(len(weights1)):
+            _weight1 = np.asarray(weights1[j])
+            _weight2 = np.asarray(weights2[j])
+
+            assert np.array_equal(_weight1, _weight2)
+
+
+def test_classifier_output(estimator, datatype):
+    """Test classifier outputs the correct data types and values.
+
+    Test predict produces a np.array or pd.Series with only values seen in the train
+    data, and that predict_proba probability estimates add up to one.
+    """
+    estimator = _clone_estimator(estimator)
+
+    unique_labels = np.unique(FULL_TEST_DATA_DICT[datatype]["train"][1])
+
+    # run fit and predict
+    estimator.fit(
+        FULL_TEST_DATA_DICT[datatype]["train"][0],
+        FULL_TEST_DATA_DICT[datatype]["train"][1],
+    )
+    y_pred = estimator.predict(FULL_TEST_DATA_DICT[datatype]["test"][0])
+
+    # check predict
+    assert isinstance(y_pred, np.ndarray)
+    assert y_pred.shape == (get_n_cases(FULL_TEST_DATA_DICT[datatype]["test"][0]),)
+    assert np.all(np.isin(np.unique(y_pred), unique_labels))
+
+    # check predict proba (all classifiers have predict_proba by default)
+    y_proba = estimator.predict_proba(FULL_TEST_DATA_DICT[datatype]["test"][0])
+
+    assert isinstance(y_proba, np.ndarray)
+    assert y_proba.shape == (
+        get_n_cases(FULL_TEST_DATA_DICT[datatype]["test"][0]),
+        len(unique_labels),
+    )
+    np.testing.assert_almost_equal(y_proba.sum(axis=1), 1, decimal=4)
diff --git a/aeon/testing/estimator_checking/_yield_estimator_checks.py b/aeon/testing/estimator_checking/_yield_estimator_checks.py
index 88e308047d..fbc580745a 100644
--- a/aeon/testing/estimator_checking/_yield_estimator_checks.py
+++ b/aeon/testing/estimator_checking/_yield_estimator_checks.py
@@ -3,7 +3,7 @@
 import types
 from copy import deepcopy
 from functools import partial
-from inspect import getfullargspec, signature
+from inspect import getfullargspec, isclass, signature
 
 import joblib
 import numpy as np
@@ -13,24 +13,25 @@
 
 from aeon.base import BaseEstimator, BaseObject
 from aeon.base._base import _clone_estimator
+from aeon.classification import BaseClassifier
 from aeon.classification.deep_learning.base import BaseDeepClassifier
 from aeon.clustering.deep_learning.base import BaseDeepClusterer
 from aeon.regression.deep_learning.base import BaseDeepRegressor
+from aeon.testing.estimator_checking._yield_classification_checks import (
+    _yield_classification_checks,
+)
 from aeon.testing.test_config import (
     NON_STATE_CHANGING_METHODS,
     NON_STATE_CHANGING_METHODS_ARRAYLIKE,
     VALID_ESTIMATOR_BASE_TYPES,
     VALID_ESTIMATOR_TAGS,
 )
-from aeon.testing.testing_data import (
-    TEST_DATA_DICT,
-    TEST_LABEL_DICT,
-    get_data_types_for_estimator,
-)
+from aeon.testing.testing_data import FULL_TEST_DATA_DICT, _get_datatypes_for_estimator
 from aeon.testing.utils.deep_equals import deep_equals
 from aeon.testing.utils.estimator_checks import (
     _assert_array_almost_equal,
     _get_args,
+    _get_tag,
     _list_required_methods,
     _run_estimator_method,
 )
@@ -38,12 +39,21 @@
 
 
 def _yield_all_aeon_checks(estimator):
-    datatypes = get_data_types_for_estimator(estimator)
+    """Yield all checks for an aeon estimator."""
+    # if a class is passed, all tests are going to be skipped as we could not
+    # instantiate the class
+    datatypes = (
+        _get_datatypes_for_estimator(estimator) if not isclass(estimator) else [None]
+    )
 
     yield from _yield_estimator_checks(estimator, datatypes)
 
+    if isinstance(estimator, BaseClassifier):
+        yield from _yield_classification_checks(estimator, datatypes)
+
 
 def _yield_estimator_checks(estimator, datatypes):
+    """Yield all general checks for an aeon estimator."""
     # no data needed
     yield check_create_test_instance
     yield check_create_test_instances_and_names
@@ -70,17 +80,13 @@
     yield partial(check_non_state_changing_method, datatype=datatypes[0])
     yield partial(check_fit_updates_state, datatype=datatypes[0])
 
-    if not estimator.get_tag(
-        "fit_is_empty", tag_value_default=False, raise_error=False
-    ):
+    if not _get_tag(estimator, "fit_is_empty", default=False):
         yield partial(check_raises_not_fitted_error, datatype=datatypes[0])
 
-    if not estimator.get_tag("cant-pickle", tag_value_default=False, raise_error=False):
+    if not _get_tag(estimator, "cant-pickle", default=False):
         yield partial(test_persistence_via_pickle, datatype=datatypes[0])
 
-    if not estimator.get_tag(
-        "non-deterministic", tag_value_default=False, raise_error=False
-    ):
+    if not _get_tag(estimator, "non-deterministic", default=False):
         yield partial(check_fit_deterministic, datatype=datatypes[0])
@@ -452,25 +458,25 @@
     """
     estimator = _clone_estimator(estimator)
 
-    X = deepcopy(TEST_DATA_DICT[datatype[0]]["train"])
-    y = deepcopy(TEST_LABEL_DICT[datatype[1]]["train"])
+    X = deepcopy(FULL_TEST_DATA_DICT[datatype]["train"][0])
+    y = deepcopy(FULL_TEST_DATA_DICT[datatype]["train"][1])
 
     _run_estimator_method(estimator, "fit", datatype, "train")
 
-    assert deep_equals(X, TEST_DATA_DICT[datatype[0]]["train"]) and deep_equals(
-        y, TEST_LABEL_DICT[datatype[1]]["train"]
+    assert deep_equals(X, FULL_TEST_DATA_DICT[datatype]["train"][0]) and deep_equals(
+        y, FULL_TEST_DATA_DICT[datatype]["train"][1]
     ), f"Estimator: {type(estimator)} has side effects on arguments of fit"
 
     # dict_before = copy of dictionary of estimator before predict, post fit
     dict_before = estimator.__dict__.copy()
 
-    X = deepcopy(TEST_DATA_DICT[datatype[0]]["test"])
-    y = deepcopy(TEST_LABEL_DICT[datatype[1]]["test"])
+    X = deepcopy(FULL_TEST_DATA_DICT[datatype]["test"][0])
+    y = deepcopy(FULL_TEST_DATA_DICT[datatype]["test"][1])
 
     for method in NON_STATE_CHANGING_METHODS:
         if hasattr(estimator, method):
             _run_estimator_method(estimator, method, datatype, "test")
 
-    assert deep_equals(X, TEST_DATA_DICT[datatype[0]]["test"]) and deep_equals(
-        y, TEST_LABEL_DICT[datatype[1]]["test"]
+    assert deep_equals(X, FULL_TEST_DATA_DICT[datatype]["test"][0]) and deep_equals(
+        y, FULL_TEST_DATA_DICT[datatype]["test"][1]
     ), f"Estimator: {type(estimator)} has side effects on arguments of {method}"
 
     # dict_after = dictionary of estimator after predict and fit
diff --git a/aeon/testing/estimator_checking/tests/test_check_estimator.py b/aeon/testing/estimator_checking/tests/test_check_estimator.py
index 90d52135e6..7e81861e8f 100644
--- a/aeon/testing/estimator_checking/tests/test_check_estimator.py
+++ b/aeon/testing/estimator_checking/tests/test_check_estimator.py
@@ -15,6 +15,7 @@
     MockSegmenter,
 )
 from aeon.testing.mock_estimators._mock_anomaly_detectors import MockAnomalyDetector
+from aeon.testing.utils.deep_equals import deep_equals
 from aeon.transformations.collection import TimeSeriesScaler
 
 EXAMPLE_CLASSES = [
@@ -34,7 +35,9 @@ def test_parametrize_with_checks_classes(estimator, check):
     """Test parametrize_with_checks with class input."""
     assert isinstance(estimator, BaseEstimator)
     assert callable(check)
+    dict_before = estimator.__dict__.copy()
     check(estimator)
+    assert deep_equals(estimator.__dict__, dict_before)
 
 
 @parametrize_with_checks(
@@ -44,7 +47,9 @@ def test_parametrize_with_checks_instances(estimator, check):
     """Test parametrize_with_checks with estimator instance input."""
     assert isinstance(estimator, BaseEstimator)
     assert callable(check)
+    dict_before = estimator.__dict__.copy()
     check(estimator)
+    assert deep_equals(estimator.__dict__, dict_before)
 
 
 @pytest.mark.parametrize("estimator_class", EXAMPLE_CLASSES)
@@ -60,7 +65,10 @@
 
     # test that no exceptions are raised
     check_estimator(estimator_class, raise_exceptions=True, verbose=False)
+
+ dict_before = estimator.__dict__.copy() check_estimator(estimator, raise_exceptions=True, verbose=False) + assert deep_equals(estimator.__dict__, dict_before) def test_check_estimator_subset_tests(): diff --git a/aeon/testing/test_all_estimators.py b/aeon/testing/test_all_estimators.py index a6070c6175..a6dd584487 100644 --- a/aeon/testing/test_all_estimators.py +++ b/aeon/testing/test_all_estimators.py @@ -205,6 +205,7 @@ def _all_estimators(self): estimator_types=getattr(self, "estimator_type_filter", None), return_names=False, exclude_estimators=EXCLUDE_ESTIMATORS, + exclude_estimator_types=["classifier"], ) # subsample estimators by OS & python version diff --git a/aeon/testing/testing_data.py b/aeon/testing/testing_data.py index 72251baa31..8d1856d022 100644 --- a/aeon/testing/testing_data.py +++ b/aeon/testing/testing_data.py @@ -24,91 +24,735 @@ data_rng = np.random.RandomState(42) -X_collection, y_collection = make_example_3d_numpy( + +EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION = { + "numpy3D": { + "train": make_example_3d_numpy( + n_cases=10, + n_channels=1, + n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + "test": make_example_3d_numpy( + n_cases=5, + n_channels=1, + n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + }, + "numpy2D": { + "train": make_example_2d_numpy_collection( + n_cases=10, + n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + "test": make_example_2d_numpy_collection( + n_cases=5, + n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + }, + "np-list": { + "train": make_example_3d_numpy_list( + n_cases=10, + n_channels=1, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + "test": make_example_3d_numpy_list( + n_cases=5, + n_channels=1, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + }, + "df-list": { + "train": make_example_dataframe_list( + n_cases=10, + n_channels=1, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + "test": make_example_dataframe_list( + n_cases=5, + n_channels=1, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + }, + "pd-wide": { + "train": make_example_2d_dataframe_collection( + n_cases=10, + n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + "test": make_example_2d_dataframe_collection( + n_cases=5, + n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + }, + "nested_univ": { + "train": make_example_nested_dataframe( + n_cases=10, + n_channels=1, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + "test": make_example_nested_dataframe( + n_cases=5, + n_channels=1, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + }, + "pd-multiindex": { + "train": make_example_multi_index_dataframe( + n_cases=10, + n_channels=1, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + "test": make_example_multi_index_dataframe( + n_cases=5, + n_channels=1, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + }, +} + +EQUAL_LENGTH_UNIVARIATE_REGRESSION = { + "numpy3D": { + "train": make_example_3d_numpy( + n_cases=10, + 
n_channels=1, + n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + "test": make_example_3d_numpy( + n_cases=5, + n_channels=1, + n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + }, + "numpy2D": { + "train": make_example_2d_numpy_collection( + n_cases=10, + n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + "test": make_example_2d_numpy_collection( + n_cases=5, + n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + }, + "np-list": { + "train": make_example_3d_numpy_list( + n_cases=10, + n_channels=1, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + "test": make_example_3d_numpy_list( + n_cases=5, + n_channels=1, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + }, + "df-list": { + "train": make_example_dataframe_list( + n_cases=10, + n_channels=1, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + "test": make_example_dataframe_list( + n_cases=5, + n_channels=1, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + }, + "pd-wide": { + "train": make_example_2d_dataframe_collection( + n_cases=10, + n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + "test": make_example_2d_dataframe_collection( + n_cases=5, + n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + }, + "nested_univ": { + "train": make_example_nested_dataframe( + n_cases=10, + n_channels=1, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + "test": make_example_nested_dataframe( + n_cases=5, + n_channels=1, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + }, + "pd-multiindex": { + "train": make_example_multi_index_dataframe( + n_cases=10, + n_channels=1, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + "test": make_example_multi_index_dataframe( + n_cases=5, + n_channels=1, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + }, +} + +EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION = { + "numpy3D": { + "train": make_example_3d_numpy( + n_cases=10, + n_channels=2, + n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + "test": make_example_3d_numpy( + n_cases=5, + n_channels=2, + n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + }, + "np-list": { + "train": make_example_3d_numpy_list( + n_cases=10, + n_channels=2, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + "test": make_example_3d_numpy_list( + n_cases=5, + n_channels=2, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + }, + "df-list": { + "train": make_example_dataframe_list( + n_cases=10, + n_channels=2, + 
min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + "test": make_example_dataframe_list( + n_cases=5, + n_channels=2, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + }, + "nested_univ": { + "train": make_example_nested_dataframe( + n_cases=10, + n_channels=2, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + "test": make_example_nested_dataframe( + n_cases=5, + n_channels=2, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + }, + "pd-multiindex": { + "train": make_example_multi_index_dataframe( + n_cases=10, + n_channels=2, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + "test": make_example_multi_index_dataframe( + n_cases=5, + n_channels=2, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + }, +} + +EQUAL_LENGTH_MULTIVARIATE_REGRESSION = { + "numpy3D": { + "train": make_example_3d_numpy( + n_cases=10, + n_channels=2, + n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + "test": make_example_3d_numpy( + n_cases=5, + n_channels=2, + n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + }, + "np-list": { + "train": make_example_3d_numpy_list( + n_cases=10, + n_channels=2, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + "test": make_example_3d_numpy_list( + n_cases=5, + n_channels=2, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + }, + "df-list": { + "train": make_example_dataframe_list( + n_cases=10, + n_channels=2, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + "test": make_example_dataframe_list( + n_cases=5, + n_channels=2, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + }, + "nested_univ": { + "train": make_example_nested_dataframe( + n_cases=10, + n_channels=2, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + "test": make_example_nested_dataframe( + n_cases=5, + n_channels=2, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + }, + "pd-multiindex": { + "train": make_example_multi_index_dataframe( + n_cases=10, + n_channels=2, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + "test": make_example_multi_index_dataframe( + n_cases=5, + n_channels=2, + min_n_timepoints=20, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + }, +} + +UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION = { + "np-list": { + "train": make_example_3d_numpy_list( + n_cases=10, + n_channels=1, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + "test": make_example_3d_numpy_list( + n_cases=5, + n_channels=1, + min_n_timepoints=10, + 
max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + }, + "df-list": { + "train": make_example_dataframe_list( + n_cases=10, + n_channels=1, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + "test": make_example_dataframe_list( + n_cases=5, + n_channels=1, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + }, + "nested_univ": { + "train": make_example_nested_dataframe( + n_cases=10, + n_channels=1, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + "test": make_example_nested_dataframe( + n_cases=5, + n_channels=1, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + }, + "pd-multiindex": { + "train": make_example_multi_index_dataframe( + n_cases=10, + n_channels=1, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + "test": make_example_multi_index_dataframe( + n_cases=5, + n_channels=1, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + }, +} + +UNEQUAL_LENGTH_UNIVARIATE_REGRESSION = { + "np-list": { + "train": make_example_3d_numpy_list( + n_cases=10, + n_channels=1, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + "test": make_example_3d_numpy_list( + n_cases=5, + n_channels=1, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + }, + "df-list": { + "train": make_example_dataframe_list( + n_cases=10, + n_channels=1, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + "test": make_example_dataframe_list( + n_cases=5, + n_channels=1, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + }, + "nested_univ": { + "train": make_example_nested_dataframe( + n_cases=10, + n_channels=1, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + "test": make_example_nested_dataframe( + n_cases=5, + n_channels=1, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + }, + "pd-multiindex": { + "train": make_example_multi_index_dataframe( + n_cases=10, + n_channels=1, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + "test": make_example_multi_index_dataframe( + n_cases=5, + n_channels=1, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + }, +} + +UNEQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION = { + "np-list": { + "train": make_example_3d_numpy_list( + n_cases=10, + n_channels=2, + max_n_timepoints=20, + min_n_timepoints=10, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + "test": make_example_3d_numpy_list( + n_cases=5, + n_channels=2, + max_n_timepoints=20, + min_n_timepoints=10, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + }, + "df-list": { + "train": make_example_dataframe_list( + n_cases=10, + n_channels=2, + min_n_timepoints=10, + 
max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + "test": make_example_dataframe_list( + n_cases=5, + n_channels=2, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + }, + "nested_univ": { + "train": make_example_nested_dataframe( + n_cases=10, + n_channels=2, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + "test": make_example_nested_dataframe( + n_cases=5, + n_channels=2, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + }, + "pd-multiindex": { + "train": make_example_multi_index_dataframe( + n_cases=10, + n_channels=2, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + "test": make_example_multi_index_dataframe( + n_cases=5, + n_channels=2, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + ), + }, +} + +UNEQUAL_LENGTH_MULTIVARIATE_REGRESSION = { + "np-list": { + "train": make_example_3d_numpy_list( + n_cases=10, + n_channels=2, + max_n_timepoints=20, + min_n_timepoints=10, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + "test": make_example_3d_numpy_list( + n_cases=5, + n_channels=2, + max_n_timepoints=20, + min_n_timepoints=10, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + }, + "df-list": { + "train": make_example_dataframe_list( + n_cases=10, + n_channels=2, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + "test": make_example_dataframe_list( + n_cases=5, + n_channels=2, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + }, + "nested_univ": { + "train": make_example_nested_dataframe( + n_cases=10, + n_channels=2, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + "test": make_example_nested_dataframe( + n_cases=5, + n_channels=2, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + }, + "pd-multiindex": { + "train": make_example_multi_index_dataframe( + n_cases=10, + n_channels=2, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + "test": make_example_multi_index_dataframe( + n_cases=5, + n_channels=2, + min_n_timepoints=10, + max_n_timepoints=20, + random_state=data_rng.randint(np.iinfo(np.int32).max), + regression_target=True, + ), + }, +} + +X_classification_missing_train, y_classification_missing_train = make_example_3d_numpy( n_cases=10, n_channels=1, n_timepoints=20, random_state=data_rng.randint(np.iinfo(np.int32).max), ) -X_collection2, y_collection2 = make_example_3d_numpy( +X_classification_missing_test, y_classification_missing_test = make_example_3d_numpy( n_cases=5, n_channels=1, n_timepoints=20, random_state=data_rng.randint(np.iinfo(np.int32).max), ) -y_collection_r = y_collection.astype(np.float32) + data_rng.uniform( - size=y_collection.shape -) -y_collection2_r = y_collection2.astype(np.float32) + data_rng.uniform( - size=y_collection2.shape -) - -X_collection_mv, y_collection_mv = make_example_3d_numpy( - n_cases=10, - n_channels=2, - 
n_timepoints=20,
-    random_state=data_rng.randint(np.iinfo(np.int32).max),
-)
-X_collection_mv2, y_collection_mv2 = make_example_3d_numpy(
-    n_cases=5,
-    n_channels=2,
-    n_timepoints=20,
-    random_state=data_rng.randint(np.iinfo(np.int32).max),
-)
-y_collection_mv_r = y_collection.astype(np.float32) + data_rng.uniform(
-    size=y_collection.shape
-)
-y_collection_mv2_r = y_collection2.astype(np.float32) + data_rng.uniform(
-    size=y_collection2.shape
-)
+X_classification_missing_train[:, :, data_rng.choice(20, 2)] = np.nan
+X_classification_missing_test[:, :, data_rng.choice(20, 2)] = np.nan
 
-X_collection_ul, y_collection_ul = make_example_3d_numpy_list(
-    n_cases=10,
-    n_channels=1,
-    min_n_timepoints=10,
-    max_n_timepoints=20,
-    random_state=data_rng.randint(np.iinfo(np.int32).max),
-)
-X_collection_ul2, y_collection_ul2 = make_example_3d_numpy_list(
-    n_cases=5,
-    n_channels=1,
-    min_n_timepoints=10,
-    max_n_timepoints=20,
-    random_state=data_rng.randint(np.iinfo(np.int32).max),
-)
-y_collection_ul_r = y_collection.astype(np.float32) + data_rng.uniform(
-    size=y_collection.shape
-)
-y_collection_ul2_r = y_collection2.astype(np.float32) + data_rng.uniform(
-    size=y_collection2.shape
-)
+MISSING_VALUES_CLASSIFICATION = {
+    "train": (X_classification_missing_train, y_classification_missing_train),
+    "test": (X_classification_missing_test, y_classification_missing_test),
+}
 
-X_collection_mi, y_collection_mi = make_example_3d_numpy(
+X_regression_missing_train, y_regression_missing_train = make_example_3d_numpy(
     n_cases=10,
     n_channels=1,
     n_timepoints=20,
     random_state=data_rng.randint(np.iinfo(np.int32).max),
+    regression_target=True,
 )
-X_collection_mi2, y_collection_mi2 = make_example_3d_numpy(
+X_regression_missing_test, y_regression_missing_test = make_example_3d_numpy(
     n_cases=5,
     n_channels=1,
     n_timepoints=20,
     random_state=data_rng.randint(np.iinfo(np.int32).max),
+    regression_target=True,
 )
-X_collection_mi[:, :, data_rng.choice(10, 2)] = np.nan
-X_collection_mi2[:, :, data_rng.choice(10, 2)] = np.nan
-y_collection_mi_r = y_collection.astype(np.float32) + data_rng.uniform(
-    size=y_collection.shape
-)
-y_collection_mi2_r = y_collection2.astype(np.float32) + data_rng.uniform(
-    size=y_collection2.shape
-)
+X_regression_missing_train[:, :, data_rng.choice(20, 2)] = np.nan
+X_regression_missing_test[:, :, data_rng.choice(20, 2)] = np.nan
+
+MISSING_VALUES_REGRESSION = {
+    "train": (X_regression_missing_train, y_regression_missing_train),
+    "test": (X_regression_missing_test, y_regression_missing_test),
+}
 
 X_series = make_example_1d_numpy(
     n_timepoints=30, random_state=data_rng.randint(np.iinfo(np.int32).max)
 )
 X_series2 = X_series[20:30]
 X_series = X_series[:20]
+UNIVARIATE_SERIES_NOLABEL = {"train": (X_series, None), "test": (X_series2, None)}
 
 X_series_mv = make_example_2d_numpy_series(
     n_timepoints=30,
@@ -118,6 +762,10 @@
 )
 X_series_mv2 = X_series_mv[:, 20:30]
 X_series_mv = X_series_mv[:, :20]
+MULTIVARIATE_SERIES_NOLABEL = {
+    "train": (X_series_mv, None),
+    "test": (X_series_mv2, None),
+}
 
 X_series_mi = make_example_1d_numpy(
     n_timepoints=30, random_state=data_rng.randint(np.iinfo(np.int32).max)
 )
@@ -126,223 +774,134 @@
 X_series_mi2[data_rng.choice(10, 1)] = np.nan
 X_series_mi = X_series_mi[:20]
 X_series_mi[data_rng.choice(20, 2)] = np.nan
+MISSING_VALUES_NOLABEL = {"train": (X_series_mi, None), "test": (X_series_mi2, None)}
 
+FULL_TEST_DATA_DICT = {}
+# Collection
+FULL_TEST_DATA_DICT.update(
+    {
+        f"EqualLengthUnivariate-Classification-{k}": v
+        for 
k, v in EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION.items() + } +) +FULL_TEST_DATA_DICT.update( + { + f"EqualLengthUnivariate-Regression-{k}": v + for k, v in EQUAL_LENGTH_UNIVARIATE_REGRESSION.items() + } +) +FULL_TEST_DATA_DICT.update( + { + f"EqualLengthMultivariate-Classification-{k}": v + for k, v in EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION.items() + } +) +FULL_TEST_DATA_DICT.update( + { + f"EqualLengthMultivariate-Regression-{k}": v + for k, v in EQUAL_LENGTH_MULTIVARIATE_REGRESSION.items() + } +) +FULL_TEST_DATA_DICT.update( + { + f"UnequalLengthUnivariate-Classification-{k}": v + for k, v in UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION.items() + } +) +FULL_TEST_DATA_DICT.update( + { + f"UnequalLengthUnivariate-Regression-{k}": v + for k, v in UNEQUAL_LENGTH_UNIVARIATE_REGRESSION.items() + } +) +FULL_TEST_DATA_DICT.update( + { + f"UnequalLengthMultivariate-Classification-{k}": v + for k, v in UNEQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION.items() + } +) +FULL_TEST_DATA_DICT.update( + { + f"UnequalLengthMultivariate-Regression-{k}": v + for k, v in UNEQUAL_LENGTH_MULTIVARIATE_REGRESSION.items() + } +) +FULL_TEST_DATA_DICT.update( + {"MissingValues-Classification": MISSING_VALUES_CLASSIFICATION} +) +FULL_TEST_DATA_DICT.update({"MissingValues-Regression": MISSING_VALUES_REGRESSION}) +# Series +FULL_TEST_DATA_DICT.update({"UnivariateSeries-NoLabel": UNIVARIATE_SERIES_NOLABEL}) +FULL_TEST_DATA_DICT.update({"MultivariateSeries-NoLabel": MULTIVARIATE_SERIES_NOLABEL}) +FULL_TEST_DATA_DICT.update({"MissingValues-NoLabel": MISSING_VALUES_NOLABEL}) -TEST_DATA_DICT = { - "UnivariateCollection": {"train": X_collection, "test": X_collection2}, - "MultivariateCollection": {"train": X_collection_mv, "test": X_collection_mv2}, - "UnequalLengthCollection": {"train": X_collection_ul, "test": X_collection_ul2}, - "MissingValuesCollection": {"train": X_collection_mi, "test": X_collection_mi2}, - "UnivariateSeries": {"train": X_series, "test": X_series2}, - "MultivariateSeries": {"train": X_series_mv, "test": X_series_mv2}, - "MissingValuesSeries": {"train": X_series_mi, "test": X_series_mi2}, -} -TEST_LABEL_DICT = { - "Classification": { - "train": y_collection, - "test": y_collection2, - }, - "Regression": { - "train": y_collection_r, - "test": y_collection2_r, - }, - "Anomaly Detection": { - "train": None, - "test": None, - }, - "Segmentation": { - "train": None, - "test": None, - }, - "UnivariateCollectionClassification": { - "train": y_collection, - "test": y_collection2, - }, - "UnivariateCollectionRegression": { - "train": y_collection_r, - "test": y_collection2_r, - }, - "MultivariateCollectionClassification": { - "train": y_collection_mv, - "test": y_collection_mv2, - }, - "MultivariateCollectionRegression": { - "train": y_collection_mv_r, - "test": y_collection_mv2_r, - }, - "UnequalLengthCollectionClassification": { - "train": y_collection_ul, - "test": y_collection_ul2, - }, - "UnequalLengthCollectionRegression": { - "train": y_collection_ul_r, - "test": y_collection_ul2_r, - }, - "MissingValuesCollectionClassification": { - "train": y_collection_mi, - "test": y_collection_mi2, - }, - "MissingValuesCollectionRegression": { - "train": y_collection_mi_r, - "test": y_collection_mi2_r, - }, - None: { - "train": None, - "test": None, - }, -} -EQUAL_LENGTH_UNIVARIATE = { - "numpy3D": X_collection, - "numpy2D": make_example_2d_numpy_collection( - n_cases=10, - n_timepoints=20, - random_state=data_rng.randint(np.iinfo(np.int32).max), - return_y=False, - ), - "np-list": make_example_3d_numpy_list( - n_cases=10, - 
n_channels=1,
-        min_n_timepoints=20,
-        max_n_timepoints=20,
-        random_state=data_rng.randint(np.iinfo(np.int32).max),
-        return_y=False,
-    ),
-    "df-list": make_example_dataframe_list(
-        n_cases=10,
-        n_channels=1,
-        min_n_timepoints=20,
-        max_n_timepoints=20,
-        random_state=data_rng.randint(np.iinfo(np.int32).max),
-        return_y=False,
-    ),
-    "pd-wide": make_example_2d_dataframe_collection(
-        n_cases=10,
-        n_timepoints=20,
-        random_state=data_rng.randint(np.iinfo(np.int32).max),
-        return_y=False,
-    ),
-    "nested_univ": make_example_nested_dataframe(
-        n_cases=10,
-        n_channels=1,
-        min_n_timepoints=20,
-        max_n_timepoints=20,
-        random_state=data_rng.randint(np.iinfo(np.int32).max),
-        return_y=False,
-    ),
-    "pd-multiindex": make_example_multi_index_dataframe(
-        n_cases=10,
-        n_channels=1,
-        min_n_timepoints=20,
-        max_n_timepoints=20,
-        random_state=data_rng.randint(np.iinfo(np.int32).max),
-        return_y=False,
-    ),
-}
 
+def _get_datatypes_for_estimator(estimator):
+    """Get all data types for estimator.
 
-UNEQUAL_LENGTH_UNIVARIATE = {
-    "np-list": X_collection_ul,
-    "df-list": make_example_dataframe_list(
-        n_cases=10,
-        n_channels=1,
-        min_n_timepoints=10,
-        max_n_timepoints=20,
-        random_state=data_rng.randint(np.iinfo(np.int32).max),
-        return_y=False,
-    ),
-    "nested_univ": make_example_nested_dataframe(
-        n_cases=10,
-        n_channels=1,
-        min_n_timepoints=10,
-        max_n_timepoints=20,
-        random_state=data_rng.randint(np.iinfo(np.int32).max),
-        return_y=False,
-    ),
-    "pd-multiindex": make_example_multi_index_dataframe(
-        n_cases=10,
-        n_channels=1,
-        min_n_timepoints=10,
-        max_n_timepoints=20,
-        random_state=data_rng.randint(np.iinfo(np.int32).max),
-        return_y=False,
-    ),
-}
+    Parameters
+    ----------
+    estimator : BaseEstimator instance or class
+        Estimator instance or class to check for valid input data types.
+
+    Returns
+    -------
+    datatypes : list of str
+        List of valid data type keys for the estimator usable in
+        FULL_TEST_DATA_DICT.
+    """
+    datatypes = []
+    univariate, multivariate, unequal_length, missing_values = (
+        _get_capabilities_for_estimator(estimator)
+    )
+    label_type = _get_label_type_for_estimator(estimator)
+    inner_types = estimator.get_tag("X_inner_type")
+    if not isinstance(inner_types, list):
+        inner_types = [inner_types]
 
-EQUAL_LENGTH_MULTIVARIATE = {
-    "numpy3D": X_collection_mv,
-    "np-list": make_example_3d_numpy_list(
-        n_cases=10,
-        n_channels=2,
-        min_n_timepoints=20,
-        max_n_timepoints=20,
-        random_state=data_rng.randint(np.iinfo(np.int32).max),
-        return_y=False,
-    ),
-    "df-list": make_example_dataframe_list(
-        n_cases=10,
-        n_channels=2,
-        min_n_timepoints=20,
-        max_n_timepoints=20,
-        random_state=data_rng.randint(np.iinfo(np.int32).max),
-        return_y=False,
-    ),
-    "nested_univ": make_example_nested_dataframe(
-        n_cases=10,
-        n_channels=2,
-        min_n_timepoints=20,
-        max_n_timepoints=20,
-        random_state=data_rng.randint(np.iinfo(np.int32).max),
-        return_y=False,
-    ),
-    "pd-multiindex": make_example_multi_index_dataframe(
-        n_cases=10,
-        n_channels=2,
-        min_n_timepoints=20,
-        max_n_timepoints=20,
-        random_state=data_rng.randint(np.iinfo(np.int32).max),
-        return_y=False,
-    ),
-}
+    if isinstance(estimator, BaseCollectionEstimator):
+        for inner_type in inner_types:
+            if univariate:
+                s = f"EqualLengthUnivariate-{label_type}-{inner_type}"
+                if s in FULL_TEST_DATA_DICT:
+                    datatypes.append(s)
 
-UNEQUAL_LENGTH_MULTIVARIATE = {
-    "np-list": make_example_3d_numpy_list(
-        n_cases=10,
-        n_channels=2,
-        max_n_timepoints=20,
-        min_n_timepoints=10,
-        random_state=data_rng.randint(np.iinfo(np.int32).max),
-        return_y=False,
-    ),
+                if unequal_length:
+                    s = f"UnequalLengthUnivariate-{label_type}-{inner_type}"
+                    if s in FULL_TEST_DATA_DICT:
+                        datatypes.append(s)
+
+            if multivariate:
+                s = f"EqualLengthMultivariate-{label_type}-{inner_type}"
+                if s in FULL_TEST_DATA_DICT:
+                    datatypes.append(s)
-    "df-list": make_example_dataframe_list(
-        n_cases=10,
-        n_channels=2,
-        min_n_timepoints=10,
-        max_n_timepoints=20,
-        random_state=data_rng.randint(np.iinfo(np.int32).max),
-        return_y=False,
-    ),
-    "nested_univ": make_example_nested_dataframe(
-        n_cases=10,
-        n_channels=2,
-        min_n_timepoints=10,
-        max_n_timepoints=20,
-        random_state=data_rng.randint(np.iinfo(np.int32).max),
-        return_y=False,
-    ),
-    "pd-multiindex": make_example_multi_index_dataframe(
-        n_cases=10,
-        n_channels=2,
-        min_n_timepoints=10,
-        max_n_timepoints=20,
-        random_state=data_rng.randint(np.iinfo(np.int32).max),
-        return_y=False,
-    ),
-}
+                if unequal_length:
+                    s = f"UnequalLengthMultivariate-{label_type}-{inner_type}"
+                    if s in FULL_TEST_DATA_DICT:
+                        datatypes.append(s)
+    elif isinstance(estimator, BaseSeriesEstimator):
+        if univariate:
+            datatypes.append("UnivariateSeries-NoLabel")
+        if multivariate:
+            datatypes.append("MultivariateSeries-NoLabel")
+    else:
+        raise ValueError(f"Unknown estimator type: {type(estimator)}")
+
+    if missing_values:
+        datatypes.append(f"MissingValues-{label_type}")
+
+    if len(datatypes) == 0:
+        raise ValueError(f"No valid data types found for estimator {estimator}")
+
+    return datatypes
+
+
+def _get_capabilities_for_estimator(estimator):
+    """Get capabilities for estimator.
 
     Parameters
     ----------
@@ -351,21 +910,37 @@ def get_data_types_for_estimator(estimator):
 
     Returns
     -------
-    datatypes : list of str
-        List of valid data types for the estimator usable in TEST_DATA_DICT.
+    capabilities : tuple of bool
+        Tuple of booleans for the univariate, multivariate, unequal_length and
+        missing_values capabilities of the estimator.
     """
-    univariate = estimator.get_tag("capability:univariate", True, raise_error=False)
+    univariate = estimator.get_tag(
+        "capability:univariate", tag_value_default=True, raise_error=False
+    )
     multivariate = estimator.get_tag(
-        "capability:multivariate", False, raise_error=False
+        "capability:multivariate", tag_value_default=False, raise_error=False
     )
     unequal_length = estimator.get_tag(
-        "capability:unequal_length", False, raise_error=False
+        "capability:unequal_length", tag_value_default=False, raise_error=False
     )
     missing_values = estimator.get_tag(
-        "capability:missing_values", False, raise_error=False
+        "capability:missing_values", tag_value_default=False, raise_error=False
    )
 
-    datatypes = []
+    return univariate, multivariate, unequal_length, missing_values
+
+
+def _get_label_type_for_estimator(estimator):
+    """Get label type for estimator.
+
+    Parameters
+    ----------
+    estimator : BaseEstimator instance or class
+        Estimator instance or class to find the label type for.
 
+    Returns
+    -------
+    label_type : str
+        Label type key used in the FULL_TEST_DATA_DICT keys for the estimator.
+    """
     if (
         isinstance(estimator, BaseClassifier)
         or isinstance(estimator, BaseClusterer)
@@ -374,35 +949,13 @@ def get_data_types_for_estimator(estimator):
         label_type = "Classification"
     elif isinstance(estimator, BaseRegressor):
         label_type = "Regression"
-    elif isinstance(estimator, BaseAnomalyDetector):
-        label_type = "Anomaly Detection"
-    elif isinstance(estimator, BaseSegmenter):
-        label_type = "Segmentation"
-    elif isinstance(estimator, BaseSeriesTransformer):
-        label_type = None
-    else:
-        raise ValueError(f"Unknown estimator type: {type(estimator)}")
-
-    if isinstance(estimator, BaseCollectionEstimator):
-        if univariate:
-            datatypes.append(("UnivariateCollection", label_type))
-        if multivariate:
-            datatypes.append(("MultivariateCollection", label_type))
-        if unequal_length:
-            datatypes.append(("UnequalLengthCollection", label_type))
-        if missing_values:
-            datatypes.append(("MissingValuesCollection", label_type))
-    elif isinstance(estimator, BaseSeriesEstimator):
-        if univariate:
-            datatypes.append(("UnivariateSeries", label_type))
-        if multivariate:
-            datatypes.append(("MultivariateSeries", label_type))
-        if missing_values:
-            datatypes.append(("MissingValuesSeries", label_type))
+    elif (
+        isinstance(estimator, BaseAnomalyDetector)
+        or isinstance(estimator, BaseSegmenter)
+        or isinstance(estimator, BaseSeriesTransformer)
+    ):
+        label_type = "NoLabel"
     else:
         raise ValueError(f"Unknown estimator type: {type(estimator)}")
 
-    if len(datatypes) == 0:
-        raise ValueError(f"No valid data types found for estimator {estimator}")
-
-    return datatypes
+    return label_type
diff --git a/aeon/testing/tests/test_all_estimators.py b/aeon/testing/tests/test_all_estimators.py
new file mode 100644
index 0000000000..85aec503f0
--- /dev/null
+++ b/aeon/testing/tests/test_all_estimators.py
@@ -0,0 +1,15 @@
+"""Test all estimators in aeon."""
+
+from aeon.registry import all_estimators
+from aeon.testing.estimator_checking import parametrize_with_checks
+
+ALL_ESTIMATORS = all_estimators(
+    estimator_types=["classifier"],
+    return_names=False,
+)
+
+
+@parametrize_with_checks(ALL_ESTIMATORS)
+def test_all_estimators(estimator, check):
+    """Run general estimator checks on all aeon estimators."""
+    check(estimator)
diff --git a/aeon/testing/test_softdeps.py b/aeon/testing/tests/test_softdeps.py
similarity index 100%
rename from 
aeon/testing/test_softdeps.py rename to aeon/testing/tests/test_softdeps.py diff --git a/aeon/testing/tests/test_testing_data.py b/aeon/testing/tests/test_testing_data.py index 8086fba4d1..466d2581d4 100644 --- a/aeon/testing/tests/test_testing_data.py +++ b/aeon/testing/tests/test_testing_data.py @@ -1,14 +1,18 @@ """Tests for testing data dictionaries.""" import numpy as np +from sklearn.utils.multiclass import check_classification_targets from aeon.testing.testing_data import ( - EQUAL_LENGTH_MULTIVARIATE, - EQUAL_LENGTH_UNIVARIATE, - TEST_DATA_DICT, - TEST_LABEL_DICT, - UNEQUAL_LENGTH_MULTIVARIATE, - UNEQUAL_LENGTH_UNIVARIATE, + EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION, + EQUAL_LENGTH_MULTIVARIATE_REGRESSION, + EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION, + EQUAL_LENGTH_UNIVARIATE_REGRESSION, + FULL_TEST_DATA_DICT, + UNEQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION, + UNEQUAL_LENGTH_MULTIVARIATE_REGRESSION, + UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION, + UNEQUAL_LENGTH_UNIVARIATE_REGRESSION, ) from aeon.utils.validation import ( is_collection, @@ -20,60 +24,210 @@ def test_test_data_dict(): """Test the contents of the test data dictionary.""" - for key in TEST_DATA_DICT: - assert isinstance(TEST_DATA_DICT[key], dict) - assert len(TEST_DATA_DICT[key]) == 2 - assert "train" in TEST_DATA_DICT[key] - assert "test" in TEST_DATA_DICT[key] - assert is_collection(TEST_DATA_DICT[key]["train"]) or is_single_series( - TEST_DATA_DICT[key]["train"] - ) - assert is_collection(TEST_DATA_DICT[key]["test"]) or is_single_series( - TEST_DATA_DICT[key]["train"] - ) - - -def test_test_label_dict(): - """Test the contents of the test label dictionary.""" - for key in TEST_LABEL_DICT: - assert isinstance(TEST_LABEL_DICT[key], dict) - assert len(TEST_LABEL_DICT[key]) == 2 - assert "train" in TEST_LABEL_DICT[key] - assert "test" in TEST_LABEL_DICT[key] - if TEST_LABEL_DICT[key]["train"] is not None: - assert isinstance(TEST_LABEL_DICT[key]["train"], np.ndarray) - assert isinstance(TEST_LABEL_DICT[key]["test"], np.ndarray) - assert TEST_LABEL_DICT[key]["train"].ndim == 1 - assert TEST_LABEL_DICT[key]["test"].ndim == 1 - - -def test_equal_length_univariate(): - """Test the contents of the equal length univariate data dictionary.""" - for key in EQUAL_LENGTH_UNIVARIATE: - assert is_collection(EQUAL_LENGTH_UNIVARIATE[key], include_2d=True) - assert is_univariate(EQUAL_LENGTH_UNIVARIATE[key]) - assert is_equal_length(EQUAL_LENGTH_UNIVARIATE[key]) - - -def test_unequal_length_univariate(): + for key in FULL_TEST_DATA_DICT: + # format + assert isinstance(FULL_TEST_DATA_DICT[key], dict) + assert len(FULL_TEST_DATA_DICT[key]) == 2 + assert "train" in FULL_TEST_DATA_DICT[key] + assert "test" in FULL_TEST_DATA_DICT[key] + # data + assert is_collection(FULL_TEST_DATA_DICT[key]["train"][0]) or is_single_series( + FULL_TEST_DATA_DICT[key]["train"][0] + ) + assert is_collection(FULL_TEST_DATA_DICT[key]["test"][0]) or is_single_series( + FULL_TEST_DATA_DICT[key]["test"][0] + ) + # label + if FULL_TEST_DATA_DICT[key]["train"][1] is not None: + assert isinstance(FULL_TEST_DATA_DICT[key]["train"][1], np.ndarray) + assert isinstance(FULL_TEST_DATA_DICT[key]["test"][1], np.ndarray) + assert FULL_TEST_DATA_DICT[key]["train"][1].ndim == 1 + assert FULL_TEST_DATA_DICT[key]["test"][1].ndim == 1 + + +def test_equal_length_univariate_collection(): + """Test the contents of the equal length univariate data dictionaries.""" + for key in EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION: + assert is_collection( + 
EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["train"][0], include_2d=True + ) + assert is_univariate(EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["train"][0]) + assert is_equal_length(EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["train"][0]) + check_classification_targets( + EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["train"][1] + ) + + assert is_collection( + EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["test"][0], include_2d=True + ) + assert is_univariate(EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["test"][0]) + assert is_equal_length(EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["test"][0]) + check_classification_targets( + EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["test"][1] + ) + + for key in EQUAL_LENGTH_UNIVARIATE_REGRESSION: + assert is_collection( + EQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["train"][0], include_2d=True + ) + assert is_univariate(EQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["train"][0]) + assert is_equal_length(EQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["train"][0]) + assert np.issubdtype( + EQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["train"][1].dtype, np.integer + ) or np.issubdtype( + EQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["train"][1].dtype, np.floating + ) + + assert is_collection( + EQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["test"][0], include_2d=True + ) + assert is_univariate(EQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["test"][0]) + assert is_equal_length(EQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["test"][0]) + assert np.issubdtype( + EQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["test"][1].dtype, np.integer + ) or np.issubdtype( + EQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["test"][1].dtype, np.floating + ) + + +def test_unequal_length_univariate_collection(): """Test the contents of the unequal length univariate data dictionary.""" - for key in UNEQUAL_LENGTH_UNIVARIATE: - assert is_collection(UNEQUAL_LENGTH_UNIVARIATE[key]) - assert is_univariate(UNEQUAL_LENGTH_UNIVARIATE[key]) - assert not is_equal_length(UNEQUAL_LENGTH_UNIVARIATE[key]) + for key in UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION: + assert is_collection(UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["train"][0]) + assert is_univariate(UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["train"][0]) + assert not is_equal_length( + UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["train"][0] + ) + check_classification_targets( + UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["train"][1] + ) + + assert is_collection(UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["test"][0]) + assert is_univariate(UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["test"][0]) + assert not is_equal_length( + UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["test"][0] + ) + check_classification_targets( + UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["test"][1] + ) + + for key in UNEQUAL_LENGTH_UNIVARIATE_REGRESSION: + assert is_collection(UNEQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["train"][0]) + assert is_univariate(UNEQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["train"][0]) + assert not is_equal_length( + UNEQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["train"][0] + ) + assert np.issubdtype( + UNEQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["train"][1].dtype, np.integer + ) or np.issubdtype( + UNEQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["train"][1].dtype, np.floating + ) + + assert is_collection(UNEQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["test"][0]) + assert is_univariate(UNEQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["test"][0]) + assert not is_equal_length(UNEQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["test"][0]) + assert np.issubdtype( + 
UNEQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["test"][1].dtype, np.integer + ) or np.issubdtype( + UNEQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["test"][1].dtype, np.floating + ) -def test_equal_length_multivariate(): +def test_equal_length_multivariate_collection(): """Test the contents of the equal length multivariate data dictionary.""" - for key in EQUAL_LENGTH_MULTIVARIATE: - assert is_collection(EQUAL_LENGTH_MULTIVARIATE[key]) - assert not is_univariate(EQUAL_LENGTH_MULTIVARIATE[key]) - assert is_equal_length(EQUAL_LENGTH_MULTIVARIATE[key]) + for key in EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION: + assert is_collection(EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["train"][0]) + assert not is_univariate( + EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["train"][0] + ) + assert is_equal_length( + EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["train"][0] + ) + check_classification_targets( + EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["train"][1] + ) + + assert is_collection(EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["test"][0]) + assert not is_univariate( + EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["test"][0] + ) + assert is_equal_length(EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["test"][0]) + check_classification_targets( + EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["test"][1] + ) + + for key in EQUAL_LENGTH_MULTIVARIATE_REGRESSION: + assert is_collection(EQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["train"][0]) + assert not is_univariate(EQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["train"][0]) + assert is_equal_length(EQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["train"][0]) + assert np.issubdtype( + EQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["train"][1].dtype, np.integer + ) or np.issubdtype( + EQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["train"][1].dtype, np.floating + ) + + assert is_collection(EQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["test"][0]) + assert not is_univariate(EQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["test"][0]) + assert is_equal_length(EQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["test"][0]) + assert np.issubdtype( + EQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["test"][1].dtype, np.integer + ) or np.issubdtype( + EQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["test"][1].dtype, np.floating + ) -def test_unequal_length_multivariate(): +def test_unequal_length_multivariate_collection(): """Test the contents of the unequal length multivariate data dictionary.""" - for key in UNEQUAL_LENGTH_MULTIVARIATE: - assert is_collection(UNEQUAL_LENGTH_MULTIVARIATE[key]) - assert not is_univariate(UNEQUAL_LENGTH_MULTIVARIATE[key]) - assert not is_equal_length(UNEQUAL_LENGTH_MULTIVARIATE[key]) + for key in UNEQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION: + assert is_collection( + UNEQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["train"][0] + ) + assert not is_univariate( + UNEQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["train"][0] + ) + assert not is_equal_length( + UNEQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["train"][0] + ) + check_classification_targets( + UNEQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["train"][1] + ) + + assert is_collection(UNEQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["test"][0]) + assert not is_univariate( + UNEQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["test"][0] + ) + assert not is_equal_length( + UNEQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["test"][0] + ) + check_classification_targets( + UNEQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["test"][1] + ) + + for key in UNEQUAL_LENGTH_MULTIVARIATE_REGRESSION: + assert 
is_collection(UNEQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["train"][0])
+        assert not is_univariate(
+            UNEQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["train"][0]
+        )
+        assert not is_equal_length(
+            UNEQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["train"][0]
+        )
+        assert np.issubdtype(
+            UNEQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["train"][1].dtype, np.integer
+        ) or np.issubdtype(
+            UNEQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["train"][1].dtype, np.floating
+        )
+
+        assert is_collection(UNEQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["test"][0])
+        assert not is_univariate(UNEQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["test"][0])
+        assert not is_equal_length(
+            UNEQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["test"][0]
+        )
+        assert np.issubdtype(
+            UNEQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["test"][1].dtype, np.integer
+        ) or np.issubdtype(
+            UNEQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["test"][1].dtype, np.floating
+        )
+
+
+def test_missing_values_collection():
+    """Test the contents of the missing values data dictionaries."""
+    pass
diff --git a/aeon/testing/utils/estimator_checks.py b/aeon/testing/utils/estimator_checks.py
index 9c9ca43a41..10a86e5864 100644
--- a/aeon/testing/utils/estimator_checks.py
+++ b/aeon/testing/utils/estimator_checks.py
@@ -16,7 +16,7 @@
 from aeon.forecasting.base import BaseForecaster
 from aeon.regression.base import BaseRegressor
 from aeon.testing.test_config import VALID_ESTIMATOR_TYPES
-from aeon.testing.testing_data import TEST_DATA_DICT, TEST_LABEL_DICT
+from aeon.testing.testing_data import FULL_TEST_DATA_DICT
 from aeon.transformations.base import BaseTransformer
 from aeon.utils.validation import is_nested_univ_dataframe
 
@@ -26,14 +26,24 @@ def _run_estimator_method(estimator, method_name, datatype, split):
     args = inspect.getfullargspec(method)[0]
     if "X" in args and "y" in args:
         return method(
-            X=TEST_DATA_DICT[datatype[0]][split], y=TEST_LABEL_DICT[datatype[1]][split]
+            X=FULL_TEST_DATA_DICT[datatype][split][0],
+            y=FULL_TEST_DATA_DICT[datatype][split][1],
         )
     elif "X" in args:
-        return method(X=TEST_DATA_DICT[datatype[0]][split])
+        return method(X=FULL_TEST_DATA_DICT[datatype][split][0])
     else:
         return method()
 
 
+def _get_tag(estimator, tag_name, default=None, raise_error=False):
+    if isclass(estimator):
+        return estimator.get_class_tag(tag_name=tag_name, tag_value_default=default)
+    else:
+        return estimator.get_tag(
+            tag_name=tag_name, tag_value_default=default, raise_error=raise_error
+        )
+
+
 def _get_err_msg(estimator):
     return (
         f"Invalid estimator type: {type(estimator)}. 
Valid estimator types are: " diff --git a/aeon/utils/conversion/tests/test_convert_collection.py b/aeon/utils/conversion/tests/test_convert_collection.py index c026765f48..8fcc73ffec 100644 --- a/aeon/utils/conversion/tests/test_convert_collection.py +++ b/aeon/utils/conversion/tests/test_convert_collection.py @@ -6,9 +6,9 @@ from aeon.testing.data_generation import make_example_nested_dataframe from aeon.testing.testing_data import ( - EQUAL_LENGTH_MULTIVARIATE, - EQUAL_LENGTH_UNIVARIATE, - UNEQUAL_LENGTH_UNIVARIATE, + EQUAL_LENGTH_MULTIVARIATE_COLLECTION, + EQUAL_LENGTH_UNIVARIATE_COLLECTION, + UNEQUAL_LENGTH_UNIVARIATE_COLLECTION, ) from aeon.utils import COLLECTIONS_DATA_TYPES from aeon.utils.conversion._convert_collection import ( @@ -46,37 +46,46 @@ def test_convert_collection(input_data, output_data): """Test all valid and invalid conversions.""" # All should work with univariate equal length - X = convert_collection(EQUAL_LENGTH_UNIVARIATE[input_data], output_data) + X = convert_collection(EQUAL_LENGTH_UNIVARIATE_COLLECTION[input_data], output_data) assert get_type(X) == output_data # Test with multivariate - if input_data in EQUAL_LENGTH_MULTIVARIATE: - if output_data in EQUAL_LENGTH_MULTIVARIATE: - X = convert_collection(EQUAL_LENGTH_MULTIVARIATE[input_data], output_data) + if input_data in EQUAL_LENGTH_MULTIVARIATE_COLLECTION: + if output_data in EQUAL_LENGTH_MULTIVARIATE_COLLECTION: + X = convert_collection( + EQUAL_LENGTH_MULTIVARIATE_COLLECTION[input_data], output_data + ) assert get_type(X) == output_data else: with pytest.raises(TypeError, match="Cannot convert multivariate"): X = convert_collection( - EQUAL_LENGTH_MULTIVARIATE[input_data], output_data + EQUAL_LENGTH_MULTIVARIATE_COLLECTION[input_data], output_data ) # Test with unequal length - if input_data in UNEQUAL_LENGTH_UNIVARIATE: - if output_data in UNEQUAL_LENGTH_UNIVARIATE or output_data == "pd-multiindex": - X = convert_collection(UNEQUAL_LENGTH_UNIVARIATE[input_data], output_data) + if input_data in UNEQUAL_LENGTH_UNIVARIATE_COLLECTION: + if ( + output_data in UNEQUAL_LENGTH_UNIVARIATE_COLLECTION + or output_data == "pd-multiindex" + ): + X = convert_collection( + UNEQUAL_LENGTH_UNIVARIATE_COLLECTION[input_data], output_data + ) assert get_type(X) == output_data else: with pytest.raises(TypeError, match="Cannot convert unequal"): X = convert_collection( - UNEQUAL_LENGTH_UNIVARIATE[input_data], output_data + UNEQUAL_LENGTH_UNIVARIATE_COLLECTION[input_data], output_data ) @pytest.mark.parametrize("input_data", COLLECTIONS_DATA_TYPES) def test_convert_df_list(input_data): """Test that df list is correctly transposed.""" - X = convert_collection(EQUAL_LENGTH_UNIVARIATE[input_data], "df-list") + X = convert_collection(EQUAL_LENGTH_UNIVARIATE_COLLECTION[input_data], "df-list") assert X[0].shape == (20, 1) - if input_data in EQUAL_LENGTH_MULTIVARIATE: - X = convert_collection(EQUAL_LENGTH_MULTIVARIATE[input_data], "df-list") + if input_data in EQUAL_LENGTH_MULTIVARIATE_COLLECTION: + X = convert_collection( + EQUAL_LENGTH_MULTIVARIATE_COLLECTION[input_data], "df-list" + ) assert X[0].shape == (20, 2) @@ -109,43 +118,43 @@ def test_resolve_unequal_length_inner_type(): @pytest.mark.parametrize("data", COLLECTIONS_DATA_TYPES) def test_get_n_cases(data): """Test getting the number of cases.""" - assert get_n_cases(EQUAL_LENGTH_UNIVARIATE[data]) == 10 + assert get_n_cases(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) == 10 @pytest.mark.parametrize("data", COLLECTIONS_DATA_TYPES) def test_get_type(data): """Test getting the 
type.""" - assert get_type(EQUAL_LENGTH_UNIVARIATE[data]) == data + assert get_type(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) == data @pytest.mark.parametrize("data", COLLECTIONS_DATA_TYPES) def test_equal_length(data): """Test if equal length series correctly identified.""" - assert _equal_length(EQUAL_LENGTH_UNIVARIATE[data], data) + assert _equal_length(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data], data) @pytest.mark.parametrize("data", COLLECTIONS_DATA_TYPES) def test_is_equal_length(data): """Test if equal length series correctly identified.""" - assert is_equal_length(EQUAL_LENGTH_UNIVARIATE[data]) + assert is_equal_length(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) @pytest.mark.parametrize("data", ["df-list", "np-list"]) def test_unequal_length(data): """Test if unequal length series correctly identified.""" - assert not _equal_length(UNEQUAL_LENGTH_UNIVARIATE[data], data) + assert not _equal_length(UNEQUAL_LENGTH_UNIVARIATE_COLLECTION[data], data) @pytest.mark.parametrize("data", ["df-list", "np-list"]) def test_is_unequal_length(data): """Test if unequal length series correctly identified.""" - assert not is_equal_length(UNEQUAL_LENGTH_UNIVARIATE[data]) + assert not is_equal_length(UNEQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) @pytest.mark.parametrize("data", COLLECTIONS_DATA_TYPES) def test_has_missing(data): """Test if missing values are correctly identified.""" - assert not has_missing(EQUAL_LENGTH_UNIVARIATE[data]) + assert not has_missing(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) X = np.random.random(size=(10, 2, 20)) X[5][1][12] = np.NAN assert has_missing(X) @@ -154,9 +163,9 @@ def test_has_missing(data): @pytest.mark.parametrize("data", COLLECTIONS_DATA_TYPES) def test_is_univariate(data): """Test if univariate series are correctly identified.""" - assert is_univariate(EQUAL_LENGTH_UNIVARIATE[data]) - if data in EQUAL_LENGTH_MULTIVARIATE.keys(): - assert not is_univariate(EQUAL_LENGTH_MULTIVARIATE[data]) + assert is_univariate(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) + if data in EQUAL_LENGTH_MULTIVARIATE_COLLECTION.keys(): + assert not is_univariate(EQUAL_LENGTH_MULTIVARIATE_COLLECTION[data]) NUMPY3D = [ diff --git a/aeon/utils/validation/tests/test_collection.py b/aeon/utils/validation/tests/test_collection.py index eb40fd5111..30bd467e54 100644 --- a/aeon/utils/validation/tests/test_collection.py +++ b/aeon/utils/validation/tests/test_collection.py @@ -5,7 +5,7 @@ import pytest from aeon.testing.data_generation import make_example_nested_dataframe -from aeon.testing.testing_data import EQUAL_LENGTH_UNIVARIATE +from aeon.testing.testing_data import EQUAL_LENGTH_UNIVARIATE_COLLECTION from aeon.utils import COLLECTIONS_DATA_TYPES from aeon.utils.validation.collection import ( _is_pd_wide, @@ -21,9 +21,9 @@ def test_is_nested_univ_dataframe(data): """Test is_nested_univ_dataframe function for different datatypes.""" if data == "nested_univ": - assert is_nested_univ_dataframe(EQUAL_LENGTH_UNIVARIATE[data]) + assert is_nested_univ_dataframe(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) else: - assert not is_nested_univ_dataframe(EQUAL_LENGTH_UNIVARIATE[data]) + assert not is_nested_univ_dataframe(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) def test_nested_univ_is_equal(): @@ -49,9 +49,9 @@ def test_nested_univ_is_equal(): def test_is_pd_wide(data): """Test _is_pd_wide function for different datatypes.""" if data == "pd-wide": - assert _is_pd_wide(EQUAL_LENGTH_UNIVARIATE[data]) + assert _is_pd_wide(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) else: - assert not 
_is_pd_wide(EQUAL_LENGTH_UNIVARIATE[data]) + assert not _is_pd_wide(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) def test_is_tabular(): diff --git a/aeon/utils/validation/tests/test_input.py b/aeon/utils/validation/tests/test_input.py index ff1e8ed944..5a120177af 100644 --- a/aeon/utils/validation/tests/test_input.py +++ b/aeon/utils/validation/tests/test_input.py @@ -3,7 +3,7 @@ import pytest from aeon.testing.data_generation._legacy import get_examples -from aeon.testing.testing_data import EQUAL_LENGTH_UNIVARIATE +from aeon.testing.testing_data import EQUAL_LENGTH_UNIVARIATE_COLLECTION from aeon.utils.validation._input import ( COLLECTIONS, HIERARCHICAL, @@ -42,9 +42,9 @@ def test_abstract_types(): @pytest.mark.parametrize("data", COLLECTIONS) def test_input_collections(data): """Test is_collection with correct input.""" - assert is_collection(EQUAL_LENGTH_UNIVARIATE[data]) - assert not is_single_series(EQUAL_LENGTH_UNIVARIATE[data]) - assert not is_hierarchical(EQUAL_LENGTH_UNIVARIATE[data]) + assert is_collection(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) + assert not is_single_series(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) + assert not is_hierarchical(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) @pytest.mark.parametrize("data_type", HIERARCHICAL) @@ -81,7 +81,7 @@ def test_input_series(data_type): @pytest.mark.parametrize("data_type", COLLECTIONS) def test_input_collection(data_type): """Test is_collection with correct input.""" - d = EQUAL_LENGTH_UNIVARIATE[data_type] + d = EQUAL_LENGTH_UNIVARIATE_COLLECTION[data_type] assert is_collection(d) assert not is_single_series(d) assert not is_hierarchical(d) From 3c209bc9f984f3b76d9332a15374b83d697851cf Mon Sep 17 00:00:00 2001 From: MatthewMiddlehurst Date: Thu, 11 Jul 2024 10:26:09 +0100 Subject: [PATCH 02/15] rework yield checks --- .../estimator_checking/_estimator_checking.py | 67 ++++------------- .../_yield_classification_checks.py | 6 +- .../_yield_estimator_checks.py | 74 ++++++++++--------- 3 files changed, 58 insertions(+), 89 deletions(-) diff --git a/aeon/testing/estimator_checking/_estimator_checking.py b/aeon/testing/estimator_checking/_estimator_checking.py index 78edb2f4e0..471c761274 100644 --- a/aeon/testing/estimator_checking/_estimator_checking.py +++ b/aeon/testing/estimator_checking/_estimator_checking.py @@ -31,8 +31,8 @@ def parametrize_with_checks( ) -> Callable: """Pytest specific decorator for parametrizing aeon estimator checks. - The `id` of each check is set to be a pprint version of the estimator - and the name of the check with its keyword arguments. + The `id` of each check is set to be the name of the check with its keyword + arguments, including a pprint version of the estimator. This allows to use `pytest -k` to specify which tests to run i.e. pytest -k check_fit_updates_state @@ -69,35 +69,16 @@ class is passed. import pytest - def checks_generator(): - for est in estimators: - has_dependencies = _check_estimator_deps(est, severity="none") - - if has_dependencies: - if isclass(est): - if issubclass(est, BaseEstimator): - est = est.create_test_instance( - return_first=use_first_parameter_set - ) - else: - raise TypeError( - f"Passed class {est} is not a subclass of BaseEstimator." - ) - elif not isinstance(est, BaseEstimator): - raise TypeError( - f"Passed object {est} is not an instance of BaseEstimator." 
- ) - - if not isinstance(est, list): - est = [est] - - for e in est: - for check in _yield_all_aeon_checks(e): - yield _check_if_xfail(e, check, has_dependencies) + checks = [] + for est in estimators: + has_dependencies = _check_estimator_deps(est, severity="none") + + for check in _yield_all_aeon_checks(est, use_first_parameter_set=use_first_parameter_set, has_dependencies=has_dependencies): + checks.append(_check_if_xfail(est, check, has_dependencies)) return pytest.mark.parametrize( - "estimator, check", - checks_generator(), + "check", + checks, ids=_get_check_estimator_ids, ) @@ -196,29 +177,9 @@ class is passed. """ _check_estimator_deps(estimator) - def checks_generator(): - est = estimator - has_dependencies = _check_estimator_deps(est, severity="none") - - if has_dependencies: - if isclass(est): - if issubclass(est, BaseEstimator): - est = est.create_test_instance(return_first=use_first_parameter_set) - else: - raise TypeError( - f"Passed class {est} is not a subclass of BaseEstimator." - ) - elif not isinstance(est, BaseEstimator): - raise TypeError( - f"Passed object {est} is not an instance of BaseEstimator." - ) - - if not isinstance(est, list): - est = [est] - - for e in est: - for check in _yield_all_aeon_checks(e): - yield _check_if_skip(e, check, has_dependencies) + checks = [] + for check in _yield_all_aeon_checks(estimator, use_first_parameter_set=use_first_parameter_set, has_dependencies=True): + checks.append(_check_if_skip(estimator, check, True)) if not isinstance(checks_to_run, (list, tuple)) and checks_to_run is not None: checks_to_run = [checks_to_run] @@ -242,7 +203,7 @@ def checks_generator(): skipped = 0 failed = 0 results = {} - for est, check in checks_generator(): + for est, check in checks: check_name = _get_check_estimator_ids(check) full_name = f"{_get_check_estimator_ids(est)}-{check_name}" diff --git a/aeon/testing/estimator_checking/_yield_classification_checks.py b/aeon/testing/estimator_checking/_yield_classification_checks.py index 8a4acb7b7b..13bfb8840d 100644 --- a/aeon/testing/estimator_checking/_yield_classification_checks.py +++ b/aeon/testing/estimator_checking/_yield_classification_checks.py @@ -17,7 +17,7 @@ from aeon.utils.validation import get_n_cases -def _yield_classification_checks(estimator, datatypes): +def _yield_classification_checks(estimator_class, estimator_instances, datatypes): """Yield all classification checks for an aeon classifier.""" # no data needed yield test_classifier_against_expected_results @@ -42,6 +42,10 @@ def _yield_classification_checks(estimator, datatypes): def test_classifier_against_expected_results(estimator): """Test classifier against stored results.""" # we only use the first estimator instance for testing + + #todo remove + return None + class_name = type(estimator).__name__ # We cannot guarantee same results on ARM macOS diff --git a/aeon/testing/estimator_checking/_yield_estimator_checks.py b/aeon/testing/estimator_checking/_yield_estimator_checks.py index fbc580745a..85f44f0a63 100644 --- a/aeon/testing/estimator_checking/_yield_estimator_checks.py +++ b/aeon/testing/estimator_checking/_yield_estimator_checks.py @@ -36,26 +36,48 @@ _run_estimator_method, ) from aeon.transformations.base import BaseTransformer +from aeon.utils.validation._dependencies import _check_estimator_deps -def _yield_all_aeon_checks(estimator): +def _yield_all_aeon_checks(estimator, use_first_parameter_set=False, has_dependencies=None): """Yield all checks for an aeon estimator.""" - # if a class is passed, all tests are 
going to be skipped as we could not - # instantiate the class - datatypes = ( - _get_datatypes_for_estimator(estimator) if not isclass(estimator) else [None] - ) + if has_dependencies is None: + has_dependencies = _check_estimator_deps(estimator, severity="none") + + estimator_class = None + estimator_instances = None + + if has_dependencies: + if isclass(estimator) and issubclass(estimator, BaseEstimator): + estimator_class = estimator + estimator_instances = estimator.create_test_instance(return_first=use_first_parameter_set) + elif isinstance(estimator, BaseEstimator): + estimator_class = type(estimator) + estimator_instances = estimator + else: + raise TypeError( + f"Passed estimator is not an instance or subclass of BaseEstimator." + ) - yield from _yield_estimator_checks(estimator, datatypes) + if not isinstance(estimator_instances, list): + estimator_instances = [estimator_instances] - if isinstance(estimator, BaseClassifier): - yield from _yield_classification_checks(estimator, datatypes) + # if input does not have all dependencies installed, all tests are going to be + # skipped as we cannot instantiate the class + datatypes = [ + _get_datatypes_for_estimator(est) if has_dependencies else [None] for est in estimator_instances + ] + yield from _yield_estimator_checks(estimator_class, estimator_instances, datatypes) -def _yield_estimator_checks(estimator, datatypes): + if issubclass(estimator_class, BaseClassifier): + yield from _yield_classification_checks(estimator_class, estimator_instances, datatypes) + + +def _yield_estimator_checks(estimator_class, estimator_instances, datatypes): """Yield all general checks for an aeon estimator.""" # no data needed - yield check_create_test_instance + yield partial(check_create_test_instance, estimator_class=estimator_class) yield check_create_test_instances_and_names yield check_estimator_tags yield check_inheritance @@ -90,7 +112,7 @@ def _yield_estimator_checks(estimator, datatypes): yield partial(check_fit_deterministic, datatype=datatypes[0]) -def check_create_test_instance(estimator): +def check_create_test_instance(estimator_class): """Check create_test_instance logic and basic constructor functionality. 
create_test_instance and create_test_instances_and_names are the @@ -104,7 +126,6 @@ def check_create_test_instance(estimator): * __init__ calls super.__init__ * _tags_dynamic attribute for tag inspection is present after construction """ - estimator_class = type(estimator) estimator = estimator_class.create_test_instance() # Check that method does not construct object of other class than itself @@ -419,24 +440,7 @@ def check_valid_estimator_tags(estimator): def check_dl_constructor_initializes_deeply(estimator): """Test DL estimators that they pass custom parameters to underlying Network.""" - if not hasattr(estimator, "get_test_params"): - return None - - params = estimator.get_test_params() - - if isinstance(params, list): - params = params[0] - if isinstance(params, dict): - pass - else: - raise TypeError( - f"`get_test_params()` of estimator: {estimator} returns " - f"an expected type: {type(params)}, acceptable formats: [list, dict]" - ) - - estimator = estimator(**params) - - for key, value in params.items(): + for key, value in estimator.__dict__.items(): assert vars(estimator)[key] == value # some keys are only relevant to the final model (eg: n_epochs) # skip them for the underlying network @@ -472,7 +476,7 @@ def check_non_state_changing_method(estimator, datatype): y = deepcopy(FULL_TEST_DATA_DICT[datatype]["test"][1]) for method in NON_STATE_CHANGING_METHODS: - if hasattr(estimator, method): + if hasattr(estimator, method) and callable(getattr(estimator, method)): _run_estimator_method(estimator, method, datatype, "test") assert deep_equals(X, FULL_TEST_DATA_DICT[datatype]["test"][0]) and deep_equals( @@ -579,7 +583,7 @@ def test_persistence_via_pickle(estimator, datatype): results = [] for method in NON_STATE_CHANGING_METHODS_ARRAYLIKE: - if hasattr(estimator, method): + if hasattr(estimator, method) and callable(getattr(estimator, method)): output = _run_estimator_method(estimator, method, datatype, "test") results.append(output) @@ -612,7 +616,7 @@ def check_fit_deterministic(estimator, datatype): results = [] for method in NON_STATE_CHANGING_METHODS_ARRAYLIKE: - if hasattr(estimator, method): + if hasattr(estimator, method) and callable(getattr(estimator, method)): output = _run_estimator_method(estimator, method, datatype, "test") results.append(output) @@ -621,7 +625,7 @@ def check_fit_deterministic(estimator, datatype): i = 0 for method in NON_STATE_CHANGING_METHODS_ARRAYLIKE: - if hasattr(estimator, method): + if hasattr(estimator, method) and callable(getattr(estimator, method)): output = _run_estimator_method(estimator, method, datatype, "test") _assert_array_almost_equal( From 0d8440edbb1268c6e4f91a81ee170f89fb3e77f0 Mon Sep 17 00:00:00 2001 From: MatthewMiddlehurst Date: Thu, 11 Jul 2024 17:42:34 +0100 Subject: [PATCH 03/15] rework yield checks to allow for class input --- aeon/base/_base.py | 20 +- .../convolution_based/_arsenal.py | 24 +- .../estimator_checking/_estimator_checking.py | 61 +++-- .../_yield_classification_checks.py | 75 +++--- .../_yield_estimator_checks.py | 233 ++++++++++-------- .../tests/test_check_estimator.py | 53 ++-- .../expected_classifier_outputs.py | 38 +-- aeon/testing/test_config.py | 7 +- aeon/testing/tests/test_all_estimators.py | 4 +- aeon/testing/utils/estimator_checks.py | 8 +- 10 files changed, 293 insertions(+), 230 deletions(-) diff --git a/aeon/base/_base.py b/aeon/base/_base.py index 76ffd96095..436a6a9d0f 100644 --- a/aeon/base/_base.py +++ b/aeon/base/_base.py @@ -283,7 +283,7 @@ class attribute via nested inheritance. 
NOT overridden by dynamic return deepcopy(collected_tags) @classmethod - def get_class_tag(cls, tag_name, tag_value_default=None): + def get_class_tag(cls, tag_name, tag_value_default=None, raise_error=False): """ Get tag value from estimator class (only class tags). @@ -293,12 +293,19 @@ def get_class_tag(cls, tag_name, tag_value_default=None): Name of tag value. tag_value_default : any type Default/fallback value if tag is not found. + raise_error : bool + Whether a ValueError is raised when the tag is not found. Returns ------- tag_value : - Value of the `tag_name` tag in self. If not found, returns - `tag_value_default`. + Value of the `tag_name` tag in self. If not found, returns an error if + raise_error is True, otherwise it returns `tag_value_default`. + + Raises + ------ + ValueError if raise_error is True i.e. if tag_name is not in self.get_tags( + ).keys() See Also -------- @@ -314,7 +321,12 @@ def get_class_tag(cls, tag_name, tag_value_default=None): """ collected_tags = cls.get_class_tags() - return collected_tags.get(tag_name, tag_value_default) + tag_value = collected_tags.get(tag_name, tag_value_default) + + if raise_error and tag_name not in collected_tags.keys(): + raise ValueError(f"Tag with name {tag_name} could not be found.") + + return tag_value def get_tags(self): """ diff --git a/aeon/classification/convolution_based/_arsenal.py b/aeon/classification/convolution_based/_arsenal.py index 2b58aeb6c6..98a1743ee0 100644 --- a/aeon/classification/convolution_based/_arsenal.py +++ b/aeon/classification/convolution_based/_arsenal.py @@ -295,6 +295,8 @@ def _fit_arsenal(self, X, y, keep_transformed_data=False): else: raise ValueError(f"Invalid Rocket transformer: {self.rocket_transform}") + rng = check_random_state(self.random_state) + if time_limit > 0: self.n_estimators_ = 0 self.estimators_ = [] @@ -307,16 +309,7 @@ def _fit_arsenal(self, X, y, keep_transformed_data=False): fit = Parallel(n_jobs=self._n_jobs, prefer="threads")( delayed(self._fit_ensemble_estimator)( _clone_estimator( - base_rocket, - ( - None - if self.random_state is None - else ( - 255 if self.random_state == 0 else self.random_state - ) - * 37 - * (i + 1) - ), + base_rocket, rng.randint(np.iinfo(np.int32).max) ), X, y, @@ -335,16 +328,7 @@ def _fit_arsenal(self, X, y, keep_transformed_data=False): else: fit = Parallel(n_jobs=self._n_jobs, prefer="threads")( delayed(self._fit_ensemble_estimator)( - _clone_estimator( - base_rocket, - ( - None - if self.random_state is None - else (255 if self.random_state == 0 else self.random_state) - * 37 - * (i + 1) - ), - ), + _clone_estimator(base_rocket, rng.randint(np.iinfo(np.int32).max)), X, y, keep_transformed_data=keep_transformed_data, diff --git a/aeon/testing/estimator_checking/_estimator_checking.py b/aeon/testing/estimator_checking/_estimator_checking.py index 471c761274..d8a0b0d8ee 100644 --- a/aeon/testing/estimator_checking/_estimator_checking.py +++ b/aeon/testing/estimator_checking/_estimator_checking.py @@ -62,8 +62,8 @@ class is passed. >>> from aeon.classification.interval_based import TimeSeriesForestClassifier >>> from aeon.forecasting.naive import NaiveForecaster >>> @parametrize_with_checks([TimeSeriesForestClassifier, NaiveForecaster]) - ... def test_aeon_compatible_estimator(estimator, check): - ... check(estimator) + ... def test_aeon_compatible_estimator(check): + ... check() """ _check_soft_dependencies("pytest") @@ -73,7 +73,11 @@ class is passed. 
for est in estimators: has_dependencies = _check_estimator_deps(est, severity="none") - for check in _yield_all_aeon_checks(est, use_first_parameter_set=use_first_parameter_set, has_dependencies=has_dependencies): + for check in _yield_all_aeon_checks( + est, + use_first_parameter_set=use_first_parameter_set, + has_dependencies=has_dependencies, + ): checks.append(_check_if_xfail(est, check, has_dependencies)) return pytest.mark.parametrize( @@ -129,15 +133,17 @@ class is passed. If None, no checks are excluded (unless excluded elsewhere). full_checks_to_run : str or list of str, default=None Full check name string(s) of checks to run. This should include the function - name of the check to with parameterization, i.e. "MockClassifier()-check_clone" - or "MockClassifier()-check_fit_updates_state". + name of the check to run with parameterization, i.e. + "check_clone(estimator=MockClassifier())" or + "check_fit_updates_state(estimator=MockClassifier())". Checks not passed will be excluded from testing. If None, all checks are run (unless excluded elsewhere). full_checks_to_exclude : str or list of str, default=None Full check name string(s) of checks to exclude. This should include the function name of the check to exclude with parameterization, i.e. - "MockClassifier()-check_clone" or "MockClassifier()-check_fit_updates_state" + "check_clone(estimator=MockClassifier())" or + "check_fit_updates_state(estimator=MockClassifier())". If None, no checks are excluded (unless excluded elsewhere). verbose : str, optional, default=False. @@ -147,8 +153,8 @@ class is passed. ------- results : dict of test results The test results. Keys are parameterized check strings. The `id` of each check - is set to be a pprint version of the estimator and the name of the check with - its keyword arguments. + is set to be the name of the check with its keyword arguments, including a + pprint version of the estimator. Entries are the string "PASSED" if the test passed, the exception raised if the test did not pass, or the reason for skipping the test. @@ -173,12 +179,16 @@ class is passed. Running specific check for MockClassifier >>> check_estimator(MockClassifier, checks_to_run="check_clone") - {'MockClassifier()-check_clone': 'PASSED'} + {'check_clone(estimator=MockClassifier())': 'PASSED'} """ _check_estimator_deps(estimator) checks = [] - for check in _yield_all_aeon_checks(estimator, use_first_parameter_set=use_first_parameter_set, has_dependencies=True): + for check in _yield_all_aeon_checks( + estimator, + use_first_parameter_set=use_first_parameter_set, + has_dependencies=True, + ): checks.append(_check_if_skip(estimator, check, True)) if not isinstance(checks_to_run, (list, tuple)) and checks_to_run is not None: @@ -203,9 +213,8 @@ class is passed. skipped = 0 failed = 0 results = {} - for est, check in checks: + for check in checks: check_name = _get_check_estimator_ids(check) - full_name = f"{_get_check_estimator_ids(est)}-{check_name}" if checks_to_run is not None and check_name.split("(")[0] not in checks_to_run: continue @@ -214,28 +223,28 @@ class is passed. 
and check_name.split("(")[0] in checks_to_exclude ): continue - if full_checks_to_run is not None and full_name not in full_checks_to_run: + if full_checks_to_run is not None and check_name not in full_checks_to_run: continue - if full_checks_to_exclude is not None and full_name in full_checks_to_exclude: + if full_checks_to_exclude is not None and check_name in full_checks_to_exclude: continue try: - check(est) + check() if verbose: print(f"PASSED: {name}") # noqa T001 - results[full_name] = "PASSED" + results[check_name] = "PASSED" passed += 1 except SkipTest as skip: if verbose: print(f"SKIPPED: {name}") # noqa T001 - results[full_name] = "SKIPPED: " + str(skip) + results[check_name] = "SKIPPED: " + str(skip) skipped += 1 except Exception as exception: if raise_exceptions: raise exception elif verbose: print(f"FAILED: {name}") # noqa T001 - results[full_name] = "FAILED: " + str(exception) + results[check_name] = "FAILED: " + str(exception) failed += 1 if verbose: @@ -253,9 +262,9 @@ def _check_if_xfail(estimator, check, has_dependencies): skip, reason, _ = _should_be_skipped(estimator, check, has_dependencies) if skip: - return pytest.param(estimator, check, marks=pytest.mark.xfail(reason=reason)) + return pytest.param(check, marks=pytest.mark.xfail(reason=reason)) - return estimator, check + return check def _check_if_skip(estimator, check, has_dependencies): @@ -272,8 +281,8 @@ def wrapped(*args, **kwargs): ) raise SkipTest(f"Skipping {check_name} for {est_name}: {reason}") - return estimator, wrapped - return estimator, check + return wrapped + return check def _should_be_skipped(estimator, check, has_dependencies): @@ -325,11 +334,15 @@ def _get_check_estimator_ids(obj): if not obj.keywords: return obj.func.__name__ - kwstring = ",".join([f"{k}={v}" for k, v in obj.keywords.items()]) + kwstring = ",".join( + [f"{k}={_get_check_estimator_ids(v)}" for k, v in obj.keywords.items()] + ) return f"{obj.func.__name__}({kwstring})" + elif isclass(obj): + return obj.__name__ elif hasattr(obj, "get_params"): with config_context(print_changed_only=True): s = re.sub(r"\s", "", str(obj)) return re.sub(r"", "func", s) else: - raise ValueError(f"Unexpected object: {obj}") + return obj diff --git a/aeon/testing/estimator_checking/_yield_classification_checks.py b/aeon/testing/estimator_checking/_yield_classification_checks.py index 13bfb8840d..b670adbbdf 100644 --- a/aeon/testing/estimator_checking/_yield_classification_checks.py +++ b/aeon/testing/estimator_checking/_yield_classification_checks.py @@ -19,34 +19,49 @@ def _yield_classification_checks(estimator_class, estimator_instances, datatypes): """Yield all classification checks for an aeon classifier.""" - # no data needed - yield test_classifier_against_expected_results - yield test_classifier_tags_consistent - yield test_does_not_override_final_methods + # only class required + yield partial( + test_classifier_against_expected_results, estimator_class=estimator_class + ) + yield partial(test_classifier_tags_consistent, estimator_class=estimator_class) + yield partial(test_does_not_override_final_methods, estimator_class=estimator_class) # data type irrelevant - if _get_tag(estimator, "capability:contractable"): - yield partial(test_contracted_classifier, datatype=datatypes[0]) + if _get_tag(estimator_class, "capability:contractable", raise_error=True): + yield partial( + test_contracted_classifier, + estimator_class=estimator_class, + datatype=datatypes[0][0], + ) - if _get_tag(estimator, "capability:train_estimate"): - yield 
partial(test_classifier_train_estimate, datatype=datatypes[0]) + # test class instances + for i, estimator in enumerate(estimator_instances): + # data type irrelevant + if _get_tag(estimator_class, "capability:train_estimate", raise_error=True): + yield partial( + test_classifier_train_estimate, + estimator=estimator, + datatype=datatypes[0][0], + ) - if isinstance(estimator, BaseDeepClassifier): - yield partial(test_random_state_deep_learning_cls, datatype=datatypes[0]) + if isinstance(estimator, BaseDeepClassifier): + yield partial( + check_random_state_deep_learning, + estimator=estimator, + datatype=datatypes[i][0], + ) - # test all data types - for datatype in datatypes: - yield partial(test_classifier_output, datatype=datatype) + # test all data types + for datatype in datatypes[i]: + yield partial( + test_classifier_output, estimator=estimator, datatype=datatype + ) -def test_classifier_against_expected_results(estimator): +def test_classifier_against_expected_results(estimator_class): """Test classifier against stored results.""" # we only use the first estimator instance for testing - - #todo remove - return None - - class_name = type(estimator).__name__ + class_name = estimator_class.__name__ # We cannot guarantee same results on ARM macOS if platform == "darwin": @@ -70,7 +85,7 @@ def test_classifier_against_expected_results(estimator): continue # we only use the first estimator instance for testing - estimator_instance = estimator.create_test_instance( + estimator_instance = estimator_class.create_test_instance( parameter_set="results_comparison" ) # set random seed if possible @@ -96,9 +111,8 @@ def test_classifier_against_expected_results(estimator): ) -def test_classifier_tags_consistent(estimator): +def test_classifier_tags_consistent(estimator_class): """Test the tag X_inner_type is consistent with capability:unequal_length.""" - estimator_class = type(estimator) valid_types = {"np-list", "df-list", "pd-multivariate", "nested_univ"} unequal = estimator_class.get_class_tag("capability:unequal_length") if unequal: # one of X_inner_types must be capable of storing unequal length @@ -118,9 +132,8 @@ def test_classifier_tags_consistent(estimator): inst.predict_proba(X) -def test_does_not_override_final_methods(estimator): +def test_does_not_override_final_methods(estimator_class): """Test does not override final methods.""" - estimator_class = type(estimator) final_methods = [ "fit", "predict", @@ -136,9 +149,11 @@ def test_does_not_override_final_methods(estimator): ) -def test_contracted_classifier(estimator, datatype): +def test_contracted_classifier(estimator_class, datatype): """Test classifiers that can be contracted.""" - estimator_class = type(estimator) + estimator_instance = estimator_class.create_test_instance( + parameter_set="contracting" + ) default_params = inspect.signature(estimator_class.__init__).parameters @@ -162,7 +177,7 @@ def test_contracted_classifier(estimator, datatype): ) # too short of a contract time can lead to test failures - if vars(estimator).get("time_limit_in_minutes", None) < 0.5: + if vars(estimator_instance).get("time_limit_in_minutes", None) < 0.5: raise ValueError( "Test parameters for test_contracted_classifier must set " "time_limit_in_minutes to 0.5 or more. 
It is recommended to make " @@ -171,11 +186,11 @@ def test_contracted_classifier(estimator, datatype): ) # run fit and predict - estimator.fit( + estimator_instance.fit( FULL_TEST_DATA_DICT[datatype]["train"][0], FULL_TEST_DATA_DICT[datatype]["train"][1], ) - y_pred = estimator.predict(FULL_TEST_DATA_DICT[datatype]["test"][0]) + y_pred = estimator_instance.predict(FULL_TEST_DATA_DICT[datatype]["test"][0]) # check predict assert isinstance(y_pred, np.ndarray) @@ -226,7 +241,7 @@ def test_classifier_train_estimate(estimator, datatype): np.testing.assert_almost_equal(train_proba.sum(axis=1), 1, decimal=4) -def test_random_state_deep_learning_cls(estimator, datatype): +def check_random_state_deep_learning(estimator, datatype): """Test Deep Classifier seeding.""" random_state = 42 diff --git a/aeon/testing/estimator_checking/_yield_estimator_checks.py b/aeon/testing/estimator_checking/_yield_estimator_checks.py index 85f44f0a63..7426482c56 100644 --- a/aeon/testing/estimator_checking/_yield_estimator_checks.py +++ b/aeon/testing/estimator_checking/_yield_estimator_checks.py @@ -39,77 +39,105 @@ from aeon.utils.validation._dependencies import _check_estimator_deps -def _yield_all_aeon_checks(estimator, use_first_parameter_set=False, has_dependencies=None): +def _yield_all_aeon_checks( + estimator, use_first_parameter_set=False, has_dependencies=None +): """Yield all checks for an aeon estimator.""" if has_dependencies is None: has_dependencies = _check_estimator_deps(estimator, severity="none") - estimator_class = None - estimator_instances = None - if has_dependencies: if isclass(estimator) and issubclass(estimator, BaseEstimator): estimator_class = estimator - estimator_instances = estimator.create_test_instance(return_first=use_first_parameter_set) + estimator_instances = estimator.create_test_instance( + return_first=use_first_parameter_set + ) elif isinstance(estimator, BaseEstimator): estimator_class = type(estimator) estimator_instances = estimator else: raise TypeError( - f"Passed estimator is not an instance or subclass of BaseEstimator." + "Passed estimator is not an instance or subclass of BaseEstimator." 
) if not isinstance(estimator_instances, list): estimator_instances = [estimator_instances] - # if input does not have all dependencies installed, all tests are going to be - # skipped as we cannot instantiate the class - datatypes = [ - _get_datatypes_for_estimator(est) if has_dependencies else [None] for est in estimator_instances - ] + datatypes = [_get_datatypes_for_estimator(est) for est in estimator_instances] + else: + # if input does not have all dependencies installed, all tests are going to be + # skipped as we cannot instantiate the class + estimator_class = estimator if isclass(estimator) else type(estimator) + estimator_instances = [None] + datatypes = [[None]] + # start yielding checks yield from _yield_estimator_checks(estimator_class, estimator_instances, datatypes) if issubclass(estimator_class, BaseClassifier): - yield from _yield_classification_checks(estimator_class, estimator_instances, datatypes) + yield from _yield_classification_checks( + estimator_class, estimator_instances, datatypes + ) def _yield_estimator_checks(estimator_class, estimator_instances, datatypes): """Yield all general checks for an aeon estimator.""" - # no data needed + # only class required yield partial(check_create_test_instance, estimator_class=estimator_class) - yield check_create_test_instances_and_names - yield check_estimator_tags - yield check_inheritance - yield check_has_common_interface - yield check_get_params - yield check_set_params - yield check_set_params_sklearn - yield check_clone - yield check_repr - yield check_constructor - yield check_valid_estimator_class_tags - yield check_valid_estimator_tags - - if ( - isinstance(estimator, BaseDeepClassifier) - or isinstance(estimator, BaseDeepRegressor) - or isinstance(estimator, BaseDeepClusterer) - ): - yield check_dl_constructor_initializes_deeply - - # data type irrelevant - yield partial(check_non_state_changing_method, datatype=datatypes[0]) - yield partial(check_fit_updates_state, datatype=datatypes[0]) - - if not _get_tag(estimator, "fit_is_empty", default=False): - yield partial(check_raises_not_fitted_error, datatype=datatypes[0]) - - if not _get_tag(estimator, "cant-pickle", default=False): - yield partial(test_persistence_via_pickle, datatype=datatypes[0]) - - if not _get_tag(estimator, "non-deterministic", default=False): - yield partial(check_fit_deterministic, datatype=datatypes[0]) + yield partial( + check_create_test_instances_and_names, estimator_class=estimator_class + ) + yield partial(check_estimator_tags, estimator_class=estimator_class) + yield partial(check_inheritance, estimator_class=estimator_class) + yield partial(check_has_common_interface, estimator_class=estimator_class) + yield partial(check_set_params_sklearn, estimator_class=estimator_class) + yield partial(check_constructor, estimator_class=estimator_class) + yield partial(check_valid_estimator_class_tags, estimator_class=estimator_class) + + # test class instances + for i, estimator in enumerate(estimator_instances): + # no data needed + yield partial(check_get_params, estimator=estimator) + yield partial(check_set_params, estimator=estimator) + yield partial(check_clone, estimator=estimator) + yield partial(check_repr, estimator=estimator) + yield partial(check_valid_estimator_tags, estimator=estimator) + + if ( + isinstance(estimator, BaseDeepClassifier) + or isinstance(estimator, BaseDeepRegressor) + or isinstance(estimator, BaseDeepClusterer) + ): + yield partial(check_dl_constructor_initializes_deeply, estimator=estimator) + + # data type 
irrelevant + yield partial( + check_non_state_changing_method, + estimator=estimator, + datatype=datatypes[i][0], + ) + yield partial( + check_fit_updates_state, estimator=estimator, datatype=datatypes[i][0] + ) + + if not _get_tag(estimator, "fit_is_empty", default=False): + yield partial( + check_raises_not_fitted_error, + estimator=estimator, + datatype=datatypes[i][0], + ) + + if not _get_tag(estimator, "cant-pickle", default=False): + yield partial( + test_persistence_via_pickle, + estimator=estimator, + datatype=datatypes[i][0], + ) + + if not _get_tag(estimator, "non-deterministic", default=False): + yield partial( + check_fit_deterministic, estimator=estimator, datatype=datatypes[i][0] + ) def check_create_test_instance(estimator_class): @@ -142,7 +170,7 @@ def check_create_test_instance(estimator_class): # todo consider deprecation -def check_create_test_instances_and_names(estimator): +def check_create_test_instances_and_names(estimator_class): """Check that create_test_instances_and_names works. create_test_instance and create_test_instances_and_names are the @@ -151,7 +179,6 @@ def check_create_test_instances_and_names(estimator): Tests expected function signature of create_test_instances_and_names. """ - estimator_class = type(estimator) estimators, names = estimator_class.create_test_instances_and_names() assert isinstance(estimators, list), ( @@ -180,10 +207,8 @@ def check_create_test_instances_and_names(estimator): # todo consider expanding to init and compare against registry classes -def check_estimator_tags(estimator): +def check_estimator_tags(estimator_class): """Check conventions on estimator tags.""" - estimator_class = type(estimator) - assert hasattr(estimator_class, "get_class_tags") all_tags = estimator_class.get_class_tags() assert isinstance(all_tags, dict) @@ -213,9 +238,7 @@ def check_estimator_tags(estimator): # todo consider removing the multiple base class allowance. Possibly deprecate # BaseObject and roll it into BaseEstimator? -def check_inheritance(estimator): - estimator_class = type(estimator) - +def check_inheritance(estimator_class): """Check that estimator inherits from BaseObject and/or BaseEstimator.""" assert issubclass( estimator_class, BaseObject @@ -241,10 +264,8 @@ def check_inheritance(estimator): assert issubclass(estimator_class, BaseTransformer) -def check_has_common_interface(estimator): +def check_has_common_interface(estimator_class): """Check estimator implements the common interface.""" - estimator_class = type(estimator) - # Check class for type of attribute if isinstance(estimator_class, BaseEstimator): assert isinstance(estimator_class.is_fitted, property) @@ -262,37 +283,13 @@ def check_has_common_interface(estimator): assert hasattr(estimator_class, "predict") -def check_get_params(estimator): - """Check that get_params works correctly.""" - params = estimator.get_params() - assert isinstance(params, dict) - check_get_params_invariance(estimator.__class__.__name__, estimator) - - -def check_set_params(estimator): - """Check that set_params works correctly.""" - params = estimator.get_params() - - msg = f"set_params of {type(estimator).__name__} does not return self" - assert estimator.set_params(**params) is estimator, msg - - is_equal, equals_msg = deep_equals(estimator.get_params(), params, return_msg=True) - msg = ( - f"get_params result of {type(estimator).__name__} (x) does not match " - f"what was passed to set_params (y). 
Reason for discrepancy: {equals_msg}" - ) - assert is_equal, msg - - -def check_set_params_sklearn(estimator): +def check_set_params_sklearn(estimator_class): """Check that set_params works correctly, mirrors sklearn check_set_params. Instead of the "fuzz values" in sklearn's check_set_params, we use the other test parameter settings (which are assumed valid). This guarantees settings which play along with the __init__ content. """ - estimator_class = type(estimator) - estimator = estimator_class.create_test_instance() test_params = estimator_class.get_test_params() if not isinstance(test_params, list): @@ -320,28 +317,7 @@ def check_set_params_sklearn(estimator): assert is_equal, msg -def check_clone(estimator): - """Check that clone method does not raise exceptions and results in a clone. - - A clone of an object x is an object that: - * has same class and parameters as x - * is not identical with x - * is unfitted (even if x was fitted) - """ - est_clone = estimator.clone() - assert isinstance(est_clone, type(estimator)) - assert est_clone is not estimator - if hasattr(est_clone, "is_fitted"): - assert not est_clone.is_fitted - - -# todo roll into another test -def check_repr(estimator): - """Check that __repr__ call to instance does not raise exceptions.""" - repr(estimator) - - -def check_constructor(estimator): +def check_constructor(estimator_class): """Check that the constructor has sklearn compatible signature and behaviour. Based on sklearn check_estimator testing of __init__ logic. @@ -358,8 +334,6 @@ def check_constructor(estimator): (other type parameters should be None, default handling should be by writing the default to attribute of a different name, e.g., my_param_ not my_param) """ - estimator_class = type(estimator) - msg = "constructor __init__ should have no varargs" assert getfullargspec(estimator_class.__init__).varkw is None, msg @@ -425,13 +399,56 @@ def param_filter(p): assert param_value == param.default, param.name -def check_valid_estimator_class_tags(estimator): +def check_valid_estimator_class_tags(estimator_class): """Check that Estimator class tags are in VALID_ESTIMATOR_TAGS.""" - estimator_class = type(estimator) for tag in estimator_class.get_class_tags().keys(): assert tag in VALID_ESTIMATOR_TAGS +def check_get_params(estimator): + """Check that get_params works correctly.""" + params = estimator.get_params() + assert isinstance(params, dict) + check_get_params_invariance(estimator.__class__.__name__, estimator) + + +def check_set_params(estimator): + """Check that set_params works correctly.""" + estimator = _clone_estimator(estimator) + params = estimator.get_params() + + msg = f"set_params of {type(estimator).__name__} does not return self" + assert estimator.set_params(**params) is estimator, msg + + is_equal, equals_msg = deep_equals(estimator.get_params(), params, return_msg=True) + msg = ( + f"get_params result of {type(estimator).__name__} (x) does not match " + f"what was passed to set_params (y). Reason for discrepancy: {equals_msg}" + ) + assert is_equal, msg + + +def check_clone(estimator): + """Check that clone method does not raise exceptions and results in a clone. 
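+
+    A loose usage sketch (illustrative only, any aeon estimator behaves the same)::
+
+        clf = MockClassifier().fit(X, y)
+        clf_clone = clf.clone()  # same class and parameters, but unfitted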
+ + A clone of an object x is an object that: + * has same class and parameters as x + * is not identical with x + * is unfitted (even if x was fitted) + """ + est_clone = estimator.clone() + assert isinstance(est_clone, type(estimator)) + assert est_clone is not estimator + if hasattr(est_clone, "is_fitted"): + assert not est_clone.is_fitted + + +# todo roll into another test +def check_repr(estimator): + """Check that __repr__ call to instance does not raise exceptions.""" + repr(estimator) + + def check_valid_estimator_tags(estimator): """Check that Estimator tags are in VALID_ESTIMATOR_TAGS.""" for tag in estimator.get_tags().keys(): @@ -593,7 +610,7 @@ def test_persistence_via_pickle(estimator, datatype): i = 0 for method in NON_STATE_CHANGING_METHODS_ARRAYLIKE: - if hasattr(estimator, method): + if hasattr(estimator, method) and callable(getattr(estimator, method)): output = _run_estimator_method(estimator, method, datatype, "test") _assert_array_almost_equal( diff --git a/aeon/testing/estimator_checking/tests/test_check_estimator.py b/aeon/testing/estimator_checking/tests/test_check_estimator.py index 7e81861e8f..70e1282a0f 100644 --- a/aeon/testing/estimator_checking/tests/test_check_estimator.py +++ b/aeon/testing/estimator_checking/tests/test_check_estimator.py @@ -4,9 +4,9 @@ import pytest -from aeon.base import BaseEstimator from aeon.clustering import TimeSeriesKMeans from aeon.testing.estimator_checking import check_estimator, parametrize_with_checks +from aeon.testing.estimator_checking._estimator_checking import _get_check_estimator_ids from aeon.testing.mock_estimators import ( MockClassifier, MockClassifierMultiTestParams, @@ -18,7 +18,7 @@ from aeon.testing.utils.deep_equals import deep_equals from aeon.transformations.collection import TimeSeriesScaler -EXAMPLE_CLASSES = [ +test_classes = [ MockClassifier, MockRegressor, TimeSeriesKMeans, @@ -28,31 +28,38 @@ TimeSeriesScaler, MockClassifierMultiTestParams, ] +test_classes = {c.__name__: c for c in test_classes} -@parametrize_with_checks(EXAMPLE_CLASSES, use_first_parameter_set=True) -def test_parametrize_with_checks_classes(estimator, check): +@parametrize_with_checks(list(test_classes.values()), use_first_parameter_set=True) +def test_parametrize_with_checks_classes(check): """Test parametrize_with_checks with class input.""" - assert isinstance(estimator, BaseEstimator) + name = _get_check_estimator_ids(check).split("=")[1].split("(")[0].split(")")[0] assert callable(check) - dict_before = estimator.__dict__.copy() - check(estimator) - assert deep_equals(estimator.__dict__, dict_before) + dict_before = test_classes[name].__dict__.copy() + check() + dict_after = test_classes[name].__dict__.copy() + equal, msg = deep_equals(dict_after, dict_before, return_msg=True) + assert equal, msg -@parametrize_with_checks( - [c.create_test_instance() for c in EXAMPLE_CLASSES], use_first_parameter_set=True -) -def test_parametrize_with_checks_instances(estimator, check): +test_instances = [c.create_test_instance() for c in list(test_classes.values())] +test_instances = {c.__class__.__name__: c for c in test_instances} + + +@parametrize_with_checks(list(test_instances.values()), use_first_parameter_set=True) +def test_parametrize_with_checks_instances(check): """Test parametrize_with_checks with estimator instance input.""" - assert isinstance(estimator, BaseEstimator) + name = _get_check_estimator_ids(check).split("=")[1].split("(")[0].split(")")[0] assert callable(check) - dict_before = estimator.__dict__.copy() - check(estimator) 
- assert deep_equals(estimator.__dict__, dict_before) + dict_before = test_instances[name].__dict__.copy() + check() + dict_after = test_instances[name].__dict__.copy() + equal, msg = deep_equals(dict_after, dict_before, return_msg=True) + assert equal, msg -@pytest.mark.parametrize("estimator_class", EXAMPLE_CLASSES) +@pytest.mark.parametrize("estimator_class", list(test_classes.values())) def test_check_estimator_passed(estimator_class): """Test that check_estimator returns only passed tests for examples we know pass.""" estimator = estimator_class.create_test_instance() @@ -64,11 +71,17 @@ def test_check_estimator_passed(estimator_class): assert all(x == "PASSED" for x in result_instance.values()) # test that no exceptions are raised + dict_before = estimator_class.__dict__.copy() check_estimator(estimator_class, raise_exceptions=True, verbose=False) + dict_after = estimator_class.__dict__.copy() + equal, msg = deep_equals(dict_after, dict_before, return_msg=True) + assert equal, msg dict_before = estimator.__dict__.copy() check_estimator(estimator, raise_exceptions=True, verbose=False) - assert deep_equals(estimator.__dict__, dict_before) + dict_after = estimator.__dict__.copy() + equal, msg = deep_equals(dict_after, dict_before, return_msg=True) + assert equal, msg def test_check_estimator_subset_tests(): @@ -81,8 +94,8 @@ def test_check_estimator_subset_tests(): tests_to_exclude = ["check_set_params"] expected_tests = [ - "MockClassifier()-check_get_params", - "MockClassifier()-check_clone", + "check_get_params(estimator=MockClassifier())", + "check_clone(estimator=MockClassifier())", ] results = check_estimator( diff --git a/aeon/testing/expected_results/expected_classifier_outputs.py b/aeon/testing/expected_results/expected_classifier_outputs.py index 1a1432f5d5..b164d70b50 100644 --- a/aeon/testing/expected_results/expected_classifier_outputs.py +++ b/aeon/testing/expected_results/expected_classifier_outputs.py @@ -446,15 +446,15 @@ unit_test_proba["Arsenal"] = np.array( [ [-0.0, 1.0], - [0.8175, 0.1825], + [0.9226, 0.0774], [-0.0, 1.0], [1.0, -0.0], [1.0, -0.0], [1.0, -0.0], - [0.8205, 0.1795], + [0.9226, 0.0774], [-0.0, 1.0], [1.0, -0.0], - [0.8205, 0.1795], + [1.0, -0.0], ] ) unit_test_proba["RocketClassifier"] = np.array( @@ -769,15 +769,15 @@ basic_motions_proba["HIVECOTEV2"] = np.array( [ [0.0, 0.0, 0.0, 1.0], - [0.64, 0.0068, 0.3265, 0.0267], - [0.0, 0.1088, 0.6525, 0.2387], - [0.0, 0.3458, 0.4742, 0.18], - [0.0, 0.0, 0.0068, 0.9932], + [0.7184, 0.0053, 0.2554, 0.0209], + [0.185, 0.0851, 0.6238, 0.106], + [0.0, 0.384, 0.2902, 0.3258], + [0.0, 0.0, 0.1919, 0.8081], [0.0, 0.0, 0.0, 1.0], - [0.6468, 0.0068, 0.1088, 0.2376], - [0.0, 0.0, 0.8618, 0.1382], - [0.0068, 0.8645, 0.1088, 0.0199], - [0.0, 0.8645, 0.1088, 0.0267], + [0.7237, 0.0053, 0.0851, 0.1858], + [0.185, 0.1866, 0.6129, 0.0155], + [0.0053, 0.894, 0.0851, 0.0155], + [0.0, 0.894, 0.0851, 0.0209], ] ) basic_motions_proba["CanonicalIntervalForestClassifier"] = np.array( @@ -880,15 +880,15 @@ ) basic_motions_proba["Arsenal"] = np.array( [ - [-0.0, 0.158, -0.0, 0.842], + [-0.0, -0.0, -0.0, 1.0], [1.0, -0.0, -0.0, -0.0], - [0.6394, 0.3606, -0.0, -0.0], - [-0.0, -0.0, 0.586, 0.414], - [-0.0, -0.0, 0.2254, 0.7746], - [-0.0, -0.0, 0.256, 0.744], - [0.7771, 0.2229, -0.0, -0.0], - [0.256, 0.2229, 0.3631, 0.158], - [-0.0, 0.842, 0.158, -0.0], + [0.131, 0.1897, 0.4916, 0.1877], + [0.1877, 0.131, 0.4916, 0.1897], + [-0.0, 0.1844, 0.3775, 0.4382], + [-0.0, -0.0, -0.0, 1.0], + [0.8103, -0.0, -0.0, 0.1897], + [-0.0, -0.0, 0.4949, 
0.5051], + [-0.0, 0.8156, 0.1844, -0.0], [-0.0, 1.0, -0.0, -0.0], ] ) diff --git a/aeon/testing/test_config.py b/aeon/testing/test_config.py index 2eae519bc3..cb11f3df59 100644 --- a/aeon/testing/test_config.py +++ b/aeon/testing/test_config.py @@ -67,7 +67,12 @@ "test_save_estimators_to_file", ], # has a keras fail, unknown reason, see #1387 - "LearningShapeletClassifier": ["test_fit_deterministic"], + "LearningShapeletClassifier": ["check_fit_deterministic"], + # does not fit structure for test, needs investigation + "TapNetClassifier": ["check_random_state_deep_learning"], + # needs investigation + "SASTClassifier": ["check_fit_deterministic"], + "RSASTClassifier": ["check_fit_deterministic"], } # We use estimator tags in addition to class hierarchies to further distinguish diff --git a/aeon/testing/tests/test_all_estimators.py b/aeon/testing/tests/test_all_estimators.py index 85aec503f0..bc7b9b4a41 100644 --- a/aeon/testing/tests/test_all_estimators.py +++ b/aeon/testing/tests/test_all_estimators.py @@ -10,6 +10,6 @@ @parametrize_with_checks(ALL_ESTIMATORS) -def test_all_estimators(estimator, check): +def test_all_estimators(check): """Run general estimator checks on all aeon estimators.""" - check(estimator) + check() diff --git a/aeon/testing/utils/estimator_checks.py b/aeon/testing/utils/estimator_checks.py index 10a86e5864..dbfda81782 100644 --- a/aeon/testing/utils/estimator_checks.py +++ b/aeon/testing/utils/estimator_checks.py @@ -36,8 +36,12 @@ def _run_estimator_method(estimator, method_name, datatype, split): def _get_tag(estimator, tag_name, default=None, raise_error=False): - if isclass(estimator): - return estimator.get_class_tag(tag_name=tag_name, tag_value_default=default) + if estimator is None: + return None + elif isclass(estimator): + return estimator.get_class_tag( + tag_name=tag_name, tag_value_default=default, raise_error=raise_error + ) else: return estimator.get_tag( tag_name=tag_name, tag_value_default=default, raise_error=raise_error From 55a28100321c66f5c08884ba09fcf3c1c938e60d Mon Sep 17 00:00:00 2001 From: MatthewMiddlehurst Date: Thu, 11 Jul 2024 18:04:01 +0100 Subject: [PATCH 04/15] fixes --- aeon/base/tests/test_base_collection.py | 16 ++--- aeon/regression/tests/test_base.py | 10 +-- .../estimator_checking/_estimator_checking.py | 5 +- .../tests/test_convert_collection.py | 64 +++++++++++-------- .../utils/validation/tests/test_collection.py | 14 ++-- aeon/utils/validation/tests/test_input.py | 12 ++-- 6 files changed, 71 insertions(+), 50 deletions(-) diff --git a/aeon/base/tests/test_base_collection.py b/aeon/base/tests/test_base_collection.py index 8dcdc3e293..f055734336 100644 --- a/aeon/base/tests/test_base_collection.py +++ b/aeon/base/tests/test_base_collection.py @@ -5,8 +5,8 @@ from aeon.base import BaseCollectionEstimator from aeon.testing.testing_data import ( - EQUAL_LENGTH_UNIVARIATE_COLLECTION, - UNEQUAL_LENGTH_UNIVARIATE_COLLECTION, + EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION, + UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION, ) from aeon.utils import COLLECTIONS_DATA_TYPES from aeon.utils.validation import get_type @@ -15,7 +15,7 @@ @pytest.mark.parametrize("data", COLLECTIONS_DATA_TYPES) def test__get_metadata(data): """Test get meta data.""" - X = EQUAL_LENGTH_UNIVARIATE_COLLECTION[data] + X = EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[data]["train"][0] meta = BaseCollectionEstimator._get_metadata(X) assert not meta["multivariate"] assert not meta["missing_values"] @@ -71,7 +71,7 @@ def test__convert_X(internal_type, data): """ cls = 
BaseCollectionEstimator()
     # Equal length should default to numpy3D
-    X = EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]
+    X = EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[data]["train"][0]
     cls.metadata_ = cls._check_X(X)
     X2 = cls._convert_X(X)
     assert get_type(X2) == cls.get_tag("X_inner_type")
@@ -91,11 +91,11 @@ def test__convert_X(internal_type, data):
         cls.set_tags(**{"X_inner_type": ["nested_univ", internal_type]})
         X2 = cls._convert_X(X)
         assert get_type(X2) == internal_type
-    if data in UNEQUAL_LENGTH_UNIVARIATE_COLLECTION.keys():
-        if internal_type in UNEQUAL_LENGTH_UNIVARIATE_COLLECTION.keys():
+    if data in UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION.keys():
+        if internal_type in UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION.keys():
             cls.set_tags(**{"capability:unequal_length": True})
             cls.set_tags(**{"X_inner_type": ["nested_univ", "np-list", internal_type]})
-            X = UNEQUAL_LENGTH_UNIVARIATE_COLLECTION[data]
+            X = UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[data]["train"][0]
             X2 = cls._convert_X(X)
             assert get_type(X2) == "np-list"

@@ -103,7 +103,7 @@ def test__convert_X(internal_type, data):
 @pytest.mark.parametrize("data", COLLECTIONS_DATA_TYPES)
 def test_preprocess_collection(data):
     """Test the functionality for preprocessing fit."""
-    data = EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]
+    data = EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[data]["train"][0]
     cls = BaseCollectionEstimator()
     X = cls._preprocess_collection(data)
     assert cls._n_jobs == 1
diff --git a/aeon/regression/tests/test_base.py b/aeon/regression/tests/test_base.py
index 88de3ae6eb..450d37b46b 100644
--- a/aeon/regression/tests/test_base.py
+++ b/aeon/regression/tests/test_base.py
@@ -9,8 +9,8 @@
 from aeon.regression.base import BaseRegressor
 from aeon.regression.dummy import DummyRegressor
 from aeon.testing.testing_data import (
-    EQUAL_LENGTH_UNIVARIATE_COLLECTION,
-    UNEQUAL_LENGTH_UNIVARIATE_COLLECTION,
+    EQUAL_LENGTH_UNIVARIATE_REGRESSION,
+    UNEQUAL_LENGTH_UNIVARIATE_REGRESSION,
 )
 from aeon.utils import COLLECTIONS_DATA_TYPES

@@ -110,9 +110,9 @@ def test__check_y():
 @pytest.mark.parametrize("data", COLLECTIONS_DATA_TYPES)
 def test_unequal_length_input(data):
     """Test with unequal length failures and passes."""
-    if data in UNEQUAL_LENGTH_UNIVARIATE_COLLECTION.keys():
+    if data in UNEQUAL_LENGTH_UNIVARIATE_REGRESSION.keys():
         dummy = _TestRegressor()
-        X = UNEQUAL_LENGTH_UNIVARIATE_COLLECTION[data]
+        X = UNEQUAL_LENGTH_UNIVARIATE_REGRESSION[data]["train"][0]
         y = np.random.random(size=10)
         with pytest.raises(ValueError, match=r"cannot handle unequal length series"):
             dummy.fit(X, y)
@@ -124,7 +124,7 @@ def test_unequal_length_input(data):
 def test_equal_length_input(data):
     """Test that equal length input passes."""
     dummy = _TestRegressor()
-    X = EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]
+    X = EQUAL_LENGTH_UNIVARIATE_REGRESSION[data]["train"][0]
     y = np.random.random(size=10)
     _assert_fit_predict(dummy, X, y)
     dummy = _TestHandlesAllInput()
diff --git a/aeon/testing/estimator_checking/_estimator_checking.py b/aeon/testing/estimator_checking/_estimator_checking.py
index d8a0b0d8ee..127cf74f4c 100644
--- a/aeon/testing/estimator_checking/_estimator_checking.py
+++ b/aeon/testing/estimator_checking/_estimator_checking.py
@@ -60,8 +60,9 @@ class is passed.
-------- >>> from aeon.testing.estimator_checking import parametrize_with_checks >>> from aeon.classification.interval_based import TimeSeriesForestClassifier - >>> from aeon.forecasting.naive import NaiveForecaster - >>> @parametrize_with_checks([TimeSeriesForestClassifier, NaiveForecaster]) + >>> from aeon.regression.interval_based import TimeSeriesForestRegressor + >>> @parametrize_with_checks( + ... [TimeSeriesForestClassifier, TimeSeriesForestRegressor]) ... def test_aeon_compatible_estimator(check): ... check() """ diff --git a/aeon/utils/conversion/tests/test_convert_collection.py b/aeon/utils/conversion/tests/test_convert_collection.py index 8fcc73ffec..6a8c223be6 100644 --- a/aeon/utils/conversion/tests/test_convert_collection.py +++ b/aeon/utils/conversion/tests/test_convert_collection.py @@ -6,9 +6,9 @@ from aeon.testing.data_generation import make_example_nested_dataframe from aeon.testing.testing_data import ( - EQUAL_LENGTH_MULTIVARIATE_COLLECTION, - EQUAL_LENGTH_UNIVARIATE_COLLECTION, - UNEQUAL_LENGTH_UNIVARIATE_COLLECTION, + EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION, + EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION, + UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION, ) from aeon.utils import COLLECTIONS_DATA_TYPES from aeon.utils.conversion._convert_collection import ( @@ -46,45 +46,53 @@ def test_convert_collection(input_data, output_data): """Test all valid and invalid conversions.""" # All should work with univariate equal length - X = convert_collection(EQUAL_LENGTH_UNIVARIATE_COLLECTION[input_data], output_data) + X = convert_collection( + EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[input_data]["train"][0], output_data + ) assert get_type(X) == output_data # Test with multivariate - if input_data in EQUAL_LENGTH_MULTIVARIATE_COLLECTION: - if output_data in EQUAL_LENGTH_MULTIVARIATE_COLLECTION: + if input_data in EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION: + if output_data in EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION: X = convert_collection( - EQUAL_LENGTH_MULTIVARIATE_COLLECTION[input_data], output_data + EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[input_data]["train"][0], + output_data, ) assert get_type(X) == output_data else: with pytest.raises(TypeError, match="Cannot convert multivariate"): X = convert_collection( - EQUAL_LENGTH_MULTIVARIATE_COLLECTION[input_data], output_data + EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[input_data]["train"][0], + output_data, ) # Test with unequal length - if input_data in UNEQUAL_LENGTH_UNIVARIATE_COLLECTION: + if input_data in UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION: if ( - output_data in UNEQUAL_LENGTH_UNIVARIATE_COLLECTION + output_data in UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION or output_data == "pd-multiindex" ): X = convert_collection( - UNEQUAL_LENGTH_UNIVARIATE_COLLECTION[input_data], output_data + UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[input_data]["train"][0], + output_data, ) assert get_type(X) == output_data else: with pytest.raises(TypeError, match="Cannot convert unequal"): X = convert_collection( - UNEQUAL_LENGTH_UNIVARIATE_COLLECTION[input_data], output_data + UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[input_data]["train"][0], + output_data, ) @pytest.mark.parametrize("input_data", COLLECTIONS_DATA_TYPES) def test_convert_df_list(input_data): """Test that df list is correctly transposed.""" - X = convert_collection(EQUAL_LENGTH_UNIVARIATE_COLLECTION[input_data], "df-list") + X = convert_collection( + EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[input_data]["train"][0], "df-list" + ) assert X[0].shape == (20, 1) - if input_data in 
EQUAL_LENGTH_MULTIVARIATE_COLLECTION: + if input_data in EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION: X = convert_collection( - EQUAL_LENGTH_MULTIVARIATE_COLLECTION[input_data], "df-list" + EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[input_data]["train"][0], "df-list" ) assert X[0].shape == (20, 2) @@ -118,43 +126,47 @@ def test_resolve_unequal_length_inner_type(): @pytest.mark.parametrize("data", COLLECTIONS_DATA_TYPES) def test_get_n_cases(data): """Test getting the number of cases.""" - assert get_n_cases(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) == 10 + assert get_n_cases(EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[data]["train"][0]) == 10 @pytest.mark.parametrize("data", COLLECTIONS_DATA_TYPES) def test_get_type(data): """Test getting the type.""" - assert get_type(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) == data + assert get_type(EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[data]["train"][0]) == data @pytest.mark.parametrize("data", COLLECTIONS_DATA_TYPES) def test_equal_length(data): """Test if equal length series correctly identified.""" - assert _equal_length(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data], data) + assert _equal_length(EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[data]["train"][0], data) @pytest.mark.parametrize("data", COLLECTIONS_DATA_TYPES) def test_is_equal_length(data): """Test if equal length series correctly identified.""" - assert is_equal_length(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) + assert is_equal_length(EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[data]["train"][0]) @pytest.mark.parametrize("data", ["df-list", "np-list"]) def test_unequal_length(data): """Test if unequal length series correctly identified.""" - assert not _equal_length(UNEQUAL_LENGTH_UNIVARIATE_COLLECTION[data], data) + assert not _equal_length( + UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[data]["train"][0], data + ) @pytest.mark.parametrize("data", ["df-list", "np-list"]) def test_is_unequal_length(data): """Test if unequal length series correctly identified.""" - assert not is_equal_length(UNEQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) + assert not is_equal_length( + UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[data]["train"][0] + ) @pytest.mark.parametrize("data", COLLECTIONS_DATA_TYPES) def test_has_missing(data): """Test if missing values are correctly identified.""" - assert not has_missing(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) + assert not has_missing(EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[data]["train"][0]) X = np.random.random(size=(10, 2, 20)) X[5][1][12] = np.NAN assert has_missing(X) @@ -163,9 +175,11 @@ def test_has_missing(data): @pytest.mark.parametrize("data", COLLECTIONS_DATA_TYPES) def test_is_univariate(data): """Test if univariate series are correctly identified.""" - assert is_univariate(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) - if data in EQUAL_LENGTH_MULTIVARIATE_COLLECTION.keys(): - assert not is_univariate(EQUAL_LENGTH_MULTIVARIATE_COLLECTION[data]) + assert is_univariate(EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[data]["train"][0]) + if data in EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION.keys(): + assert not is_univariate( + EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[data]["train"][0] + ) NUMPY3D = [ diff --git a/aeon/utils/validation/tests/test_collection.py b/aeon/utils/validation/tests/test_collection.py index 30bd467e54..a5cf121d6e 100644 --- a/aeon/utils/validation/tests/test_collection.py +++ b/aeon/utils/validation/tests/test_collection.py @@ -5,7 +5,7 @@ import pytest from aeon.testing.data_generation import make_example_nested_dataframe -from aeon.testing.testing_data import 
EQUAL_LENGTH_UNIVARIATE_COLLECTION +from aeon.testing.testing_data import EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION from aeon.utils import COLLECTIONS_DATA_TYPES from aeon.utils.validation.collection import ( _is_pd_wide, @@ -21,9 +21,13 @@ def test_is_nested_univ_dataframe(data): """Test is_nested_univ_dataframe function for different datatypes.""" if data == "nested_univ": - assert is_nested_univ_dataframe(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) + assert is_nested_univ_dataframe( + EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[data]["train"][0] + ) else: - assert not is_nested_univ_dataframe(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) + assert not is_nested_univ_dataframe( + EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[data]["train"][0] + ) def test_nested_univ_is_equal(): @@ -49,9 +53,9 @@ def test_nested_univ_is_equal(): def test_is_pd_wide(data): """Test _is_pd_wide function for different datatypes.""" if data == "pd-wide": - assert _is_pd_wide(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) + assert _is_pd_wide(EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[data]["train"][0]) else: - assert not _is_pd_wide(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) + assert not _is_pd_wide(EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[data]["train"][0]) def test_is_tabular(): diff --git a/aeon/utils/validation/tests/test_input.py b/aeon/utils/validation/tests/test_input.py index 5a120177af..0c489d15a8 100644 --- a/aeon/utils/validation/tests/test_input.py +++ b/aeon/utils/validation/tests/test_input.py @@ -3,7 +3,7 @@ import pytest from aeon.testing.data_generation._legacy import get_examples -from aeon.testing.testing_data import EQUAL_LENGTH_UNIVARIATE_COLLECTION +from aeon.testing.testing_data import EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION from aeon.utils.validation._input import ( COLLECTIONS, HIERARCHICAL, @@ -42,9 +42,11 @@ def test_abstract_types(): @pytest.mark.parametrize("data", COLLECTIONS) def test_input_collections(data): """Test is_collection with correct input.""" - assert is_collection(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) - assert not is_single_series(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) - assert not is_hierarchical(EQUAL_LENGTH_UNIVARIATE_COLLECTION[data]) + assert is_collection(EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[data]["train"][0]) + assert not is_single_series( + EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[data]["train"][0] + ) + assert not is_hierarchical(EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[data]["train"][0]) @pytest.mark.parametrize("data_type", HIERARCHICAL) @@ -81,7 +83,7 @@ def test_input_series(data_type): @pytest.mark.parametrize("data_type", COLLECTIONS) def test_input_collection(data_type): """Test is_collection with correct input.""" - d = EQUAL_LENGTH_UNIVARIATE_COLLECTION[data_type] + d = EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[data_type]["train"][0] assert is_collection(d) assert not is_single_series(d) assert not is_hierarchical(d) From 18f1b14cd6dab5eafb29efe74329c04d923f1b0f Mon Sep 17 00:00:00 2001 From: MatthewMiddlehurst Date: Thu, 11 Jul 2024 22:30:45 +0100 Subject: [PATCH 05/15] fix --- aeon/testing/estimator_checking/tests/test_check_estimator.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/aeon/testing/estimator_checking/tests/test_check_estimator.py b/aeon/testing/estimator_checking/tests/test_check_estimator.py index 70e1282a0f..73a3c028a6 100644 --- a/aeon/testing/estimator_checking/tests/test_check_estimator.py +++ b/aeon/testing/estimator_checking/tests/test_check_estimator.py @@ -37,8 +37,10 @@ def test_parametrize_with_checks_classes(check): name = 
_get_check_estimator_ids(check).split("=")[1].split("(")[0].split(")")[0] assert callable(check) dict_before = test_classes[name].__dict__.copy() + dict_before.pop("__slotnames__", None) check() dict_after = test_classes[name].__dict__.copy() + dict_after.pop("__slotnames__", None) equal, msg = deep_equals(dict_after, dict_before, return_msg=True) assert equal, msg @@ -72,8 +74,10 @@ def test_check_estimator_passed(estimator_class): # test that no exceptions are raised dict_before = estimator_class.__dict__.copy() + dict_before.pop("__slotnames__", None) check_estimator(estimator_class, raise_exceptions=True, verbose=False) dict_after = estimator_class.__dict__.copy() + dict_after.pop("__slotnames__", None) equal, msg = deep_equals(dict_after, dict_before, return_msg=True) assert equal, msg From c1688ead2ac49c48ed043589e14c17946f9762bb Mon Sep 17 00:00:00 2001 From: MatthewMiddlehurst Date: Thu, 11 Jul 2024 23:02:12 +0100 Subject: [PATCH 06/15] pr testing split --- aeon/testing/tests/test_all_estimators.py | 32 +++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/aeon/testing/tests/test_all_estimators.py b/aeon/testing/tests/test_all_estimators.py index bc7b9b4a41..454eb13c1e 100644 --- a/aeon/testing/tests/test_all_estimators.py +++ b/aeon/testing/tests/test_all_estimators.py @@ -1,13 +1,45 @@ """Test all estimators in aeon.""" +import platform +import sys + from aeon.registry import all_estimators from aeon.testing.estimator_checking import parametrize_with_checks +from aeon.testing.test_config import PR_TESTING +from aeon.utils.sampling import random_partition ALL_ESTIMATORS = all_estimators( estimator_types=["classifier"], return_names=False, ) +# subsample estimators by OS & python version +# this ensures that only a 1/3 of estimators are tested for a given combination +# but all are tested on every OS at least once, and on every python version once +if PR_TESTING: + # only use 3 Python versions in PR + ix = sys.version_info.minor + if ix == 9: + ix = 0 + elif ix == 11: + ix = 1 + elif ix == 12: + ix = 2 + + os_str = platform.system() + if os_str == "Windows": + ix = ix + elif os_str == "Linux": + ix = ix + 1 + elif os_str == "Darwin": + ix = ix + 2 + + ix = ix % 3 + + ALL_ESTIMATORS = [ + ALL_ESTIMATORS[i] for i in random_partition(len(ALL_ESTIMATORS), 3)[ix] + ] + @parametrize_with_checks(ALL_ESTIMATORS) def test_all_estimators(check): From 254bfed35efe8a33a0a62329814532cec2061a5d Mon Sep 17 00:00:00 2001 From: MatthewMiddlehurst Date: Tue, 30 Jul 2024 21:25:08 +0100 Subject: [PATCH 07/15] classification fixes --- .../test_saving_loading_deep_learning_cls.py | 83 -------------- .../_yield_classification_checks.py | 101 +++++++++++++++--- .../_yield_regression_checks.py | 0 3 files changed, 89 insertions(+), 95 deletions(-) delete mode 100644 aeon/classification/deep_learning/tests/test_saving_loading_deep_learning_cls.py create mode 100644 aeon/testing/estimator_checking/_yield_regression_checks.py diff --git a/aeon/classification/deep_learning/tests/test_saving_loading_deep_learning_cls.py b/aeon/classification/deep_learning/tests/test_saving_loading_deep_learning_cls.py deleted file mode 100644 index d90393a369..0000000000 --- a/aeon/classification/deep_learning/tests/test_saving_loading_deep_learning_cls.py +++ /dev/null @@ -1,83 +0,0 @@ -"""Unit tests for classifiers deep learners save/load functionalities.""" - -import inspect -import os -import tempfile -import time - -import numpy as np -import pytest - -from aeon.classification import deep_learning -from 
aeon.testing.data_generation import make_example_3d_numpy -from aeon.utils.validation._dependencies import _check_soft_dependencies - -__maintainer__ = ["hadifawaz1999"] - - -_deep_cls_classes = [ - member[1] for member in inspect.getmembers(deep_learning, inspect.isclass) -] - - -@pytest.mark.skipif( - not _check_soft_dependencies(["tensorflow"], severity="none"), - reason="skip test if required soft dependency not available", -) -@pytest.mark.parametrize("deep_cls", _deep_cls_classes) -def test_saving_loading_deep_learning_cls(deep_cls): - """Test Deep Classifier saving.""" - with tempfile.TemporaryDirectory() as tmp: - if not ( - deep_cls.__name__ - in [ - "BaseDeepClassifier", - "InceptionTimeClassifier", - "LITETimeClassifier", - "TapNetClassifier", - ] - ): - if tmp[-1] != "/": - tmp = tmp + "/" - curr_time = str(time.time_ns()) - last_file_name = curr_time + "last" - best_file_name = curr_time + "best" - init_file_name = curr_time + "init" - - X, y = make_example_3d_numpy() - - deep_cls_train = deep_cls( - n_epochs=2, - save_best_model=True, - save_last_model=True, - save_init_model=True, - best_file_name=best_file_name, - last_file_name=last_file_name, - init_file_name=init_file_name, - file_path=tmp, - ) - deep_cls_train.fit(X, y) - - deep_cls_best = deep_cls() - deep_cls_best.load_model( - model_path=os.path.join(tmp, best_file_name + ".keras"), - classes=np.unique(y), - ) - ypred_best = deep_cls_best.predict(X) - assert len(ypred_best) == len(y) - - deep_cls_last = deep_cls() - deep_cls_last.load_model( - model_path=os.path.join(tmp, last_file_name + ".keras"), - classes=np.unique(y), - ) - ypred_last = deep_cls_last.predict(X) - assert len(ypred_last) == len(y) - - deep_cls_init = deep_cls() - deep_cls_init.load_model( - model_path=os.path.join(tmp, init_file_name + ".keras"), - classes=np.unique(y), - ) - ypred_init = deep_cls_init.predict(X) - assert len(ypred_init) == len(y) diff --git a/aeon/testing/estimator_checking/_yield_classification_checks.py b/aeon/testing/estimator_checking/_yield_classification_checks.py index b670adbbdf..33a3fca101 100644 --- a/aeon/testing/estimator_checking/_yield_classification_checks.py +++ b/aeon/testing/estimator_checking/_yield_classification_checks.py @@ -1,4 +1,7 @@ import inspect +import os +import tempfile +import time from functools import partial from sys import platform @@ -21,15 +24,24 @@ def _yield_classification_checks(estimator_class, estimator_instances, datatypes """Yield all classification checks for an aeon classifier.""" # only class required yield partial( - test_classifier_against_expected_results, estimator_class=estimator_class + check_classifier_against_expected_results, estimator_class=estimator_class + ) + yield partial(check_classifier_tags_consistent, estimator_class=estimator_class) + yield partial( + check_does_not_override_final_methods, estimator_class=estimator_class ) - yield partial(test_classifier_tags_consistent, estimator_class=estimator_class) - yield partial(test_does_not_override_final_methods, estimator_class=estimator_class) # data type irrelevant if _get_tag(estimator_class, "capability:contractable", raise_error=True): yield partial( - test_contracted_classifier, + check_contracted_classifier, + estimator_class=estimator_class, + datatype=datatypes[0][0], + ) + + if issubclass(estimator_class, BaseDeepClassifier): + yield partial( + check_saving_loading_deep_learning_cls, estimator_class=estimator_class, datatype=datatypes[0][0], ) @@ -39,7 +51,7 @@ def _yield_classification_checks(estimator_class, 
estimator_instances, datatypes # data type irrelevant if _get_tag(estimator_class, "capability:train_estimate", raise_error=True): yield partial( - test_classifier_train_estimate, + check_classifier_train_estimate, estimator=estimator, datatype=datatypes[0][0], ) @@ -54,11 +66,11 @@ def _yield_classification_checks(estimator_class, estimator_instances, datatypes # test all data types for datatype in datatypes[i]: yield partial( - test_classifier_output, estimator=estimator, datatype=datatype + check_classifier_output, estimator=estimator, datatype=datatype ) -def test_classifier_against_expected_results(estimator_class): +def check_classifier_against_expected_results(estimator_class): """Test classifier against stored results.""" # we only use the first estimator instance for testing class_name = estimator_class.__name__ @@ -111,7 +123,7 @@ def test_classifier_against_expected_results(estimator_class): ) -def test_classifier_tags_consistent(estimator_class): +def check_classifier_tags_consistent(estimator_class): """Test the tag X_inner_type is consistent with capability:unequal_length.""" valid_types = {"np-list", "df-list", "pd-multivariate", "nested_univ"} unequal = estimator_class.get_class_tag("capability:unequal_length") @@ -132,7 +144,7 @@ def test_classifier_tags_consistent(estimator_class): inst.predict_proba(X) -def test_does_not_override_final_methods(estimator_class): +def check_does_not_override_final_methods(estimator_class): """Test does not override final methods.""" final_methods = [ "fit", @@ -149,7 +161,7 @@ def test_does_not_override_final_methods(estimator_class): ) -def test_contracted_classifier(estimator_class, datatype): +def check_contracted_classifier(estimator_class, datatype): """Test classifiers that can be contracted.""" estimator_instance = estimator_class.create_test_instance( parameter_set="contracting" @@ -200,7 +212,72 @@ def test_contracted_classifier(estimator_class, datatype): ) -def test_classifier_train_estimate(estimator, datatype): +def check_saving_loading_deep_learning_cls(estimator_class, datatype): + """Test Deep Classifier saving.""" + with tempfile.TemporaryDirectory() as tmp: + if not ( + estimator_class.__name__ + in [ + "BaseDeepClassifier", + "InceptionTimeClassifier", + "LITETimeClassifier", + "TapNetClassifier", + ] + ): + if tmp[-1] != "/": + tmp = tmp + "/" + curr_time = str(time.time_ns()) + last_file_name = curr_time + "last" + best_file_name = curr_time + "best" + init_file_name = curr_time + "init" + + deep_cls_train = estimator_class( + n_epochs=2, + save_best_model=True, + save_last_model=True, + save_init_model=True, + best_file_name=best_file_name, + last_file_name=last_file_name, + init_file_name=init_file_name, + file_path=tmp, + ) + deep_cls_train.fit( + FULL_TEST_DATA_DICT[datatype]["train"][0], + FULL_TEST_DATA_DICT[datatype]["train"][1], + ) + + deep_cls_best = estimator_class() + deep_cls_best.load_model( + model_path=os.path.join(tmp, best_file_name + ".keras"), + classes=np.unique(FULL_TEST_DATA_DICT[datatype]["train"][1]), + ) + ypred_best = deep_cls_best.predict( + FULL_TEST_DATA_DICT[datatype]["train"][0] + ) + assert len(ypred_best) == len(FULL_TEST_DATA_DICT[datatype]["train"][1]) + + deep_cls_last = estimator_class() + deep_cls_last.load_model( + model_path=os.path.join(tmp, last_file_name + ".keras"), + classes=np.unique(FULL_TEST_DATA_DICT[datatype]["train"][1]), + ) + ypred_last = deep_cls_last.predict( + FULL_TEST_DATA_DICT[datatype]["train"][0] + ) + assert len(ypred_last) == 
len(FULL_TEST_DATA_DICT[datatype]["train"][1]) + + deep_cls_init = estimator_class() + deep_cls_init.load_model( + model_path=os.path.join(tmp, init_file_name + ".keras"), + classes=np.unique(FULL_TEST_DATA_DICT[datatype]["train"][1]), + ) + ypred_init = deep_cls_init.predict( + FULL_TEST_DATA_DICT[datatype]["train"][0] + ) + assert len(ypred_init) == len(FULL_TEST_DATA_DICT[datatype]["train"][1]) + + +def check_classifier_train_estimate(estimator, datatype): """Test classifiers that can produce train set probability estimates.""" estimator = _clone_estimator(estimator) estimator_class = type(estimator) @@ -276,7 +353,7 @@ def check_random_state_deep_learning(estimator, datatype): np.testing.assert_almost_equal(_weight1, _weight2, 4) -def test_classifier_output(estimator, datatype): +def check_classifier_output(estimator, datatype): """Test classifier outputs the correct data types and values. Test predict produces a np.array or pd.Series with only values seen in the train diff --git a/aeon/testing/estimator_checking/_yield_regression_checks.py b/aeon/testing/estimator_checking/_yield_regression_checks.py new file mode 100644 index 0000000000..e69de29bb2 From e8ea3c9838c07ae4c323d4eb7bfa07c4cc6497b4 Mon Sep 17 00:00:00 2001 From: MatthewMiddlehurst Date: Wed, 31 Jul 2024 10:56:27 +0100 Subject: [PATCH 08/15] regressor tests --- .../tests/test_random_state_deep_regressor.py | 60 ----- .../test_saving_loading_deep_learning_cls.py | 79 ------ aeon/regression/tests/test_all_regressors.py | 118 --------- .../_yield_classification_checks.py | 15 +- .../_yield_estimator_checks.py | 9 + .../_yield_regression_checks.py | 227 ++++++++++++++++++ aeon/testing/test_all_estimators.py | 2 +- aeon/testing/tests/test_all_estimators.py | 2 +- 8 files changed, 247 insertions(+), 265 deletions(-) delete mode 100644 aeon/regression/deep_learning/tests/test_random_state_deep_regressor.py delete mode 100644 aeon/regression/deep_learning/tests/test_saving_loading_deep_learning_cls.py delete mode 100644 aeon/regression/tests/test_all_regressors.py diff --git a/aeon/regression/deep_learning/tests/test_random_state_deep_regressor.py b/aeon/regression/deep_learning/tests/test_random_state_deep_regressor.py deleted file mode 100644 index 3c78367348..0000000000 --- a/aeon/regression/deep_learning/tests/test_random_state_deep_regressor.py +++ /dev/null @@ -1,60 +0,0 @@ -"""Unit tests for regressors deep learning random_state functionality.""" - -import inspect - -import numpy as np -import pytest - -from aeon.regression import deep_learning -from aeon.testing.data_generation import make_example_3d_numpy -from aeon.utils.validation._dependencies import _check_soft_dependencies - -__maintainer__ = ["hadifawaz1999"] - - -@pytest.mark.skipif( - not _check_soft_dependencies(["tensorflow"], severity="none"), - reason="skip test if required soft dependency not available", -) -def test_random_state_deep_learning_rgs(): - """Test Deep Regressor seeding.""" - random_state = 42 - - X, y = make_example_3d_numpy(random_state=random_state) - - deep_rgs_classes = [ - member[1] for member in inspect.getmembers(deep_learning, inspect.isclass) - ] - - for i in range(len(deep_rgs_classes)): - if ( - "BaseDeepRegressor" in str(deep_rgs_classes[i]) - or "InceptionTimeRegressor" in str(deep_rgs_classes[i]) - or "LITETimeRegressor" in str(deep_rgs_classes[i]) - or "TapNetRegressor" in str(deep_rgs_classes[i]) - ): - continue - - deep_rgs1 = deep_rgs_classes[i](random_state=random_state, n_epochs=4) - deep_rgs1.fit(X, y) - - layers1 = 
deep_rgs1.training_model_.layers[1:] - - deep_rgs2 = deep_rgs_classes[i](random_state=random_state, n_epochs=4) - deep_rgs2.fit(X, y) - - layers2 = deep_rgs2.training_model_.layers[1:] - - assert len(layers1) == len(layers2) - - for i in range(len(layers1)): - weights1 = layers1[i].get_weights() - weights2 = layers2[i].get_weights() - - assert len(weights1) == len(weights2) - - for j in range(len(weights1)): - _weight1 = np.asarray(weights1[j]) - _weight2 = np.asarray(weights2[j]) - - np.testing.assert_almost_equal(_weight1, _weight2, 4) diff --git a/aeon/regression/deep_learning/tests/test_saving_loading_deep_learning_cls.py b/aeon/regression/deep_learning/tests/test_saving_loading_deep_learning_cls.py deleted file mode 100644 index 736d99baf3..0000000000 --- a/aeon/regression/deep_learning/tests/test_saving_loading_deep_learning_cls.py +++ /dev/null @@ -1,79 +0,0 @@ -"""Unit tests for regressors deep learners save/load functionalities.""" - -import inspect -import os -import tempfile -import time - -import pytest - -from aeon.regression import deep_learning -from aeon.testing.data_generation import make_example_3d_numpy -from aeon.utils.validation._dependencies import _check_soft_dependencies - -__maintainer__ = ["hadifawaz1999"] - - -_deep_rgs_classes = [ - member[1] for member in inspect.getmembers(deep_learning, inspect.isclass) -] - - -@pytest.mark.skipif( - not _check_soft_dependencies(["tensorflow"], severity="none"), - reason="skip test if required soft dependency not available", -) -@pytest.mark.parametrize("deep_rgs", _deep_rgs_classes) -def test_saving_loading_deep_learning_rgs(deep_rgs): - """Test Deep Regressor saving.""" - with tempfile.TemporaryDirectory() as tmp: - if not ( - deep_rgs.__name__ - in [ - "BaseDeepRegressor", - "InceptionTimeRegressor", - "LITETimeRegressor", - "TapNetRegressor", - ] - ): - if tmp[-1] != "/": - tmp = tmp + "/" - curr_time = str(time.time_ns()) - last_file_name = curr_time + "last" - best_file_name = curr_time + "best" - init_file_name = curr_time + "init" - - X, y = make_example_3d_numpy() - - deep_rgs_train = deep_rgs( - n_epochs=2, - save_best_model=True, - save_last_model=True, - save_init_model=True, - best_file_name=best_file_name, - last_file_name=last_file_name, - init_file_name=init_file_name, - file_path=tmp, - ) - deep_rgs_train.fit(X, y) - - deep_rgs_best = deep_rgs() - deep_rgs_best.load_model( - model_path=os.path.join(tmp, best_file_name + ".keras"), - ) - ypred_best = deep_rgs_best.predict(X) - assert len(ypred_best) == len(y) - - deep_rgs_last = deep_rgs() - deep_rgs_last.load_model( - model_path=os.path.join(tmp, last_file_name + ".keras"), - ) - ypred_last = deep_rgs_last.predict(X) - assert len(ypred_last) == len(y) - - deep_rgs_init = deep_rgs() - deep_rgs_init.load_model( - model_path=os.path.join(tmp, init_file_name + ".keras"), - ) - ypred_init = deep_rgs_init.predict(X) - assert len(ypred_init) == len(y) diff --git a/aeon/regression/tests/test_all_regressors.py b/aeon/regression/tests/test_all_regressors.py deleted file mode 100644 index f8e4670c94..0000000000 --- a/aeon/regression/tests/test_all_regressors.py +++ /dev/null @@ -1,118 +0,0 @@ -"""Unit tests for all time series regressors.""" - -__maintainer__ = [] - -from sys import platform - -import numpy as np -from sklearn.utils._testing import set_random_state - -from aeon.datasets import load_cardano_sentiment, load_covid_3month -from aeon.testing.expected_results.expected_regressor_outputs import ( - cardano_sentiment_preds, - covid_3month_preds, -) -from 
aeon.testing.test_all_estimators import BaseFixtureGenerator, QuickTester -from aeon.testing.utils.estimator_checks import _assert_array_almost_equal - - -class RegressorFixtureGenerator(BaseFixtureGenerator): - """Fixture generator for regression tests. - - Fixtures parameterized - ---------------------- - estimator_class: estimator inheriting from BaseObject - ranges over estimator classes not excluded by EXCLUDE_ESTIMATORS, EXCLUDED_TESTS - estimator_instance: instance of estimator inheriting from BaseObject - ranges over estimator classes not excluded by EXCLUDE_ESTIMATORS, EXCLUDED_TESTS - instances are generated by create_test_instance class method - scenario: instance of TestScenario - ranges over all scenarios returned by retrieve_scenarios - """ - - # note: this should be separate from TestAllRegressors - # additional fixtures, parameters, etc should be added here - # TestAllRegressors should contain the tests only - - estimator_type_filter = "regressor" - - -class TestAllRegressors(RegressorFixtureGenerator, QuickTester): - """Module level tests for all aeon regressors.""" - - def test_regressor_against_expected_results(self, estimator_class): - """Test classifier against stored results.""" - # we only use the first estimator instance for testing - classname = estimator_class.__name__ - - # We cannot guarantee same results on ARM macOS - if platform == "darwin": - return None - - for data_name, data_dict, data_loader, data_seed in [ - ["Covid3Month", covid_3month_preds, load_covid_3month, 0], - ["CardanoSentiment", cardano_sentiment_preds, load_cardano_sentiment, 0], - ]: - # retrieve expected predict output, and skip test if not available - if classname in data_dict.keys(): - expected_preds = data_dict[classname] - else: - # skip test if no expected preds are registered - continue - - # we only use the first estimator instance for testing - estimator_instance = estimator_class.create_test_instance( - parameter_set="results_comparison" - ) - # set random seed if possible - set_random_state(estimator_instance, 0) - - # load test data - X_train, y_train = data_loader(split="train") - X_test, y_test = data_loader(split="test") - indices_train = np.random.RandomState(data_seed).choice( - len(y_train), 10, replace=False - ) - indices_test = np.random.RandomState(data_seed).choice( - len(y_test), 10, replace=False - ) - - # train regressor and predict - estimator_instance.fit(X_train[indices_train], y_train[indices_train]) - y_pred = estimator_instance.predict(X_test[indices_test]) - - # assert predictions are the same - _assert_array_almost_equal( - y_pred, - expected_preds, - decimal=2, - err_msg=f"Failed to reproduce results for {classname} on {data_name}", - ) - - def test_regressor_tags_consistent(self, estimator_class): - """Test the tag X_inner_type is consistent with capability:unequal_length.""" - valid_types = {"np-list", "df-list", "pd-multivariate", "nested_univ"} - unequal = estimator_class.get_class_tag("capability:unequal_length") - if unequal: # one of X_inner_types must be capable of storing unequal length - internal_types = estimator_class.get_class_tag("X_inner_type") - if isinstance(internal_types, str): - assert internal_types in valid_types - else: # must be a list - assert bool(set(internal_types) & valid_types) - # Test can actually fit/predict with multivariate if tag is set - multivariate = estimator_class.get_class_tag("capability:multivariate") - if multivariate: - X = np.random.random((10, 2, 20)) - y = np.random.random(10) - inst = 
estimator_class.create_test_instance(parameter_set="default") - inst.fit(X, y) - inst.predict(X) - - def test_does_not_override_final_methods(self, estimator_class): - """Test does not override final methods.""" - if "fit" in estimator_class.__dict__: - raise ValueError(f"Classifier {estimator_class} overrides the method fit") - if "predict" in estimator_class.__dict__: - raise ValueError( - f"Classifier {estimator_class} overrides the method " f"predict" - ) diff --git a/aeon/testing/estimator_checking/_yield_classification_checks.py b/aeon/testing/estimator_checking/_yield_classification_checks.py index 33a3fca101..54fc89ffd0 100644 --- a/aeon/testing/estimator_checking/_yield_classification_checks.py +++ b/aeon/testing/estimator_checking/_yield_classification_checks.py @@ -1,3 +1,5 @@ +"""Tests for all classifiers.""" + import inspect import os import tempfile @@ -28,7 +30,8 @@ def _yield_classification_checks(estimator_class, estimator_instances, datatypes ) yield partial(check_classifier_tags_consistent, estimator_class=estimator_class) yield partial( - check_does_not_override_final_methods, estimator_class=estimator_class + check_classifier_does_not_override_final_methods, + estimator_class=estimator_class, ) # data type irrelevant @@ -41,7 +44,7 @@ def _yield_classification_checks(estimator_class, estimator_instances, datatypes if issubclass(estimator_class, BaseDeepClassifier): yield partial( - check_saving_loading_deep_learning_cls, + check_classifier_saving_loading_deep_learning, estimator_class=estimator_class, datatype=datatypes[0][0], ) @@ -58,7 +61,7 @@ def _yield_classification_checks(estimator_class, estimator_instances, datatypes if isinstance(estimator, BaseDeepClassifier): yield partial( - check_random_state_deep_learning, + check_classifier_random_state_deep_learning, estimator=estimator, datatype=datatypes[i][0], ) @@ -144,7 +147,7 @@ def check_classifier_tags_consistent(estimator_class): inst.predict_proba(X) -def check_does_not_override_final_methods(estimator_class): +def check_classifier_does_not_override_final_methods(estimator_class): """Test does not override final methods.""" final_methods = [ "fit", @@ -212,7 +215,7 @@ def check_contracted_classifier(estimator_class, datatype): ) -def check_saving_loading_deep_learning_cls(estimator_class, datatype): +def check_classifier_saving_loading_deep_learning(estimator_class, datatype): """Test Deep Classifier saving.""" with tempfile.TemporaryDirectory() as tmp: if not ( @@ -318,7 +321,7 @@ def check_classifier_train_estimate(estimator, datatype): np.testing.assert_almost_equal(train_proba.sum(axis=1), 1, decimal=4) -def check_random_state_deep_learning(estimator, datatype): +def check_classifier_random_state_deep_learning(estimator, datatype): """Test Deep Classifier seeding.""" random_state = 42 diff --git a/aeon/testing/estimator_checking/_yield_estimator_checks.py b/aeon/testing/estimator_checking/_yield_estimator_checks.py index 7426482c56..3cc212af8b 100644 --- a/aeon/testing/estimator_checking/_yield_estimator_checks.py +++ b/aeon/testing/estimator_checking/_yield_estimator_checks.py @@ -16,10 +16,14 @@ from aeon.classification import BaseClassifier from aeon.classification.deep_learning.base import BaseDeepClassifier from aeon.clustering.deep_learning.base import BaseDeepClusterer +from aeon.regression import BaseRegressor from aeon.regression.deep_learning.base import BaseDeepRegressor from aeon.testing.estimator_checking._yield_classification_checks import ( _yield_classification_checks, ) +from 
aeon.testing.estimator_checking._yield_regression_checks import (
+    _yield_regression_checks,
+)
 from aeon.testing.test_config import (
     NON_STATE_CHANGING_METHODS,
     NON_STATE_CHANGING_METHODS_ARRAYLIKE,
@@ -79,6 +83,11 @@ def _yield_all_aeon_checks(
             estimator_class, estimator_instances, datatypes
         )
 
+    if issubclass(estimator_class, BaseRegressor):
+        yield from _yield_regression_checks(
+            estimator_class, estimator_instances, datatypes
+        )
+
 
 def _yield_estimator_checks(estimator_class, estimator_instances, datatypes):
     """Yield all general checks for an aeon estimator."""
diff --git a/aeon/testing/estimator_checking/_yield_regression_checks.py b/aeon/testing/estimator_checking/_yield_regression_checks.py
index e69de29bb2..192a60fb00 100644
--- a/aeon/testing/estimator_checking/_yield_regression_checks.py
+++ b/aeon/testing/estimator_checking/_yield_regression_checks.py
@@ -0,0 +1,227 @@
+"""Tests for all regressors."""
+
+import os
+import tempfile
+import time
+from functools import partial
+from sys import platform
+
+import numpy as np
+from sklearn.utils._testing import set_random_state
+
+from aeon.base._base import _clone_estimator
+from aeon.datasets import load_cardano_sentiment, load_covid_3month
+from aeon.regression.deep_learning import BaseDeepRegressor
+from aeon.testing.expected_results.expected_regressor_outputs import (
+    cardano_sentiment_preds,
+    covid_3month_preds,
+)
+from aeon.testing.testing_data import FULL_TEST_DATA_DICT
+from aeon.testing.utils.estimator_checks import _assert_array_almost_equal
+
+
+def _yield_regression_checks(estimator_class, estimator_instances, datatypes):
+    """Yield all regression checks for an aeon regressor."""
+    # only class required
+    yield partial(
+        check_regressor_against_expected_results, estimator_class=estimator_class
+    )
+    yield partial(check_regressor_tags_consistent, estimator_class=estimator_class)
+    yield partial(
+        check_regressor_does_not_override_final_methods, estimator_class=estimator_class
+    )
+
+    # data type irrelevant
+    if issubclass(estimator_class, BaseDeepRegressor):
+        yield partial(
+            check_regressor_saving_loading_deep_learning,
+            estimator_class=estimator_class,
+            datatype=datatypes[0][0],
+        )
+
+    # test class instances
+    for i, estimator in enumerate(estimator_instances):
+        # data type irrelevant
+        if isinstance(estimator, BaseDeepRegressor):
+            yield partial(
+                check_regressor_random_state_deep_learning,
+                estimator=estimator,
+                datatype=datatypes[i][0],
+            )
+
+
+def check_regressor_against_expected_results(estimator_class):
+    """Test regressor against stored results."""
+    # we only use the first estimator instance for testing
+    classname = estimator_class.__name__
+
+    # We cannot guarantee same results on ARM macOS
+    if platform == "darwin":
+        return None
+
+    for data_name, data_dict, data_loader, data_seed in [
+        ["Covid3Month", covid_3month_preds, load_covid_3month, 0],
+        ["CardanoSentiment", cardano_sentiment_preds, load_cardano_sentiment, 0],
+    ]:
+        # retrieve expected predict output, and skip test if not available
+        if classname in data_dict.keys():
+            expected_preds = data_dict[classname]
+        else:
+            # skip test if no expected preds are registered
+            continue
+
+        # we only use the first estimator instance for testing
+        estimator_instance = estimator_class.create_test_instance(
+            parameter_set="results_comparison"
+        )
+        # set random seed if possible
+        set_random_state(estimator_instance, 0)
+
+        # load test data
+        X_train, y_train = data_loader(split="train")
+        X_test, y_test = data_loader(split="test")
+        indices_train = np.random.RandomState(data_seed).choice(
+            len(y_train), 10, replace=False
+        )
+        indices_test = np.random.RandomState(data_seed).choice(
+            len(y_test), 10, replace=False
+        )
+
+        # train regressor and predict
+        estimator_instance.fit(X_train[indices_train], y_train[indices_train])
+        y_pred = estimator_instance.predict(X_test[indices_test])
+
+        # assert predictions are the same
+        _assert_array_almost_equal(
+            y_pred,
+            expected_preds,
+            decimal=2,
+            err_msg=f"Failed to reproduce results for {classname} on {data_name}",
+        )
+
+
+def check_regressor_tags_consistent(estimator_class):
+    """Test the tag X_inner_type is consistent with capability:unequal_length."""
+    valid_types = {"np-list", "df-list", "pd-multivariate", "nested_univ"}
+    unequal = estimator_class.get_class_tag("capability:unequal_length")
+    if unequal:  # one of X_inner_types must be capable of storing unequal length
+        internal_types = estimator_class.get_class_tag("X_inner_type")
+        if isinstance(internal_types, str):
+            assert internal_types in valid_types
+        else:  # must be a list
+            assert bool(set(internal_types) & valid_types)
+    # Test can actually fit/predict with multivariate if tag is set
+    multivariate = estimator_class.get_class_tag("capability:multivariate")
+    if multivariate:
+        X = np.random.random((10, 2, 20))
+        y = np.random.random(10)
+        inst = estimator_class.create_test_instance(parameter_set="default")
+        inst.fit(X, y)
+        inst.predict(X)
+
+
+def check_regressor_does_not_override_final_methods(estimator_class):
+    """Test does not override final methods."""
+    if "fit" in estimator_class.__dict__:
+        raise ValueError(f"Regressor {estimator_class} overrides the method fit")
+    if "predict" in estimator_class.__dict__:
+        raise ValueError(
+            f"Regressor {estimator_class} overrides the method predict"
+        )
+
+
+def check_regressor_saving_loading_deep_learning(estimator_class, datatype):
+    """Test Deep Regressor saving."""
+    with tempfile.TemporaryDirectory() as tmp:
+        if not (
+            estimator_class.__name__
+            in [
+                "BaseDeepRegressor",
+                "InceptionTimeRegressor",
+                "LITETimeRegressor",
+                "TapNetRegressor",
+            ]
+        ):
+            if tmp[-1] != "/":
+                tmp = tmp + "/"
+            curr_time = str(time.time_ns())
+            last_file_name = curr_time + "last"
+            best_file_name = curr_time + "best"
+            init_file_name = curr_time + "init"
+
+            deep_rgs_train = estimator_class(
+                n_epochs=2,
+                save_best_model=True,
+                save_last_model=True,
+                save_init_model=True,
+                best_file_name=best_file_name,
+                last_file_name=last_file_name,
+                init_file_name=init_file_name,
+                file_path=tmp,
+            )
+            deep_rgs_train.fit(
+                FULL_TEST_DATA_DICT[datatype]["train"][0],
+                FULL_TEST_DATA_DICT[datatype]["train"][1],
+            )
+
+            deep_rgs_best = estimator_class()
+            deep_rgs_best.load_model(
+                model_path=os.path.join(tmp, best_file_name + ".keras"),
+            )
+            ypred_best = deep_rgs_best.predict(
+                FULL_TEST_DATA_DICT[datatype]["train"][0]
+            )
+            assert len(ypred_best) == len(FULL_TEST_DATA_DICT[datatype]["train"][1])
+
+            deep_rgs_last = estimator_class()
+            deep_rgs_last.load_model(
+                model_path=os.path.join(tmp, last_file_name + ".keras"),
+            )
+            ypred_last = deep_rgs_last.predict(
+                FULL_TEST_DATA_DICT[datatype]["train"][0]
+            )
+            assert len(ypred_last) == len(FULL_TEST_DATA_DICT[datatype]["train"][1])
+
+            deep_rgs_init = estimator_class()
+            deep_rgs_init.load_model(
+                model_path=os.path.join(tmp, init_file_name + ".keras"),
+            )
+            ypred_init = deep_rgs_init.predict(
+                FULL_TEST_DATA_DICT[datatype]["train"][0]
+            )
+            assert len(ypred_init) == len(FULL_TEST_DATA_DICT[datatype]["train"][1])
+
+
+def check_regressor_random_state_deep_learning(estimator, datatype): + """Test Deep Regressor seeding.""" + random_state = 42 + + deep_rgs1 = _clone_estimator(estimator, random_state=random_state) + deep_rgs1.fit( + FULL_TEST_DATA_DICT[datatype]["train"][0], + FULL_TEST_DATA_DICT[datatype]["train"][1], + ) + + layers1 = deep_rgs1.training_model_.layers[1:] + + deep_rgs2 = _clone_estimator(estimator, random_state=random_state) + deep_rgs2.fit( + FULL_TEST_DATA_DICT[datatype]["train"][0], + FULL_TEST_DATA_DICT[datatype]["train"][1], + ) + + layers2 = deep_rgs2.training_model_.layers[1:] + + assert len(layers1) == len(layers2) + + for i in range(len(layers1)): + weights1 = layers1[i].get_weights() + weights2 = layers2[i].get_weights() + + assert len(weights1) == len(weights2) + + for j in range(len(weights1)): + _weight1 = np.asarray(weights1[j]) + _weight2 = np.asarray(weights2[j]) + + np.testing.assert_almost_equal(_weight1, _weight2, 4) diff --git a/aeon/testing/test_all_estimators.py b/aeon/testing/test_all_estimators.py index a6dd584487..b65276b9dc 100644 --- a/aeon/testing/test_all_estimators.py +++ b/aeon/testing/test_all_estimators.py @@ -205,7 +205,7 @@ def _all_estimators(self): estimator_types=getattr(self, "estimator_type_filter", None), return_names=False, exclude_estimators=EXCLUDE_ESTIMATORS, - exclude_estimator_types=["classifier"], + exclude_estimator_types=["classifier", "regressor"], ) # subsample estimators by OS & python version diff --git a/aeon/testing/tests/test_all_estimators.py b/aeon/testing/tests/test_all_estimators.py index 454eb13c1e..a356c7eb0c 100644 --- a/aeon/testing/tests/test_all_estimators.py +++ b/aeon/testing/tests/test_all_estimators.py @@ -9,7 +9,7 @@ from aeon.utils.sampling import random_partition ALL_ESTIMATORS = all_estimators( - estimator_types=["classifier"], + estimator_types=["classifier", "regressor"], return_names=False, ) From 833d8fcecaac5e560d7bf9560f691389d2c404f0 Mon Sep 17 00:00:00 2001 From: MatthewMiddlehurst Date: Wed, 31 Jul 2024 12:16:42 +0100 Subject: [PATCH 09/15] exclude tapnet --- aeon/testing/test_config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/aeon/testing/test_config.py b/aeon/testing/test_config.py index cb11f3df59..0079ec6814 100644 --- a/aeon/testing/test_config.py +++ b/aeon/testing/test_config.py @@ -69,7 +69,8 @@ # has a keras fail, unknown reason, see #1387 "LearningShapeletClassifier": ["check_fit_deterministic"], # does not fit structure for test, needs investigation - "TapNetClassifier": ["check_random_state_deep_learning"], + "TapNetClassifier": ["check_classifier_random_state_deep_learning"], + "TapNetRegressor": ["check_regressor_random_state_deep_learning"], # needs investigation "SASTClassifier": ["check_fit_deterministic"], "RSASTClassifier": ["check_fit_deterministic"], From b437f3c37423118b2077bc15cd0f350d4edcf00e Mon Sep 17 00:00:00 2001 From: MatthewMiddlehurst Date: Wed, 31 Jul 2024 14:14:35 +0100 Subject: [PATCH 10/15] missing value test data --- aeon/testing/testing_data.py | 12 +++-- aeon/testing/tests/test_testing_data.py | 61 ++++++++++++++++++++++++- 2 files changed, 68 insertions(+), 5 deletions(-) diff --git a/aeon/testing/testing_data.py b/aeon/testing/testing_data.py index 8d1856d022..69b8594747 100644 --- a/aeon/testing/testing_data.py +++ b/aeon/testing/testing_data.py @@ -721,8 +721,10 @@ X_classification_missing_test[:, :, data_rng.choice(20, 2)] = np.nan MISSING_VALUES_CLASSIFICATION = { - "train": (X_classification_missing_train, 
y_classification_missing_train), - "test": (X_classification_missing_test, y_classification_missing_test), + "numpy3D": { + "train": (X_classification_missing_train, y_classification_missing_train), + "test": (X_classification_missing_test, y_classification_missing_test), + } } X_classification_missing_train, y_classification_missing_train = make_example_3d_numpy( @@ -743,8 +745,10 @@ X_classification_missing_test[:, :, data_rng.choice(20, 2)] = np.nan MISSING_VALUES_REGRESSION = { - "train": (X_classification_missing_train, y_classification_missing_train), - "test": (X_classification_missing_test, y_classification_missing_test), + "numpy3D": { + "train": (X_classification_missing_train, y_classification_missing_train), + "test": (X_classification_missing_test, y_classification_missing_test), + } } X_series = make_example_1d_numpy( diff --git a/aeon/testing/tests/test_testing_data.py b/aeon/testing/tests/test_testing_data.py index 466d2581d4..0eab61fdc8 100644 --- a/aeon/testing/tests/test_testing_data.py +++ b/aeon/testing/tests/test_testing_data.py @@ -9,12 +9,15 @@ EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION, EQUAL_LENGTH_UNIVARIATE_REGRESSION, FULL_TEST_DATA_DICT, + MISSING_VALUES_CLASSIFICATION, + MISSING_VALUES_REGRESSION, UNEQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION, UNEQUAL_LENGTH_MULTIVARIATE_REGRESSION, UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION, UNEQUAL_LENGTH_UNIVARIATE_REGRESSION, ) from aeon.utils.validation import ( + has_missing, is_collection, is_equal_length, is_single_series, @@ -53,6 +56,7 @@ def test_equal_length_univariate_collection(): ) assert is_univariate(EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["train"][0]) assert is_equal_length(EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["train"][0]) + assert not has_missing(EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["train"][0]) check_classification_targets( EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["train"][1] ) @@ -62,6 +66,7 @@ def test_equal_length_univariate_collection(): ) assert is_univariate(EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["test"][0]) assert is_equal_length(EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["test"][0]) + assert not has_missing(EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["test"][0]) check_classification_targets( EQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["test"][1] ) @@ -72,6 +77,7 @@ def test_equal_length_univariate_collection(): ) assert is_univariate(EQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["train"][0]) assert is_equal_length(EQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["train"][0]) + assert not has_missing(EQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["train"][0]) assert np.issubdtype( EQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["train"][1].dtype, np.integer ) or np.issubdtype( @@ -83,6 +89,7 @@ def test_equal_length_univariate_collection(): ) assert is_univariate(EQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["test"][0]) assert is_equal_length(EQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["test"][0]) + assert not has_missing(EQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["test"][0]) assert np.issubdtype( EQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["test"][1].dtype, np.integer ) or np.issubdtype( @@ -98,6 +105,9 @@ def test_unequal_length_univariate_collection(): assert not is_equal_length( UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["train"][0] ) + assert not has_missing( + UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["train"][0] + ) check_classification_targets( UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["train"][1] ) @@ -107,6 +117,7 @@ def test_unequal_length_univariate_collection(): assert not is_equal_length( 
UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["test"][0] ) + assert not has_missing(UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["test"][0]) check_classification_targets( UNEQUAL_LENGTH_UNIVARIATE_CLASSIFICATION[key]["test"][1] ) @@ -117,6 +128,7 @@ def test_unequal_length_univariate_collection(): assert not is_equal_length( UNEQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["train"][0] ) + assert not has_missing(UNEQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["train"][0]) assert np.issubdtype( UNEQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["train"][1].dtype, np.integer ) or np.issubdtype( @@ -126,6 +138,7 @@ def test_unequal_length_univariate_collection(): assert is_collection(UNEQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["test"][0]) assert is_univariate(UNEQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["test"][0]) assert not is_equal_length(UNEQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["test"][0]) + assert not has_missing(UNEQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["test"][0]) assert np.issubdtype( UNEQUAL_LENGTH_UNIVARIATE_REGRESSION[key]["test"][1].dtype, np.integer ) or np.issubdtype( @@ -143,6 +156,9 @@ def test_equal_length_multivariate_collection(): assert is_equal_length( EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["train"][0] ) + assert not has_missing( + EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["train"][0] + ) check_classification_targets( EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["train"][1] ) @@ -152,6 +168,7 @@ def test_equal_length_multivariate_collection(): EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["test"][0] ) assert is_equal_length(EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["test"][0]) + assert not has_missing(EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["test"][0]) check_classification_targets( EQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["test"][1] ) @@ -160,6 +177,7 @@ def test_equal_length_multivariate_collection(): assert is_collection(EQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["train"][0]) assert not is_univariate(EQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["train"][0]) assert is_equal_length(EQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["train"][0]) + assert not has_missing(EQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["train"][0]) assert np.issubdtype( EQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["train"][1].dtype, np.integer ) or np.issubdtype( @@ -169,6 +187,7 @@ def test_equal_length_multivariate_collection(): assert is_collection(EQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["test"][0]) assert not is_univariate(EQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["test"][0]) assert is_equal_length(EQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["test"][0]) + assert not has_missing(EQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["test"][0]) assert np.issubdtype( EQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["test"][1].dtype, np.integer ) or np.issubdtype( @@ -188,6 +207,9 @@ def test_unequal_length_multivariate_collection(): assert not is_equal_length( UNEQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["train"][0] ) + assert not has_missing( + UNEQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["train"][0] + ) check_classification_targets( UNEQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["train"][1] ) @@ -199,6 +221,9 @@ def test_unequal_length_multivariate_collection(): assert not is_equal_length( UNEQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["test"][0] ) + assert not has_missing( + UNEQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["test"][0] + ) check_classification_targets( UNEQUAL_LENGTH_MULTIVARIATE_CLASSIFICATION[key]["test"][1] ) @@ -211,6 +236,7 @@ def test_unequal_length_multivariate_collection(): assert not 
is_equal_length( UNEQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["train"][0] ) + assert not has_missing(UNEQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["train"][0]) assert np.issubdtype( UNEQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["train"][1].dtype, np.integer ) or np.issubdtype( @@ -222,6 +248,7 @@ def test_unequal_length_multivariate_collection(): assert not is_equal_length( UNEQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["test"][0] ) + assert not has_missing(UNEQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["test"][0]) assert np.issubdtype( UNEQUAL_LENGTH_MULTIVARIATE_REGRESSION[key]["test"][1].dtype, np.integer ) or np.issubdtype( @@ -230,4 +257,36 @@ def test_unequal_length_multivariate_collection(): def test_missing_values_collection(): - pass + """Test the contents of the missing value data dictionary.""" + for key in MISSING_VALUES_CLASSIFICATION: + assert is_collection(MISSING_VALUES_CLASSIFICATION[key]["train"][0]) + assert is_univariate(MISSING_VALUES_CLASSIFICATION[key]["train"][0]) + assert is_equal_length(MISSING_VALUES_CLASSIFICATION[key]["train"][0]) + assert has_missing(MISSING_VALUES_CLASSIFICATION[key]["train"][0]) + + check_classification_targets(MISSING_VALUES_CLASSIFICATION[key]["train"][1]) + + assert is_collection(MISSING_VALUES_CLASSIFICATION[key]["test"][0]) + assert is_univariate(MISSING_VALUES_CLASSIFICATION[key]["test"][0]) + assert is_equal_length(MISSING_VALUES_CLASSIFICATION[key]["test"][0]) + assert has_missing(MISSING_VALUES_CLASSIFICATION[key]["test"][0]) + check_classification_targets(MISSING_VALUES_CLASSIFICATION[key]["test"][1]) + + for key in MISSING_VALUES_REGRESSION: + assert is_collection(MISSING_VALUES_REGRESSION[key]["train"][0]) + assert is_univariate(MISSING_VALUES_REGRESSION[key]["train"][0]) + assert is_equal_length(MISSING_VALUES_REGRESSION[key]["train"][0]) + assert has_missing(MISSING_VALUES_REGRESSION[key]["train"][0]) + assert np.issubdtype( + MISSING_VALUES_REGRESSION[key]["train"][1].dtype, np.integer + ) or np.issubdtype( + MISSING_VALUES_REGRESSION[key]["train"][1].dtype, np.floating + ) + + assert is_collection(MISSING_VALUES_REGRESSION[key]["test"][0]) + assert is_univariate(MISSING_VALUES_REGRESSION[key]["test"][0]) + assert is_equal_length(MISSING_VALUES_REGRESSION[key]["test"][0]) + assert has_missing(MISSING_VALUES_REGRESSION[key]["test"][0]) + assert np.issubdtype( + MISSING_VALUES_REGRESSION[key]["test"][1].dtype, np.integer + ) or np.issubdtype(MISSING_VALUES_REGRESSION[key]["test"][1].dtype, np.floating) From e1296bd3e56986a6e63326ad62f33b9f54d0aecf Mon Sep 17 00:00:00 2001 From: MatthewMiddlehurst Date: Wed, 31 Jul 2024 15:34:36 +0100 Subject: [PATCH 11/15] clusterer tests --- ...test_random_state_deep_learning_cluster.py | 60 ------------- aeon/clustering/tests/test_all_clusterers.py | 43 ---------- .../_yield_clustering_checks.py | 84 +++++++++++++++++++ .../_yield_estimator_checks.py | 9 ++ aeon/testing/test_all_estimators.py | 2 +- aeon/testing/tests/test_all_estimators.py | 2 +- 6 files changed, 95 insertions(+), 105 deletions(-) delete mode 100644 aeon/clustering/deep_learning/tests/test_random_state_deep_learning_cluster.py delete mode 100644 aeon/clustering/tests/test_all_clusterers.py create mode 100644 aeon/testing/estimator_checking/_yield_clustering_checks.py diff --git a/aeon/clustering/deep_learning/tests/test_random_state_deep_learning_cluster.py b/aeon/clustering/deep_learning/tests/test_random_state_deep_learning_cluster.py deleted file mode 100644 index 580d5eccf0..0000000000 --- 
a/aeon/clustering/deep_learning/tests/test_random_state_deep_learning_cluster.py +++ /dev/null @@ -1,60 +0,0 @@ -"""Unit tests for clusterer deep learning random_state functionality.""" - -import inspect - -import numpy as np -import pytest - -from aeon.clustering import deep_learning -from aeon.testing.data_generation import make_example_3d_numpy - -__maintainer__ = ["hadifawaz1999"] - - -@pytest.mark.skipif( - # not _check_soft_dependencies("tensorflow", severity="none"), - # See Issue #1761 - True, - reason="skip test if required soft dependency not available", -) -def test_random_state_deep_learning_clr(): - """Test Deep Clusterer seeding.""" - random_state = 42 - - X, _ = make_example_3d_numpy(random_state=random_state) - - deep_clr_classes = [ - member[1] for member in inspect.getmembers(deep_learning, inspect.isclass) - ] - - for i in range(len(deep_clr_classes)): - if "BaseDeepClusterer" in str(deep_clr_classes[i]): - continue - - deep_clr1 = deep_clr_classes[i]( - n_clusters=2, random_state=random_state, n_epochs=4 - ) - deep_clr1.fit(X) - - layers1 = deep_clr1.training_model_.layers[1:] - - deep_clr2 = deep_clr_classes[i]( - n_clusters=2, random_state=random_state, n_epochs=4 - ) - deep_clr2.fit(X) - - layers2 = deep_clr2.training_model_.layers[1:] - - assert len(layers1) == len(layers2) - - for i in range(len(layers1)): - weights1 = layers1[i].get_weights() - weights2 = layers2[i].get_weights() - - assert len(weights1) == len(weights2) - - for j in range(len(weights1)): - _weight1 = np.asarray(weights1[j]) - _weight2 = np.asarray(weights2[j]) - - np.testing.assert_almost_equal(_weight1, _weight2, 4) diff --git a/aeon/clustering/tests/test_all_clusterers.py b/aeon/clustering/tests/test_all_clusterers.py deleted file mode 100644 index 98da489e29..0000000000 --- a/aeon/clustering/tests/test_all_clusterers.py +++ /dev/null @@ -1,43 +0,0 @@ -"""Test all clusterers comply to interface.""" - -import numpy as np -import pytest - -from aeon.registry import all_estimators -from aeon.utils.validation._dependencies import _check_soft_dependencies - -ALL_CLUSTERERS = all_estimators("clusterer", return_names=False) - - -@pytest.mark.parametrize("clst", ALL_CLUSTERERS) -def test_clusterer_tags_consistent(clst): - """Test all estimators capability tags reflect their capabilities.""" - if not _check_soft_dependencies( - clst.get_class_tag("python_dependencies", []), severity="none" - ): - return - - # Test the tag X_inner_type is consistent with capability:unequal_length - unequal_length = clst.get_class_tag("capability:unequal_length") - valid_types = {"np-list", "df-list", "pd-multivariate", "nested_univ"} - if unequal_length: # one of X_inner_types must be capable of storing unequal length - internal_types = clst.get_class_tag("X_inner_type") - if isinstance(internal_types, str): - assert internal_types in valid_types - else: # must be a list - assert bool(set(internal_types) & valid_types) - # Test can actually fit/predict with multivariate if tag is set - multivariate = clst.get_class_tag("capability:multivariate") - if multivariate: - X = np.random.random((10, 2, 10)) - inst = clst.create_test_instance(parameter_set="default") - inst.fit(X) - inst.predict(X) - inst.predict_proba(X) - - -@pytest.mark.parametrize("clst", ALL_CLUSTERERS) -def test_does_not_override_final_methods(clst): - """Test does not override final methods.""" - assert "fit" not in clst.__dict__ - assert "predict" not in clst.__dict__ diff --git a/aeon/testing/estimator_checking/_yield_clustering_checks.py 
b/aeon/testing/estimator_checking/_yield_clustering_checks.py new file mode 100644 index 0000000000..cf68855bb4 --- /dev/null +++ b/aeon/testing/estimator_checking/_yield_clustering_checks.py @@ -0,0 +1,84 @@ +"""Tests for all clusterers.""" + +from functools import partial + +import numpy as np + +from aeon.base._base import _clone_estimator +from aeon.clustering.deep_learning import BaseDeepClusterer +from aeon.testing.testing_data import FULL_TEST_DATA_DICT + + +def _yield_clustering_checks(estimator_class, estimator_instances, datatypes): + """Yield all clustering checks for an aeon clusterer.""" + # only class required + yield partial(check_clusterer_tags_consistent, estimator_class=estimator_class) + yield partial( + check_clusterer_does_not_override_final_methods, estimator_class=estimator_class + ) + + # test class instances + for i, estimator in enumerate(estimator_instances): + # data type irrelevant + if isinstance(estimator, BaseDeepClusterer): + yield partial( + check_clustering_random_state_deep_learning, + estimator=estimator, + datatype=datatypes[i][0], + ) + + +def check_clusterer_tags_consistent(estimator_class): + """Test all estimators capability tags reflect their capabilities.""" + # Test the tag X_inner_type is consistent with capability:unequal_length + unequal_length = estimator_class.get_class_tag("capability:unequal_length") + valid_types = {"np-list", "df-list", "pd-multivariate", "nested_univ"} + if unequal_length: # one of X_inner_types must be capable of storing unequal length + internal_types = estimator_class.get_class_tag("X_inner_type") + if isinstance(internal_types, str): + assert internal_types in valid_types + else: # must be a list + assert bool(set(internal_types) & valid_types) + # Test can actually fit/predict with multivariate if tag is set + multivariate = estimator_class.get_class_tag("capability:multivariate") + if multivariate: + X = np.random.random((10, 2, 10)) + inst = estimator_class.create_test_instance(parameter_set="default") + inst.fit(X) + inst.predict(X) + inst.predict_proba(X) + + +def check_clusterer_does_not_override_final_methods(estimator_class): + """Test does not override final methods.""" + assert "fit" not in estimator_class.__dict__ + assert "predict" not in estimator_class.__dict__ + + +def check_clustering_random_state_deep_learning(estimator, datatype): + """Test Deep Clusterer seeding.""" + random_state = 42 + + deep_clr1 = _clone_estimator(estimator, random_state=random_state) + deep_clr1.fit(FULL_TEST_DATA_DICT[datatype]["train"][0]) + + layers1 = deep_clr1.training_model_.layers[1:] + + deep_clr2 = _clone_estimator(estimator, random_state=random_state) + deep_clr2.fit(FULL_TEST_DATA_DICT[datatype]["train"][0]) + + layers2 = deep_clr2.training_model_.layers[1:] + + assert len(layers1) == len(layers2) + + for i in range(len(layers1)): + weights1 = layers1[i].get_weights() + weights2 = layers2[i].get_weights() + + assert len(weights1) == len(weights2) + + for j in range(len(weights1)): + _weight1 = np.asarray(weights1[j]) + _weight2 = np.asarray(weights2[j]) + + np.testing.assert_almost_equal(_weight1, _weight2, 4) diff --git a/aeon/testing/estimator_checking/_yield_estimator_checks.py b/aeon/testing/estimator_checking/_yield_estimator_checks.py index 3cc212af8b..ca5b36d797 100644 --- a/aeon/testing/estimator_checking/_yield_estimator_checks.py +++ b/aeon/testing/estimator_checking/_yield_estimator_checks.py @@ -15,12 +15,16 @@ from aeon.base._base import _clone_estimator from aeon.classification import BaseClassifier 
from aeon.classification.deep_learning.base import BaseDeepClassifier +from aeon.clustering import BaseClusterer from aeon.clustering.deep_learning.base import BaseDeepClusterer from aeon.regression import BaseRegressor from aeon.regression.deep_learning.base import BaseDeepRegressor from aeon.testing.estimator_checking._yield_classification_checks import ( _yield_classification_checks, ) +from aeon.testing.estimator_checking._yield_clustering_checks import ( + _yield_clustering_checks, +) from aeon.testing.estimator_checking._yield_regression_checks import ( _yield_regression_checks, ) @@ -88,6 +92,11 @@ def _yield_all_aeon_checks( estimator_class, estimator_instances, datatypes ) + if issubclass(estimator_class, BaseClusterer): + yield from _yield_clustering_checks( + estimator_class, estimator_instances, datatypes + ) + def _yield_estimator_checks(estimator_class, estimator_instances, datatypes): """Yield all general checks for an aeon estimator.""" diff --git a/aeon/testing/test_all_estimators.py b/aeon/testing/test_all_estimators.py index b65276b9dc..eb18e94051 100644 --- a/aeon/testing/test_all_estimators.py +++ b/aeon/testing/test_all_estimators.py @@ -205,7 +205,7 @@ def _all_estimators(self): estimator_types=getattr(self, "estimator_type_filter", None), return_names=False, exclude_estimators=EXCLUDE_ESTIMATORS, - exclude_estimator_types=["classifier", "regressor"], + exclude_estimator_types=["classifier", "regressor", "clusterer"], ) # subsample estimators by OS & python version diff --git a/aeon/testing/tests/test_all_estimators.py b/aeon/testing/tests/test_all_estimators.py index a356c7eb0c..372aae833e 100644 --- a/aeon/testing/tests/test_all_estimators.py +++ b/aeon/testing/tests/test_all_estimators.py @@ -9,7 +9,7 @@ from aeon.utils.sampling import random_partition ALL_ESTIMATORS = all_estimators( - estimator_types=["classifier", "regressor"], + estimator_types=["classifier", "regressor", "clusterer"], return_names=False, ) From f9bdf97e60f8e91d219c4739e169130e0497a9e5 Mon Sep 17 00:00:00 2001 From: MatthewMiddlehurst Date: Wed, 31 Jul 2024 22:40:36 +0100 Subject: [PATCH 12/15] fix --- aeon/testing/testing_data.py | 15 ++++++++++----- aeon/testing/tests/test_testing_data.py | 2 +- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/aeon/testing/testing_data.py b/aeon/testing/testing_data.py index 69b8594747..733736567d 100644 --- a/aeon/testing/testing_data.py +++ b/aeon/testing/testing_data.py @@ -831,9 +831,14 @@ } ) FULL_TEST_DATA_DICT.update( - {"MissingValues-Classification": MISSING_VALUES_CLASSIFICATION} + { + f"MissingValues-Classification-{k}": v + for k, v in MISSING_VALUES_CLASSIFICATION.items() + } +) +FULL_TEST_DATA_DICT.update( + {f"MissingValues-Regression-{k}": v for k, v in MISSING_VALUES_REGRESSION.items()} ) -FULL_TEST_DATA_DICT.update({"MissingValues-Regression": MISSING_VALUES_REGRESSION}) # Series FULL_TEST_DATA_DICT.update({"UnivariateSeries-NoLabel": UNIVARIATE_SERIES_NOLABEL}) FULL_TEST_DATA_DICT.update({"MultivariateSeries-NoLabel": MULTIVARIATE_SERIES_NOLABEL}) @@ -885,6 +890,9 @@ def _get_datatypes_for_estimator(estimator): s = f"UnequalLengthMultivariate-{label_type}-{inner_type}" if s in FULL_TEST_DATA_DICT: datatypes.append(s) + + if missing_values: + datatypes.append(f"MissingValues-{label_type}-numpy3D") elif isinstance(estimator, BaseSeriesEstimator): if univariate: datatypes.append("UnivariateSeries-NoLabel") @@ -895,9 +903,6 @@ def _get_datatypes_for_estimator(estimator): else: raise ValueError(f"Unknown estimator type: 
{type(estimator)}") - if missing_values: - datatypes.append(f"MissingValues-{label_type}") - if len(datatypes) == 0: raise ValueError(f"No valid data types found for estimator {estimator}") diff --git a/aeon/testing/tests/test_testing_data.py b/aeon/testing/tests/test_testing_data.py index 0eab61fdc8..ecf7ac4e24 100644 --- a/aeon/testing/tests/test_testing_data.py +++ b/aeon/testing/tests/test_testing_data.py @@ -25,7 +25,7 @@ ) -def test_test_data_dict(): +def test_testing_data_dict(): """Test the contents of the test data dictionary.""" for key in FULL_TEST_DATA_DICT: # format From 4ab5a71fda417c6a90f24e463a1bc068f2dcaa42 Mon Sep 17 00:00:00 2001 From: MatthewMiddlehurst Date: Thu, 15 Aug 2024 15:25:07 +0100 Subject: [PATCH 13/15] dummy random state --- aeon/clustering/dummy.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/aeon/clustering/dummy.py b/aeon/clustering/dummy.py index e46206cd93..e36d2b8d47 100644 --- a/aeon/clustering/dummy.py +++ b/aeon/clustering/dummy.py @@ -1,6 +1,7 @@ """Implements DummyClusterer to be used as Baseline.""" import numpy as np +from sklearn.utils import check_random_state from aeon.clustering.base import BaseClusterer @@ -22,10 +23,17 @@ class DummyClusterer(BaseClusterer): - "random": Assign clusters randomly. - "uniform": Distribute clusters uniformly among samples. - "single_cluster": Assign all samples to a single cluster. - n_clusters : int, default=3 The number of clusters to generate. This is relevant for "random" and "uniform" strategies. + random_state : int, np.random.RandomState instance or None, default=None + Determines random number generation for centroid initialization. + Only used when `strategy` is "random". + If `int`, random_state is the seed used by the random number generator; + If `np.random.RandomState` instance, + random_state is the random number generator; + If `None`, the random number generator is the `RandomState` instance used + by `np.random`. 
Attributes ---------- @@ -46,11 +54,12 @@ class DummyClusterer(BaseClusterer): array([0, 1, 0]) """ - def __init__(self, strategy="random", n_clusters=3): - super().__init__() + def __init__(self, strategy="random", n_clusters=3, random_state=None): self.strategy = strategy self.n_clusters = n_clusters - self.labels_ = None + self.random_state = random_state + + super().__init__() def _fit(self, X, y=None): """ @@ -72,7 +81,8 @@ def _fit(self, X, y=None): n_samples = X.shape[0] if self.strategy == "random": - self.labels_ = np.random.randint(0, self.n_clusters, n_samples) + rng = check_random_state(self.random_state) + self.labels_ = rng.randint(0, self.n_clusters, n_samples) elif self.strategy == "uniform": self.labels_ = np.tile( np.arange(self.n_clusters), n_samples // self.n_clusters + 1 From 47d598ecf47d690f3edd31cdbc496e48c49a939a Mon Sep 17 00:00:00 2001 From: MatthewMiddlehurst Date: Thu, 15 Aug 2024 15:55:43 +0100 Subject: [PATCH 14/15] dummy fix --- aeon/clustering/dummy.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/aeon/clustering/dummy.py b/aeon/clustering/dummy.py index e36d2b8d47..523f0748bb 100644 --- a/aeon/clustering/dummy.py +++ b/aeon/clustering/dummy.py @@ -46,20 +46,19 @@ class DummyClusterer(BaseClusterer): >>> import numpy as np >>> X = np.array([[1, 2], [3, 4], [5, 6]]) >>> clusterer = DummyClusterer(strategy="uniform", n_clusters=2) - >>> clusterer._fit(X) + >>> clusterer.fit(X) DummyClusterer(n_clusters=2, strategy='uniform') >>> clusterer.labels_ array([0, 1, 0]) - >>> clusterer._predict(X) + >>> clusterer.predict(X) array([0, 1, 0]) """ def __init__(self, strategy="random", n_clusters=3, random_state=None): self.strategy = strategy - self.n_clusters = n_clusters self.random_state = random_state - super().__init__() + super().__init__(n_clusters=n_clusters) def _fit(self, X, y=None): """ @@ -82,7 +81,7 @@ def _fit(self, X, y=None): if self.strategy == "random": rng = check_random_state(self.random_state) - self.labels_ = rng.randint(0, self.n_clusters, n_samples) + self.labels_ = rng.randint(self.n_clusters, size=n_samples) elif self.strategy == "uniform": self.labels_ = np.tile( np.arange(self.n_clusters), n_samples // self.n_clusters + 1 @@ -113,7 +112,8 @@ def _predict(self, X, y=None) -> np.ndarray: """ n_samples = X.shape[0] if self.strategy == "random": - return np.random.randint(0, self.n_clusters, n_samples) + rng = check_random_state(self.random_state) + return rng.randint(self.n_clusters, size=n_samples) elif self.strategy == "uniform": return np.tile( np.arange(self.n_clusters), n_samples // self.n_clusters + 1 From 9c5b1c500f09dab4fe7e8f25b4d8b089bbe2f208 Mon Sep 17 00:00:00 2001 From: MatthewMiddlehurst Date: Thu, 15 Aug 2024 23:49:43 +0100 Subject: [PATCH 15/15] fixes and skips --- aeon/clustering/feature_based/_catch22.py | 2 +- aeon/testing/test_config.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/aeon/clustering/feature_based/_catch22.py b/aeon/clustering/feature_based/_catch22.py index eac0af3730..6c716e249d 100644 --- a/aeon/clustering/feature_based/_catch22.py +++ b/aeon/clustering/feature_based/_catch22.py @@ -209,7 +209,7 @@ def _predict_proba(self, X) -> np.ndarray: n_clusters = self.n_clusters if n_clusters is None: n_clusters = int(max(preds)) + 1 - dists = np.zeros((X.shape[0], n_clusters)) + dists = np.zeros((len(X), n_clusters)) for i in range(n_cases): dists[i, preds[i]] = 1 return dists diff --git a/aeon/testing/test_config.py b/aeon/testing/test_config.py index 
07b1918c90..25decbc902 100644 --- a/aeon/testing/test_config.py +++ b/aeon/testing/test_config.py @@ -72,6 +72,8 @@ # needs investigation "SASTClassifier": ["check_fit_deterministic"], "RSASTClassifier": ["check_fit_deterministic"], + "AEFCNClusterer": ["check_fit_updates_state"], + "AEResNetClusterer": ["check_fit_updates_state"], } # We use estimator tags in addition to class hierarchies to further distinguish
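
The later patches move classifier, regressor and clusterer tests onto the same yield-based checking machinery. Since each check is a plain callable bound with `functools.partial`, the suite can also be driven by hand. A minimal sketch, using only names visible in the hunks above (the `_yield_*` helpers are internal, and the "numpy3D" datatype label passed here is a placeholder, since only the deep-learning seeding check reads it):

    import numpy as np

    from aeon.clustering.dummy import DummyClusterer
    from aeon.testing.estimator_checking._yield_clustering_checks import (
        _yield_clustering_checks,
    )

    clst = DummyClusterer(strategy="uniform", n_clusters=2)
    for check in _yield_clustering_checks(DummyClusterer, [clst], [["numpy3D"]]):
        check()  # each bound check raises an AssertionError on failure

The `random_state` parameter added to `DummyClusterer` in patches 13 and 14 should also make the "random" strategy reproducible, since both `_fit` and `_predict` now draw from `check_random_state(self.random_state)`:

    X = np.random.random((10, 1, 20))  # collection of 10 univariate series
    c1 = DummyClusterer(strategy="random", n_clusters=3, random_state=42).fit(X)
    c2 = DummyClusterer(strategy="random", n_clusters=3, random_state=42).fit(X)
    assert np.array_equal(c1.labels_, c2.labels_)
    assert np.array_equal(c1.predict(X), c2.predict(X))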