diff --git a/aeon/utils/__init__.py b/aeon/utils/__init__.py index d071a81405..ba84bfa02d 100644 --- a/aeon/utils/__init__.py +++ b/aeon/utils/__init__.py @@ -1,9 +1,6 @@ """Utility functionality.""" __all__ = [ - "get_cutoff", - "update_data", - "get_window", "split_series", "ALL_TIME_SERIES_TYPES", "COLLECTIONS_DATA_TYPES", @@ -21,4 +18,3 @@ ) from aeon.utils._show_versions import show_versions from aeon.utils._split import split_series -from aeon.utils.index_functions import get_cutoff, get_window, update_data diff --git a/aeon/utils/conversion/_convert_collection.py b/aeon/utils/conversion/_convert_collection.py index 10d69b9a25..6dd6510e06 100644 --- a/aeon/utils/conversion/_convert_collection.py +++ b/aeon/utils/conversion/_convert_collection.py @@ -14,7 +14,6 @@ For the seven supported, this gives 42 different converters. Rather than using them directly, we recommend using the conversion function convert_collection. -Legacy code supported "dask_panel" but it is not actually used anywhere; thus, removed. """ from collections.abc import Sequence diff --git a/aeon/utils/index_functions.py b/aeon/utils/index_functions.py deleted file mode 100644 index 66545915cd..0000000000 --- a/aeon/utils/index_functions.py +++ /dev/null @@ -1,548 +0,0 @@ -"""Index functions of dubious worth.""" - -import numpy as np -import pandas as pd - -from aeon.utils.conversion import convert_collection, convert_series -from aeon.utils.validation import ( - is_collection, - is_hierarchical, - is_single_series, - validate_input, -) - - -def _get_index(x): - if hasattr(x, "index"): - return x.index - elif isinstance(x, np.ndarray): - if x.ndim < 3: - return pd.RangeIndex(x.shape[0]) - else: - return pd.RangeIndex(x.shape[-1]) - else: - return pd.RangeIndex(x.shape[-1]) - - -def get_time_index(X): - """Get index of time series data, helper function. - - Parameters - ---------- - X : pd.DataFrame, pd.Series, np.ndarray - in one of the following data container types pd.DataFrame, pd.Series, - np.ndarray, pd-multiindex, nested_univ, pd_multiindex_hier - assumes all time series have equal length and equal index set - will *not* work for list-of-df, pd-wide, pd-long, numpy2D or np-list. - - Returns - ------- - time_index : pandas.Index - Index of time series - """ - # assumes that all samples share the same the time index, only looks at - # first row - if isinstance(X, (pd.DataFrame, pd.Series)): - # pd-multiindex or pd_multiindex_hier - if isinstance(X.index, pd.MultiIndex): - index_tuple = tuple(list(X.index[0])[:-1]) - index = X.loc[index_tuple].index - return index - # nested_univ - elif isinstance(X, pd.DataFrame) and isinstance(X.iloc[0, 0], pd.DataFrame): - return _get_index(X.iloc[0, 0]) - # pd.Series or pd.DataFrame - else: - return X.index - # numpy3D and np.ndarray - elif isinstance(X, np.ndarray): - # np.ndarray - if X.ndim < 3: - return pd.RangeIndex(X.shape[0]) - # numpy3D - else: - return pd.RangeIndex(X.shape[-1]) - elif hasattr(X, "X"): - return get_time_index(X.X) - else: - raise ValueError( - f"X must be pd.DataFrame, pd.Series, or np.ndarray, but found: {type(X)}" - ) - - -def get_index_for_series(obj, cutoff=0): - """Get pandas index for a Series object. - - Returns index even for numpy array, in that case a RangeIndex. - - Assumptions on obj are not checked, these should be validated separately. - Function may return unexpected results without prior validation. 
- - Parameters - ---------- - obj : aeon data container - must be of one of the following single series data structures: - pd.Series, pd.DataFrame, np.ndarray - cutoff : int, or pd.datetime, optional, default=0 - current cutoff, used to offset index if obj is np.ndarray - - Returns - ------- - index : pandas.Index, index for obj - """ - if hasattr(obj, "index"): - return obj.index - # now we know the object must be an np.ndarray - return pd.RangeIndex(cutoff, cutoff + obj.shape[0]) - - -def _get_cutoff_from_index(idx, return_index=False, reverse_order=False): - """Get cutoff = latest time point of pandas index. - - Assumptions on obj are not checked, these should be validated separately. - Function may return unexpected results without prior validation. - - Parameters - ---------- - obj : pd.Index, possibly MultiIndex, with last level assumed timelike or integer, - e.g., as in the pd.DataFrame, pd-multiindex, or pd_multiindex_hier types - return_index : bool, optional, default=False - whether a pd.Index object should be returned (True) - or a pandas compatible index element (False) - note: return_index=True may set freq attribute of time types to None - return_index=False will typically preserve freq attribute - reverse_order : bool, optional, default=False - if False, returns largest time index. If True, returns smallest time index - - Returns - ------- - cutoff_index : pandas compatible index element (if return_index=False) - pd.Index of length 1 (if return_index=True) - """ - if not isinstance(idx, pd.Index): - raise TypeError(f"idx must be a pd.Index, but found type {type(idx)}") - - # define "first" or "last" index depending on which is desired - if reverse_order: - ix = 0 - agg = min - else: - ix = -1 - agg = max - - if isinstance(idx, pd.MultiIndex): - tdf = pd.DataFrame(index=idx) - hix = idx.droplevel(-1) - freq = None - cutoff = None - for hi in hix: - ss = tdf.loc[hi].index - if hasattr(ss, "freq") and ss.freq is not None: - freq = ss.freq - if cutoff is not None: - cutoff = agg(cutoff, ss[ix]) - else: - cutoff = ss[ix] - time_idx = idx.get_level_values(-1).sort_values() - time_idx = pd.Index([cutoff]) - time_idx.freq = freq - else: - time_idx = idx - if hasattr(idx, "freq") and idx.freq is not None: - freq = idx.freq - else: - freq = None - - if not return_index: - return time_idx[ix] - res = time_idx[[ix]] - if hasattr(time_idx, "freq") and time_idx.freq is not None: - res.freq = time_idx.freq - return res - - -def get_cutoff( - obj, - cutoff=0, - return_index=False, - reverse_order=False, -): - """Get the latest time point of time series or collection of time series. - - Assumptions on obj are not checked, these should be validated separately. - Function may return unexpected results without prior validation. - - Parameters - ---------- - obj : aeon compatible time series data container or pandas.Index - if aeon time series, must be of Series, Collection, or Hierarchical abstract - type. if ``pandas.Index``, it is assumed that last level is time-like or integer - e.g., as in the pd.DataFrame, pd-multiindex, or pd_multiindex_hier internal - types. - cutoff : int, default=0 - Current cutoff, used to offset index if obj is np.ndarray - return_index : bool, default=False - Whether a pd.Index object should be returned (True) or a pandas compatible - index element (False). - note: return_index=True may set freq attribute of time types to None - return_index=False will typically preserve freq attribute. - reverse_order : bool, default=False - if False, returns largest time index. 
If True, returns smallest time index. - - Returns - ------- - cutoff_index : pandas compatible index element (if return_index=False) - pd.Index of length 1 (if return_index=True). - - Raises - ------ - ValueError, TypeError, if check_input or convert_input are True. - """ - # deal with legacy method of wrapping a Broadcaster - if hasattr(obj, "X"): - obj = obj.X - - if isinstance(obj, pd.Index): - return _get_cutoff_from_index( - idx=obj, return_index=return_index, reverse_order=reverse_order - ) - if not (is_hierarchical(obj) or is_collection(obj) or is_single_series(obj)): - raise ValueError( - "obj must be of Series, Collection, or Hierarchical abstract type" - ) - - if cutoff is None: - cutoff = 0 - elif isinstance(cutoff, pd.Index): - if not len(cutoff) == 1: - raise ValueError( - "if cutoff is a pd.Index, its length must be 1, but" - f" found a pd.Index with length {len(cutoff)}" - ) - if len(obj) == 0 and return_index: - return cutoff - cutoff = cutoff[0] - - if len(obj) == 0: - return cutoff - - # numpy3D (Collection) or np.npdarray (Series) - if isinstance(obj, np.ndarray): - if obj.ndim == 3: - cutoff_ind = obj.shape[-1] + cutoff - 1 - if obj.ndim < 3 and obj.ndim > 0: - cutoff_ind = obj.shape[0] + cutoff - 1 - if reverse_order: - cutoff_ind = cutoff - if return_index: - return pd.RangeIndex(cutoff_ind, cutoff_ind + 1) - else: - return cutoff_ind - - # define "first" or "last" index depending on which is desired - if reverse_order: - ix = 0 - agg = min - else: - ix = -1 - agg = max - - def sub_idx(idx, ix, return_index=True): - """Like sub-setting pd.index, but preserves freq attribute.""" - if not return_index: - return idx[ix] - res = idx[[ix]] - if hasattr(idx, "freq") and idx.freq is not None: - if res.freq != idx.freq: - res.freq = idx.freq - return res - - if isinstance(obj, pd.Series): - return sub_idx(obj.index, ix, return_index) - - # nested_univ (Collection) or pd.DataFrame(Series) - if isinstance(obj, pd.DataFrame) and not isinstance(obj.index, pd.MultiIndex): - objcols = [x for x in obj.columns if obj.dtypes[x] == "object"] - # pd.DataFrame - if len(objcols) == 0: - return sub_idx(obj.index, ix) if return_index else obj.index[ix] - # nested_univ - else: - idxx = [ - sub_idx(x.index, ix, return_index) for col in objcols for x in obj[col] - ] - return agg(idxx) - - # pd-multiindex (Collection) and pd_multiindex_hier (Hierarchical) - if isinstance(obj, pd.DataFrame) and isinstance(obj.index, pd.MultiIndex): - idx = obj.index - series_idx = [ - obj.loc[x].index.get_level_values(-1) for x in idx.droplevel(-1).unique() - ] - cutoffs = [sub_idx(x, ix, return_index) for x in series_idx] - return agg(cutoffs) - - # df-list (Collection) - if isinstance(obj, list): - idxs = [sub_idx(x.index, ix, return_index) for x in obj] - return agg(idxs) - - -SUPPORTED_SERIES = [ - "pd.DataFrame", - "np.ndarray", -] -SUPPORTED_COLLECTIONS = [ - "pd-multiindex", - "numpy3D", -] - - -def update_data(X, X_new=None): - """Update time series container with another one. - - Converts X, X_new to one of the valid internal types, if not one already. - - Parameters - ---------- - X : None, or aeon data container, in one of the following internal type formats - pd.DataFrame, pd.Series, np.ndarray, pd-multiindex, numpy3D, - pd_multiindex_hier. If not of that format, converted. 
- X_new : None, or aeon data container, should be same type as X, - or convert to same format when converting X - - Returns - ------- - X updated with X_new, with rows/indices in X_new added - entries in X_new overwrite X if at same index - numpy based containers will always be interpreted as having new row index - if one of X, X_new is None, returns the other; if both are None, returns None - """ - # Temporary measure to deal with legacy method of wrapping a Broadcaster - if hasattr(X, "X"): - X = X.X - if hasattr(X_new, "X"): - X_new = X_new.X - # we only need to modify X if X_new is not None - if X_new is None: - return X - - # if X is None, but X_new is not, return N_new - if X is None: - return X_new - - # we want to ensure that X, X_new are either numpy (1D, 2D, 3D) - # or in one of the long pandas formats if they are collections - if is_collection(X): - X = convert_collection(X, output_type="numpy3D") - if is_collection(X_new): - X_new = convert_collection(X_new, output_type="numpy3D") - - # update X with the new rows in X_new - # if X is np.ndarray, we assume all rows are new - if isinstance(X, np.ndarray): - # if 1D or 2D, axis 0 is "time" - if X_new.ndim in [1, 2]: - return np.concatenate([X, X_new], axis=0) - # if 3D, axis 2 is "time" - elif X_new.ndim == 3: - return np.concatenate([X, X_new], axis=2) - # if y is pandas, we use combine_first to update - elif isinstance(X_new, (pd.Series, pd.DataFrame)) and len(X_new) > 0: - return X_new.combine_first(X) - - -def _convert(obj, abstract_type, input_type): - reconvert = False - if abstract_type == "Series": - obj = convert_series(obj, SUPPORTED_SERIES) - if input_type == "pd.Series": - reconvert = True - elif abstract_type == "Panel": - if input_type not in SUPPORTED_COLLECTIONS: - obj = convert_collection(obj, "pd-multiindex") - reconvert = True - return obj, reconvert - - -def get_window(obj, window_length=None, lag=None): - """Slice obj to the time index window with given length and lag. - - Returns time series or time series collection with time indices strictly greater - than cutoff - lag - window_length, and equal or less than cutoff - lag. - Cutoff if of obj, as determined by get_cutoff. - This function does not work with pd.Series, hence the conversion to pd.DataFrame. - It also does not work with unequal length collections of series. - - Parameters - ---------- - obj : aeon compatible time series data container or None - if not None, must be of Series, Collection, or Hierarchical internal types. - all valid internal types are supported via conversion to internally supported - types to avoid conversions, pass data in one of SUPPORTED_SERIES or - SUPPORTED_COLLECTION - window_length : int or timedelta, optional, default=-inf - must be int if obj is int indexed, timedelta if datetime indexed - length of the window to slice to. 
Default = window of infinite size - lag : int, timedelta, or None optional, default = None (zero of correct type) - lag of the latest time in the window, with respect to cutoff of obj - if None, is internally replaced by a zero of type compatible with obj index - must be int if obj is int indexed or not pandas based - must be timedelta if obj is pandas based and datetime indexed - - Returns - ------- - obj sub-set to time indices in the semi-open interval - (cutoff - window_length - lag, cutoff - lag) - None if obj was None - """ - if obj is None or (window_length is None and lag is None): - return obj - valid, metadata = validate_input(obj) - if not valid: - raise ValueError("obj must be of Series, Collection, or Hierarchical type") - input_type = metadata["mtype"] - abstract_type = metadata["scitype"] - obj, reconvert = _convert(obj, abstract_type, input_type) - - # numpy3D (Collection) or np.npdarray (Series) - if isinstance(obj, np.ndarray): - # if 2D or 3D, we need to subset by last, not first dimension - # if 1D, we need to subset by first dimension - # to achieve that effect, we swap first and last in case of 2D, 3D - # and always subset on first dimension - if obj.ndim > 1: - obj = obj.swapaxes(1, -1) - obj_len = len(obj) - if lag is None: - lag = 0 - if window_length is None: - window_length = obj_len - window_start = max(-window_length - lag, -obj_len) - window_end = max(-lag, -obj_len) - # we need to swap first and last dimension back before returning, if done above - if window_end == 0: - obj_subset = obj[window_start:] - else: - obj_subset = obj[window_start:window_end] - if obj.ndim > 1: - obj_subset = obj_subset.swapaxes(1, -1) - return obj_subset - - # pd.DataFrame(Series), pd-multiindex (Collection) and pd_multiindex_hier ( - # Hierarchical) - if isinstance(obj, pd.DataFrame): - cutoff = get_cutoff(obj) - - if not isinstance(obj.index, pd.MultiIndex): - time_indices = obj.index - else: - time_indices = obj.index.get_level_values(-1) - - if lag is None: - win_end_incl = cutoff - win_select = time_indices <= win_end_incl - if window_length is not None: - win_start_excl = cutoff - window_length - win_select = win_select & (time_indices > win_start_excl) - else: - win_end_incl = cutoff - lag - win_select = time_indices <= win_end_incl - if window_length is not None: - win_start_excl = cutoff - window_length - lag - win_select = win_select & (time_indices > win_start_excl) - - obj_subset = obj.iloc[win_select] - if reconvert: - if abstract_type == "Series" and input_type == "pd.Series": - obj_subset = convert_series(obj_subset, input_type) - elif abstract_type == "Panel": - obj_subset = convert_collection(obj_subset, input_type) - - return obj_subset - - raise ValueError( - "passed get_window an object that is not of type np.ndarray or pd.DataFrame" - ) - - -def get_slice(obj, start=None, end=None): - """Slice obj with start (inclusive) and end (exclusive) indices. - - Returns time series or time series collection with time indices strictly greater - and equal to start index and less than end index. - - Parameters - ---------- - obj : aeon compatible time series data container or None - if not None, must be of Series, Collection, or Hierarchical type - in one of SUPPORTED_SERIES or SUPPORTED_COLLECTION. - start : int or timestamp, optional, default = None - must be int if obj is int indexed, timestamp if datetime indexed - Inclusive start of slice. Default = None. 
- If None, then no slice at the start - end : int or timestamp, optional, default = None - must be int if obj is int indexed, timestamp if datetime indexed - Exclusive end of slice. Default = None - If None, then no slice at the end - - Returns - ------- - obj sub-set sliced for `start` (inclusive) and `end` (exclusive) indices - None if obj was None - """ - if (start is None and end is None) or obj is None: - return obj - - valid, metadata = validate_input(obj) - if not valid: - raise ValueError("obj must be of Series, Collection or Hierarchical type") - input_type = metadata["mtype"] - abstract_type = metadata["scitype"] - obj, reconvert = _convert(obj, abstract_type, input_type) - - # numpy3D (Collection) or np.npdarray (Series) - # Assumes the index is integer so will be exclusive by default - if isinstance(obj, np.ndarray): - # if 2D or 3D, we need to subset by last, not first dimension - # if 1D, we need to subset by first dimension - # to achieve that effect, we swap first and last in case of 2D, 3D - # and always subset on first dimension - if obj.ndim > 1: - obj = obj.swapaxes(1, -1) - # subsetting - if start and end: - obj_subset = obj[start:end] - elif end: - obj_subset = obj[:end] - else: - obj_subset = obj[start:] - # we need to swap first and last dimension back before returning, if done above - if obj.ndim > 1: - obj_subset = obj_subset.swapaxes(1, -1) - return obj_subset - - # pd.DataFrame(Series), pd-multiindex (Collection) and pd_multiindex_hier ( - # Hierarchical) - # Assumes the index is pd.Timestamp or pd.Period and ensures the end is - # exclusive with slice_select - if isinstance(obj, pd.DataFrame): - if not isinstance(obj.index, pd.MultiIndex): - time_indices = obj.index - else: - time_indices = obj.index.get_level_values(-1) - - if start and end: - slice_select = (time_indices >= start) & (time_indices < end) - elif end: - slice_select = time_indices < end - elif start: - slice_select = time_indices >= start - - obj_subset = obj.iloc[slice_select] - if reconvert: - if abstract_type == "Series" and input_type == "pd.Series": - obj_subset = convert_series(obj_subset, input_type) - elif abstract_type == "Panel": - obj_subset = convert_collection(obj_subset, input_type) - return obj_subset diff --git a/aeon/utils/multiindex.py b/aeon/utils/multiindex.py deleted file mode 100644 index 9853f2653e..0000000000 --- a/aeon/utils/multiindex.py +++ /dev/null @@ -1,37 +0,0 @@ -"""Multiindex formatting related utilities.""" - -__maintainer__ = [] -__all__ = [] - -import pandas as pd - - -def underscore_join(iterable): - """Create flattened column names from multiindex tuple. - - Parameters - ---------- - iterable : an iterable - - Returns - ------- - str, for an iterable (x1, x2, ..., xn), returns the string - str(x1) + "__" + str(x2) + "__" + str(x3) + ... + "__" + str(xn) - """ - iterable_as_str = [str(x) for x in iterable] - return "__".join(iterable_as_str) - - -def flatten_multiindex(idx): - """Flatten a multiindex. 
- - Parameters - ---------- - idx: pandas.MultiIndex - - Returns - ------- - pandas.Index with str elements - i-th element of return is underscore_join of i-th element of `idx` - """ - return pd.Index([underscore_join(x) for x in idx]) diff --git a/aeon/utils/registry/__init__.py b/aeon/utils/registry/__init__.py deleted file mode 100644 index dce5672ee2..0000000000 --- a/aeon/utils/registry/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Registry.""" diff --git a/aeon/utils/registry/_data_types.py b/aeon/utils/registry/_data_types.py deleted file mode 100644 index dc080d6478..0000000000 --- a/aeon/utils/registry/_data_types.py +++ /dev/null @@ -1,11 +0,0 @@ -COLLECTIONS_DATA_TYPES = [ - "numpy3D", # 3D np.ndarray of format (n_cases, n_channels, n_timepoints) - "np-list", # python list of 2D numpy array of length [n_cases], - # each of shape (n_channels, n_timepoints_i) - "df-list", # python list of 2D pd.DataFrames of length [n_cases], each a of - # shape (n_timepoints_i, n_channels) - "numpy2D", # 2D np.ndarray of shape (n_cases, n_timepoints) - "pd-wide", # 2D pd.DataFrame of shape (n_cases, n_timepoints) - "nested_univ", # pd.DataFrame (n_cases, n_channels) with each cell a pd.Series, - "pd-multiindex", # pd.DataFrame with multi-index, -] diff --git a/aeon/utils/tests/test_index_functions.py b/aeon/utils/tests/test_index_functions.py deleted file mode 100644 index bcfbda4ece..0000000000 --- a/aeon/utils/tests/test_index_functions.py +++ /dev/null @@ -1,313 +0,0 @@ -"""Testing index functions.""" - -import numpy as np -import pandas as pd -import pytest -from pandas.api.types import is_integer_dtype - -from aeon.testing.data_generation import _make_hierarchical -from aeon.utils.index_functions import ( - _get_cutoff_from_index, - get_cutoff, - get_slice, - get_time_index, - get_window, -) - -cols = ["instances", "timepoints"] + [f"var_{i}" for i in range(2)] - -Xlist = [ - pd.DataFrame([[0, 0, 1, 4], [0, 1, 2, 5], [0, 2, 3, 6]], columns=cols), - pd.DataFrame([[1, 0, 1, 4], [1, 1, 2, 55], [1, 2, 3, 6]], columns=cols), - pd.DataFrame([[2, 0, 1, 42], [2, 1, 2, 5], [2, 2, 3, 6]], columns=cols), -] -X = pd.concat(Xlist) -multi_index = X.set_index(["instances", "timepoints"]) - -cols = ["foo", "bar", "timepoints"] + [f"var_{i}" for i in range(2)] -Xlist = [ - pd.DataFrame( - [["a", 0, 0, 1, 4], ["a", 0, 1, 2, 5], ["a", 0, 2, 3, 6]], columns=cols - ), - pd.DataFrame( - [["a", 1, 0, 1, 4], ["a", 1, 1, 2, 55], ["a", 1, 2, 3, 6]], columns=cols - ), - pd.DataFrame( - [["a", 2, 0, 1, 42], ["a", 2, 1, 2, 5], ["a", 2, 2, 3, 6]], columns=cols - ), - pd.DataFrame( - [["b", 0, 0, 1, 4], ["b", 0, 1, 2, 5], ["b", 0, 2, 3, 6]], columns=cols - ), - pd.DataFrame( - [["b", 1, 0, 1, 4], ["b", 1, 1, 2, 55], ["b", 1, 2, 3, 6]], columns=cols - ), - pd.DataFrame( - [["b", 2, 0, 1, 42], ["b", 2, 1, 2, 5], ["b", 2, 2, 3, 6]], columns=cols - ), -] -X = pd.concat(Xlist) -multi_index_hier = X.set_index(["foo", "bar", "timepoints"]) - -EXAMPLE_DATA = { - "pd.Series": pd.Series(np.random.rand(4)), - "pd.DataFrame": pd.DataFrame(np.random.rand(4, 2)), - "np.ndarray": np.random.rand(4, 2), - "numpy3D": np.random.rand(4, 2, 3), - "pd-multiindex": multi_index, - "pd-multiindex-hier": multi_index_hier, -} - - -@pytest.mark.parametrize("datatype", EXAMPLE_DATA.keys()) -def test_get_time_index(datatype): - """Tests that conversions agree with input data.""" - data = EXAMPLE_DATA[datatype] - idx = get_time_index(data) - - msg = f"get_time_index should return pd.Index, but found {type(idx)}" - assert isinstance(idx, pd.Index), msg - 
- if datatype in ["pd.Series", "pd.DataFrame"]: - assert (idx == data.index).all() - - if datatype in ["np.ndarray", "numpy3D"]: - assert isinstance(idx, pd.RangeIndex) - if datatype == "np.ndarray": - assert len(idx) == data.shape[0] - else: - assert len(idx) == data.shape[-1] - if isinstance(data, pd.MultiIndex): - assert isinstance(idx, pd.Index) - assert (idx == data.get_level_values(-1)).all() - if datatype in ["pd-multiindex", "pd_multiindex_hier"]: - exp_idx = data.index.get_level_values(-1).unique() - assert (idx == exp_idx).all() - - -@pytest.mark.parametrize("reverse_order", [True, False]) -@pytest.mark.parametrize("return_index", [True, False]) -@pytest.mark.parametrize("datatype", EXAMPLE_DATA.keys()) -def test_get_cutoff(datatype, return_index, reverse_order): - """Tests that get_cutoff has correct output. - - Parameters - ---------- - datatype : str - datatype of input - return_index : bool - whether index (True) or index element is returned (False) - reverse_order : bool - whether first (True) or last index (False) is retrieved - - Raises - ------ - AssertionError if get_cutoff does not return a length 1 pandas.index - for any fixture example of given scitype, mtype - """ - # retrieve example data structure - data = EXAMPLE_DATA[datatype] - - cutoff = get_cutoff( - data, - return_index=return_index, - reverse_order=reverse_order, - ) - - if return_index: - expected_types = pd.Index - cutoff_val = cutoff[0] - else: - expected_types = (int, float, np.int64, pd.Timestamp) - cutoff_val = cutoff - - msg = ( - f"incorrect return type of get_cutoff" - f"expected {expected_types}, found {type(cutoff)}" - ) - - assert isinstance(cutoff, expected_types), msg - - if return_index: - assert len(cutoff) == 1 - if isinstance(cutoff_val, (pd.Period, pd.Timestamp)): - assert hasattr(cutoff, "freq") and cutoff.freq is not None - - if isinstance(data, np.ndarray): - if reverse_order: - assert cutoff_val == 0 - else: - assert cutoff_val > 0 - - if datatype in ["pd.Series", "pd.DataFrame"]: - if reverse_order: - assert cutoff_val == data.index[0] - else: - assert cutoff_val == data.index[-1] - - if datatype in ["pd-multiindex", "pd_multiindex_hier"]: - time_idx = data.index.get_level_values(-1) - if reverse_order: - assert cutoff_val == time_idx.min() - else: - assert cutoff_val == time_idx.max() - - -@pytest.mark.parametrize("reverse_order", [True, False]) -def test_get_cutoff_from_index(reverse_order): - """Tests that _get_cutoff_from_index has correct output. 
- - Parameters - ---------- - return_index : bool - whether index (True) or index element is returned (False) - reverse_order : bool - whether first (True) or last index (False) is retrieved - - Raises - ------ - AssertionError if _get_cutoff_from_index does not return a length 1 pandas.index - AssertionError if _get_cutoff_from_index does not return the correct cutoff value - """ - hier_fixture = _make_hierarchical() - hier_idx = hier_fixture.index - - cutoff = _get_cutoff_from_index( - hier_idx, return_index=True, reverse_order=reverse_order - ) - idx = _get_cutoff_from_index( - hier_idx, return_index=False, reverse_order=reverse_order - ) - - assert isinstance(cutoff, pd.DatetimeIndex) and len(cutoff) == 1 - assert cutoff.freq == "D" - assert idx == cutoff[0] - - if reverse_order: - assert idx == pd.Timestamp("2000-01-01") - else: - assert idx == pd.Timestamp("2000-01-12") - series_fixture = EXAMPLE_DATA["pd.Series"] - series_idx = series_fixture.index - - cutoff = _get_cutoff_from_index( - series_idx, return_index=True, reverse_order=reverse_order - ) - idx = _get_cutoff_from_index( - series_idx, return_index=False, reverse_order=reverse_order - ) - - assert isinstance(cutoff, pd.Index) and len(cutoff) == 1 - assert is_integer_dtype(cutoff) - assert idx == cutoff[0] - - if reverse_order: - assert idx == 0 - else: - assert idx == 3 - - -@pytest.mark.parametrize("bad_inputs", ["foo", 12345, [[[]]]]) -def test_get_cutoff_wrong_input(bad_inputs): - """Tests that get_cutoff raises error on bad input when input checks are enabled. - - Parameters - ---------- - bad_inputs : inputs that should set off the input checks - - Raises - ------ - Exception (from pytest) if the error is not raised as expected - """ - with pytest.raises( - Exception, match="must be of Series, Collection, " "or Hierarchical" - ): - get_cutoff(bad_inputs) - - -@pytest.mark.parametrize("window_length, lag", [(2, 0), (None, 0), (4, 1)]) -# @pytest.mark.parametrize("datatype", EXAMPLE_DATA.keys()) -@pytest.mark.parametrize("datatype", ["pd.Series"]) -def test_get_window_output_type(datatype, window_length, lag): - """Tests that get_window runs for all types, and returns output of same mtype. 
- - Parameters - ---------- - datatype : str - datatype of input - window_length : int, passed to get_window - lag : int, passed to get_window - - Raises - ------ - Exception if get_window raises one - """ - # retrieve example fixture - data = EXAMPLE_DATA[datatype] - X = get_window(data, window_length=window_length, lag=lag) - assert isinstance(X, type(data)) - - -def test_get_window_expected_result(): - """Tests that get_window produces return of the right length.""" - X_df = EXAMPLE_DATA["pd.DataFrame"] - assert len(get_window(X_df, 2, 1)) == 2 - assert len(get_window(X_df, 3, 1)) == 3 - assert len(get_window(X_df, 1, 2)) == 1 - assert len(get_window(X_df, 3, 4)) == 0 - assert len(get_window(X_df, 3, None)) == 3 - assert len(get_window(X_df, None, 2)) == 2 - assert len(get_window(X_df, None, None)) == 4 - X_mi = EXAMPLE_DATA["pd-multiindex"] - assert len(get_window(X_mi, 3, 1)) == 6 - assert len(get_window(X_mi, 2, 0)) == 6 - assert len(get_window(X_mi, 2, 4)) == 0 - assert len(get_window(X_mi, 1, 2)) == 3 - assert len(get_window(X_mi, 2, None)) == 6 - assert len(get_window(X_mi, None, 2)) == 3 - assert len(get_window(X_mi, None, None)) == 9 - X_hi = EXAMPLE_DATA["pd-multiindex-hier"] - assert len(get_window(X_hi, 3, 1)) == 12 - assert len(get_window(X_hi, 2, 0)) == 12 - assert len(get_window(X_hi, 2, 4)) == 0 - assert len(get_window(X_hi, 1, 2)) == 6 - assert len(get_window(X_hi, 2, None)) == 12 - assert len(get_window(X_hi, None, 2)) == 6 - assert len(get_window(X_hi, None, None)) == 18 - X_np3d = np.random.rand(3, 2, 3) - assert get_window(X_np3d, 3, 1).shape == (2, 2, 3) - assert get_window(X_np3d, 2, 0).shape == (2, 2, 3) - assert get_window(X_np3d, 2, 4).shape == (0, 2, 3) - assert get_window(X_np3d, 1, 2).shape == (1, 2, 3) - assert get_window(X_np3d, 2, None).shape == (2, 2, 3) - assert get_window(X_np3d, None, 2).shape == (1, 2, 3) - assert get_window(X_np3d, None, None).shape == (3, 2, 3) - - -@pytest.mark.parametrize("datatype", EXAMPLE_DATA.keys()) -def test_get_slice_output_type(datatype): - """Tests that get_slice runs for all mtypes, and returns output of same mtype. - - Parameters - ---------- - scitype : str - scitype of input - mtype : str - mtype of input - - Raises - ------ - Exception if get_slice raises one - """ - # retrieve example fixture - data = EXAMPLE_DATA[datatype] - X = get_slice(data) - assert isinstance(X, type(data)) - - -def test_get_slice_expected_result(): - """Tests that get_slice produces return of the right length. - - Raises - ------ - Exception if get_slice raises one - """ - X_df = EXAMPLE_DATA["pd.DataFrame"] - assert len(get_slice(X_df, start=1, end=3)) == 2 - - X_s = EXAMPLE_DATA["pd.Series"] - assert len(get_slice(X_s, start=1, end=3)) == 2 - - X_np = EXAMPLE_DATA["numpy3D"] - assert get_slice(X_np, start=1, end=3).shape == (2, 2, 3) diff --git a/aeon/utils/validation/_input.py b/aeon/utils/validation/_input.py index 73b957cb45..5b0e410b52 100644 --- a/aeon/utils/validation/_input.py +++ b/aeon/utils/validation/_input.py @@ -36,12 +36,12 @@ def _abstract_type(input_type: str) -> str: Returns ------- str - Abstract type of the input, one of Series, Panel, Hierarchical or Unknown. + Abstract type of the input, one of Series, Collection, Hierarchical or Unknown. 
""" if input_type in SERIES: return "Series" if input_type in COLLECTIONS: - return "Panel" + return "Collection" if input_type in HIERARCHICAL: return "Hierarchical" return "Unknown" @@ -58,7 +58,7 @@ def abstract_types(input_types) -> list: Returns ------- list of str - Abstract type of the input, one of Series, Panel, Hierarchical or Unknown. + Abstract type of the input, one of Series, Collection, Hierarchical or Unknown. """ if isinstance(input_types, str): input_types = [input_types] diff --git a/aeon/utils/validation/collection.py b/aeon/utils/validation/collection.py index 53e124125d..f6e1835aa7 100644 --- a/aeon/utils/validation/collection.py +++ b/aeon/utils/validation/collection.py @@ -315,7 +315,7 @@ def is_equal_length(X): Raises ------ ValueError - input_type equals "dask_panel" or not in COLLECTIONS_DATA_TYPES. + input_type not in COLLECTIONS_DATA_TYPES. Examples -------- @@ -343,7 +343,7 @@ def has_missing(X): Raises ------ ValueError - Input_type equals "dask_panel" or not in COLLECTIONS_DATA_TYPES. + Input_type not in COLLECTIONS_DATA_TYPES. Examples -------- diff --git a/aeon/utils/validation/series.py b/aeon/utils/validation/series.py index 285bf7935d..f42c684a8b 100644 --- a/aeon/utils/validation/series.py +++ b/aeon/utils/validation/series.py @@ -3,7 +3,6 @@ __all__ = [ "check_series", "check_time_index", - "check_equal_time_index", "check_consistent_index_type", "is_hierarchical", "is_single_series", @@ -359,59 +358,6 @@ def check_time_index( return index -def check_equal_time_index(*ys, mode="equal"): - """Check that time series have the same (time) indices. - - Parameters - ---------- - *ys : tuple of aeon compatible time series data containers - must be pd.Series, pd.DataFrame or 1/2D np.ndarray, or None - can be Series, Panel, Hierarchical, but must be pandas or numpy - note: this assumption is not checked by the function itself - mode : str, "equal" or "contained", optional, default = "equal" - if "equal" will check for all indices being exactly equal - if "contained", will check whether all indices are subset of ys[0].index - - Raises - ------ - ValueError - if mode = "equal", raised if there are at least two non-None entries of ys - of which pandas indices are not the same - if mode = "contained, raised if there is at least one non-None ys[i] - such that ys[i].index is not contained in ys[o].index - np.ndarray are considered having (pandas) integer range index on axis 0 - """ - y_not_None = [y for y in ys if y is not None] - - # if there is no or just one element, there is nothing to compare - if len(y_not_None) < 2: - return None - - # only validate indices if data is passed as pd.Series - first_index = get_index_for_series(y_not_None[0]) - - for i, y in enumerate(y_not_None[1:]): - y_index = get_index_for_series(y) - - if mode == "equal": - failure_cond = not first_index.equals(y_index) - msg = ( - f"(time) indices are not the same, series 0 and {i} " - f"differ in the following: {first_index.symmetric_difference(y_index)}." - ) - elif mode == "contains": - failure_cond = not y_index.isin(first_index).all() - msg = ( - f"(time) indices of series {i} are not contained in index of series 0," - f" extra indices are: {y_index.difference(first_index)}" - ) - else: - raise ValueError('mode must be "equal" or "contains"') - - if failure_cond: - raise ValueError(msg) - - def check_consistent_index_type(a, b): """Check that two indices have consistent types. 
diff --git a/aeon/utils/validation/tests/test_input.py b/aeon/utils/validation/tests/test_input.py index 0c489d15a8..01fc994e61 100644 --- a/aeon/utils/validation/tests/test_input.py +++ b/aeon/utils/validation/tests/test_input.py @@ -25,7 +25,7 @@ def test_abstract_types(): for s in SERIES: assert _abstract_type(s) == "Series" for c in COLLECTIONS: - assert _abstract_type(c) == "Panel" + assert _abstract_type(c) == "Collection" for h in HIERARCHICAL: assert _abstract_type(h) == "Hierarchical" assert _abstract_type("Arsenal") == "Unknown" @@ -33,7 +33,7 @@ def test_abstract_types(): assert abstract_types(comb) == [ "Series", "Series", - "Panel", + "Collection", "Hierarchical", "Unknown", ] diff --git a/aeon/utils/validation/tests/test_series.py b/aeon/utils/validation/tests/test_series.py index ddf1889d54..c442782b4b 100644 --- a/aeon/utils/validation/tests/test_series.py +++ b/aeon/utils/validation/tests/test_series.py @@ -12,7 +12,6 @@ _check_pd_dataframe, _common_checks, check_consistent_index_type, - check_equal_time_index, check_is_univariate, check_series, check_time_index, @@ -36,24 +35,6 @@ def test_is_univariate_series(): assert not is_univariate_series(np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])) -def test_check_equal_time_index(): - """Test check equal time index.""" - assert check_equal_time_index(None) is None - x = (pd.Series([1, 2, 3, 4, 5]), pd.Series([2, 3, 4, 5, 6])) - with pytest.raises(ValueError, match="mode must be "): - check_equal_time_index(*x, mode="FOO") - index1 = pd.date_range(start="2023-01-01", end="2023-01-05") - index2 = pd.date_range(start="2023-01-06", end="2023-01-10") - ys = ( - pd.Series([1, 2, 3, 4, 5], index=index1), - pd.Series([6, 7, 8, 9, 10], index=index2), - ) - with pytest.raises(ValueError): - check_equal_time_index(*ys, mode="contains") - with pytest.raises(ValueError): - check_equal_time_index(*ys, mode="equal") - - def test__check_is_univariate(): """Test check_is_univariate.""" X = np.random.random(size=(10, 1, 20))
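For context on the removed helpers: as documented in the deleted index_functions.py above, get_window sliced an object to the time indices in the semi-open interval (cutoff - window_length - lag, cutoff - lag], where cutoff is the latest time index present. A minimal sketch of the equivalent operation with plain pandas on a "pd-multiindex" collection follows; it is not part of the patch, and the example frame, column name, and values are hypothetical, chosen only to illustrate the documented semantics.

# Minimal sketch (assumed example, not from the patch): reproduce the removed
# get_window(obj, window_length, lag) behaviour on a "pd-multiindex" collection.
import pandas as pd

# Hypothetical collection: outer level = case id, inner level = integer time index.
X = pd.DataFrame(
    {"var_0": range(6)},
    index=pd.MultiIndex.from_product(
        [[0, 1], [0, 1, 2]], names=["instances", "timepoints"]
    ),
)

window_length, lag = 2, 0
time_idx = X.index.get_level_values(-1)   # time level of the MultiIndex
cutoff = time_idx.max()                   # latest time point, as get_cutoff returned

# Keep rows whose time index lies in (cutoff - window_length - lag, cutoff - lag].
mask = (time_idx > cutoff - window_length - lag) & (time_idx <= cutoff - lag)
X_window = X[mask]                        # last window_length time points per case
print(X_window)

With window_length=2 and lag=0 this keeps the last two time points of every case, which is the same rule behind the expectations in the deleted test_get_window_expected_result, e.g. get_window(X_mi, 2, 0) returning two time points per case.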