diff --git a/aeon/utils/conversion/_convert_series.py b/aeon/utils/conversion/_convert_series.py index be6db5bb76..7693d52443 100644 --- a/aeon/utils/conversion/_convert_series.py +++ b/aeon/utils/conversion/_convert_series.py @@ -99,13 +99,25 @@ def convert_series(y, output_type): if output_type == "np.ndarray": return y.to_numpy() if output_type == "pd.Series": - y = y.squeeze() - if y.ndim > 1: - raise ValueError( - "DataFrame of more than one row or column, cannot convert to " - "pd.Series" - ) - return pd.Series(y) + if input_type == "pd.DataFrame": + if y.shape == (1, 1): # special case of single element, cant squeeze + y = y[y.columns[0]] + else: + y = y.squeeze() + if y.ndim > 1: + raise ValueError( + "pd.DataFrame of more than one row or column, cannot convert to " + "pd.Series" + ) + return y + elif input_type == "np.ndarray": + y = y.squeeze() + if y.ndim > 1: + raise ValueError( + "np.ndarray of more than one row or column, cannot convert to " + "pd.Series" + ) + return pd.Series(y) if output_type == "pd.DataFrame": if input_type == "pd.Series": return y.to_frame() diff --git a/aeon/utils/conversion/tests/test_convert_series.py b/aeon/utils/conversion/tests/test_convert_series.py index a950f5b714..439c074f6c 100644 --- a/aeon/utils/conversion/tests/test_convert_series.py +++ b/aeon/utils/conversion/tests/test_convert_series.py @@ -106,3 +106,12 @@ def test_convert_series_lists(): test.insert(0, "FOO") x = convert_series(UNIVARIATE[i], test) assert x.__class__.__name__ == test[1].split(".")[1] + + +def test_convert_series_single_element(): + """Test a DataFrame with a single element is correctly converted to a pd.Series.""" + x = pd.DataFrame([1]) + x = pd.DataFrame({"0": [10]}, index=pd.to_datetime(["2000-02-18"])) + y = convert_series(x, "pd.Series") + assert isinstance(y, pd.Series) + assert isinstance(y.index, pd.DatetimeIndex) diff --git a/aeon/utils/index_functions.py b/aeon/utils/index_functions.py index 867727c3d7..66545915cd 100644 --- a/aeon/utils/index_functions.py +++ b/aeon/utils/index_functions.py @@ -3,8 +3,7 @@ import numpy as np import pandas as pd -from aeon.datatypes import convert_to -from aeon.datatypes._vec_df import _VectorizedDF +from aeon.utils.conversion import convert_collection, convert_series from aeon.utils.validation import ( is_collection, is_hierarchical, @@ -30,11 +29,11 @@ def get_time_index(X): Parameters ---------- - X : pd.DataFrame, pd.Series, np.ndarray, or VectorizedDF - in one of the following aeon mtype specifications for Series, Panel, Hierarchical: - pd.DataFrame, pd.Series, np.ndarray, pd-multiindex, nested_univ, pd_multiindex_hier - assumes all time series have equal length and equal index set - will *not* work for list-of-df, pd-wide, pd-long, numpy2D + X : pd.DataFrame, pd.Series, np.ndarray + in one of the following data container types pd.DataFrame, pd.Series, + np.ndarray, pd-multiindex, nested_univ, pd_multiindex_hier + assumes all time series have equal length and equal index set + will *not* work for list-of-df, pd-wide, pd-long, numpy2D or np-list. Returns ------- @@ -82,8 +81,8 @@ def get_index_for_series(obj, cutoff=0): Parameters ---------- obj : aeon data container - must be of one of the following mtypes: - pd.Series, pd.DataFrame, np.ndarray, of Series scitype + must be of one of the following single series data structures: + pd.Series, pd.DataFrame, np.ndarray cutoff : int, or pd.datetime, optional, default=0 current cutoff, used to offset index if obj is np.ndarray @@ -106,7 +105,7 @@ def _get_cutoff_from_index(idx, return_index=False, reverse_order=False): Parameters ---------- obj : pd.Index, possibly MultiIndex, with last level assumed timelike or integer, - e.g., as in the pd.DataFrame, pd-multiindex, or pd_multiindex_hier mtypes + e.g., as in the pd.DataFrame, pd-multiindex, or pd_multiindex_hier types return_index : bool, optional, default=False whether a pd.Index object should be returned (True) or a pandas compatible index element (False) @@ -199,7 +198,7 @@ def get_cutoff( ------ ValueError, TypeError, if check_input or convert_input are True. """ - # deal with VectorizedDF + # deal with legacy method of wrapping a Broadcaster if hasattr(obj, "X"): obj = obj.X @@ -208,7 +207,9 @@ def get_cutoff( idx=obj, return_index=return_index, reverse_order=reverse_order ) if not (is_hierarchical(obj) or is_collection(obj) or is_single_series(obj)): - raise ValueError("obj must be of Series, Panel, or Hierarchical abstract type") + raise ValueError( + "obj must be of Series, Collection, or Hierarchical abstract type" + ) if cutoff is None: cutoff = 0 @@ -225,7 +226,7 @@ def get_cutoff( if len(obj) == 0: return cutoff - # numpy3D (Panel) or np.npdarray (Series) + # numpy3D (Collection) or np.npdarray (Series) if isinstance(obj, np.ndarray): if obj.ndim == 3: cutoff_ind = obj.shape[-1] + cutoff - 1 @@ -259,7 +260,7 @@ def sub_idx(idx, ix, return_index=True): if isinstance(obj, pd.Series): return sub_idx(obj.index, ix, return_index) - # nested_univ (Panel) or pd.DataFrame(Series) + # nested_univ (Collection) or pd.DataFrame(Series) if isinstance(obj, pd.DataFrame) and not isinstance(obj.index, pd.MultiIndex): objcols = [x for x in obj.columns if obj.dtypes[x] == "object"] # pd.DataFrame @@ -272,7 +273,7 @@ def sub_idx(idx, ix, return_index=True): ] return agg(idxx) - # pd-multiindex (Panel) and pd_multiindex_hier (Hierarchical) + # pd-multiindex (Collection) and pd_multiindex_hier (Hierarchical) if isinstance(obj, pd.DataFrame) and isinstance(obj.index, pd.MultiIndex): idx = obj.index series_idx = [ @@ -281,34 +282,34 @@ def sub_idx(idx, ix, return_index=True): cutoffs = [sub_idx(x, ix, return_index) for x in series_idx] return agg(cutoffs) - # df-list (Panel) + # df-list (Collection) if isinstance(obj, list): idxs = [sub_idx(x.index, ix, return_index) for x in obj] return agg(idxs) -UPDATE_DATA_INTERNAL_MTYPES = [ +SUPPORTED_SERIES = [ "pd.DataFrame", - "pd.Series", "np.ndarray", +] +SUPPORTED_COLLECTIONS = [ "pd-multiindex", "numpy3D", - "pd_multiindex_hier", ] def update_data(X, X_new=None): """Update time series container with another one. - Coerces X, X_new to one of the assumed mtypes, if not already of that type. + Converts X, X_new to one of the valid internal types, if not one already. Parameters ---------- - X : None, or aeon data container, in one of the following mtype formats + X : None, or aeon data container, in one of the following internal type formats pd.DataFrame, pd.Series, np.ndarray, pd-multiindex, numpy3D, - pd_multiindex_hier. If not of that format, coerced. - X_new : None, or aeon data container, should be same mtype as X, - or convert to same format when converting to format list via convert_to + pd_multiindex_hier. If not of that format, converted. + X_new : None, or aeon data container, should be same type as X, + or convert to same format when converting X Returns ------- @@ -317,17 +318,11 @@ def update_data(X, X_new=None): numpy based containers will always be interpreted as having new row index if one of X, X_new is None, returns the other; if both are None, returns None """ - # if X or X_new is vectorized, unwrap it first - if isinstance(X, _VectorizedDF): + # Temporary measure to deal with legacy method of wrapping a Broadcaster + if hasattr(X, "X"): X = X.X - if isinstance(X_new, _VectorizedDF): + if hasattr(X_new, "X"): X_new = X_new.X - - # we want to ensure that X, X_new are either numpy (1D, 2D, 3D) - # or in one of the long pandas formats - X = convert_to(X, to_type=UPDATE_DATA_INTERNAL_MTYPES) - X_new = convert_to(X_new, to_type=UPDATE_DATA_INTERNAL_MTYPES) - # we only need to modify X if X_new is not None if X_new is None: return X @@ -336,6 +331,13 @@ def update_data(X, X_new=None): if X is None: return X_new + # we want to ensure that X, X_new are either numpy (1D, 2D, 3D) + # or in one of the long pandas formats if they are collections + if is_collection(X): + X = convert_collection(X, output_type="numpy3D") + if is_collection(X_new): + X_new = convert_collection(X_new, output_type="numpy3D") + # update X with the new rows in X_new # if X is np.ndarray, we assume all rows are new if isinstance(X, np.ndarray): @@ -350,29 +352,35 @@ def update_data(X, X_new=None): return X_new.combine_first(X) -GET_WINDOW_SUPPORTED_MTYPES = [ - "pd.DataFrame", - "pd-multiindex", - "pd_multiindex_hier", - "np.ndarray", - "numpy3D", -] +def _convert(obj, abstract_type, input_type): + reconvert = False + if abstract_type == "Series": + obj = convert_series(obj, SUPPORTED_SERIES) + if input_type == "pd.Series": + reconvert = True + elif abstract_type == "Panel": + if input_type not in SUPPORTED_COLLECTIONS: + obj = convert_collection(obj, "pd-multiindex") + reconvert = True + return obj, reconvert def get_window(obj, window_length=None, lag=None): """Slice obj to the time index window with given length and lag. - Returns time series or time series panel with time indices - strictly greater than cutoff - lag - window_length, and - equal or less than cutoff - lag. + Returns time series or time series collection with time indices strictly greater + than cutoff - lag - window_length, and equal or less than cutoff - lag. Cutoff if of obj, as determined by get_cutoff. + This function does not work with pd.Series, hence the conversion to pd.DataFrame. + It also does not work with unequal length collections of series. Parameters ---------- obj : aeon compatible time series data container or None - if not None, must be of Series, Panel, or Hierarchical scitype - all mtypes are supported via conversion to internally supported types - to avoid conversions, pass data in one of GET_WINDOW_SUPPORTED_MTYPES + if not None, must be of Series, Collection, or Hierarchical internal types. + all valid internal types are supported via conversion to internally supported + types to avoid conversions, pass data in one of SUPPORTED_SERIES or + SUPPORTED_COLLECTION window_length : int or timedelta, optional, default=-inf must be int if obj is int indexed, timedelta if datetime indexed length of the window to slice to. Default = window of infinite size @@ -392,10 +400,10 @@ def get_window(obj, window_length=None, lag=None): return obj valid, metadata = validate_input(obj) if not valid: - raise ValueError("obj must be of Series, Collection, or Hierarchical scitype") - obj_in_mtype = metadata["mtype"] - - obj = convert_to(obj, GET_WINDOW_SUPPORTED_MTYPES) + raise ValueError("obj must be of Series, Collection, or Hierarchical type") + input_type = metadata["mtype"] + abstract_type = metadata["scitype"] + obj, reconvert = _convert(obj, abstract_type, input_type) # numpy3D (Collection) or np.npdarray (Series) if isinstance(obj, np.ndarray): @@ -445,8 +453,13 @@ def get_window(obj, window_length=None, lag=None): win_select = win_select & (time_indices > win_start_excl) obj_subset = obj.iloc[win_select] + if reconvert: + if abstract_type == "Series" and input_type == "pd.Series": + obj_subset = convert_series(obj_subset, input_type) + elif abstract_type == "Panel": + obj_subset = convert_collection(obj_subset, input_type) - return convert_to(obj_subset, obj_in_mtype) + return obj_subset raise ValueError( "passed get_window an object that is not of type np.ndarray or pd.DataFrame" @@ -456,16 +469,14 @@ def get_window(obj, window_length=None, lag=None): def get_slice(obj, start=None, end=None): """Slice obj with start (inclusive) and end (exclusive) indices. - Returns time series or time series panel with time indices - strictly greater and equal to start index and less than - end index. + Returns time series or time series collection with time indices strictly greater + and equal to start index and less than end index. Parameters ---------- obj : aeon compatible time series data container or None - if not None, must be of Series, Panel, or Hierarchical scitype - all mtypes are supported via conversion to internally supported types - to avoid conversions, pass data in one of GET_WINDOW_SUPPORTED_MTYPES + if not None, must be of Series, Collection, or Hierarchical type + in one of SUPPORTED_SERIES or SUPPORTED_COLLECTION. start : int or timestamp, optional, default = None must be int if obj is int indexed, timestamp if datetime indexed Inclusive start of slice. Default = None. @@ -485,12 +496,12 @@ def get_slice(obj, start=None, end=None): valid, metadata = validate_input(obj) if not valid: - raise ValueError("obj must be of Series, Panel, or Hierarchical scitype") - obj_in_mtype = metadata["mtype"] - - obj = convert_to(obj, GET_WINDOW_SUPPORTED_MTYPES) + raise ValueError("obj must be of Series, Collection or Hierarchical type") + input_type = metadata["mtype"] + abstract_type = metadata["scitype"] + obj, reconvert = _convert(obj, abstract_type, input_type) - # numpy3D (Panel) or np.npdarray (Series) + # numpy3D (Collection) or np.npdarray (Series) # Assumes the index is integer so will be exclusive by default if isinstance(obj, np.ndarray): # if 2D or 3D, we need to subset by last, not first dimension @@ -511,7 +522,8 @@ def get_slice(obj, start=None, end=None): obj_subset = obj_subset.swapaxes(1, -1) return obj_subset - # pd.DataFrame(Series), pd-multiindex (Panel) and pd_multiindex_hier (Hierarchical) + # pd.DataFrame(Series), pd-multiindex (Collection) and pd_multiindex_hier ( + # Hierarchical) # Assumes the index is pd.Timestamp or pd.Period and ensures the end is # exclusive with slice_select if isinstance(obj, pd.DataFrame): @@ -528,8 +540,9 @@ def get_slice(obj, start=None, end=None): slice_select = time_indices >= start obj_subset = obj.iloc[slice_select] - return convert_to(obj_subset, obj_in_mtype) - - raise ValueError( - "bug in get_slice, unreachable condition, ifs should be exhaustive" - ) + if reconvert: + if abstract_type == "Series" and input_type == "pd.Series": + obj_subset = convert_series(obj_subset, input_type) + elif abstract_type == "Panel": + obj_subset = convert_collection(obj_subset, input_type) + return obj_subset diff --git a/aeon/utils/tests/test_index_functions.py b/aeon/utils/tests/test_index_functions.py index 4caf3a3d88..5e9e75459e 100644 --- a/aeon/utils/tests/test_index_functions.py +++ b/aeon/utils/tests/test_index_functions.py @@ -213,14 +213,17 @@ def test_get_cutoff_wrong_input(bad_inputs): ------ Exception (from pytest) if the error is not raised as expected """ - with pytest.raises(Exception, match="must be of Series, Panel, or Hierarchical"): + with pytest.raises( + Exception, match="must be of Series, Collection, " "or Hierarchical" + ): get_cutoff(bad_inputs) @pytest.mark.parametrize("window_length, lag", [(2, 0), (None, 0), (4, 1)]) -@pytest.mark.parametrize("datatype", EXAMPLE_DATA.keys()) +# @pytest.mark.parametrize("datatype", EXAMPLE_DATA.keys()) +@pytest.mark.parametrize("datatype", ["pd.Series"]) def test_get_window_output_type(datatype, window_length, lag): - """Tests that get_window runs for all mtypes, and returns output of same mtype. + """Tests that get_window runs for all types, and returns output of same mtype. Parameters ----------