[ENH] remove convert_to and vectorized import from index_functions (#…

…1433) * remove conversion * tidy * comments * examples tests * vectorise DF * convert tests * add list for output types and tests for convert_series * docstring for get_cutoff * convert_to in get window * convert_series single element pd.Series * remove Vectorized input from index_functions.py * remove Vectorized input from index_functions.py * remove Vectorized input from index_functions.py * sort out 1 element conversion * change get_slice * change update * change update * convert_collection * typo * comments * tests * docstrings
aeon-toolkit · Apr 18, 2024 · 5bc5c31 · 5bc5c31
1 parent 3e9d959
commit 5bc5c31
Show file tree

Hide file tree

Showing 4 changed files with 115 additions and 78 deletions.
diff --git a/aeon/utils/conversion/_convert_series.py b/aeon/utils/conversion/_convert_series.py
@@ -99,13 +99,25 @@ def convert_series(y, output_type):
     if output_type == "np.ndarray":
         return y.to_numpy()
     if output_type == "pd.Series":
-        y = y.squeeze()
-        if y.ndim > 1:
-            raise ValueError(
-                "DataFrame of more than one row or column, cannot convert to "
-                "pd.Series"
-            )
-        return pd.Series(y)
+        if input_type == "pd.DataFrame":
+            if y.shape == (1, 1):  # special case of single element, cant squeeze
+                y = y[y.columns[0]]
+            else:
+                y = y.squeeze()
+            if y.ndim > 1:
+                raise ValueError(
+                    "pd.DataFrame of more than one row or column, cannot convert to "
+                    "pd.Series"
+                )
+            return y
+        elif input_type == "np.ndarray":
+            y = y.squeeze()
+            if y.ndim > 1:
+                raise ValueError(
+                    "np.ndarray of more than one row or column, cannot convert to "
+                    "pd.Series"
+                )
+            return pd.Series(y)
     if output_type == "pd.DataFrame":
         if input_type == "pd.Series":
             return y.to_frame()

diff --git a/aeon/utils/conversion/tests/test_convert_series.py b/aeon/utils/conversion/tests/test_convert_series.py
@@ -106,3 +106,12 @@ def test_convert_series_lists():
         test.insert(0, "FOO")
         x = convert_series(UNIVARIATE[i], test)
         assert x.__class__.__name__ == test[1].split(".")[1]
+
+
+def test_convert_series_single_element():
+    """Test a DataFrame with a single element is correctly converted to a pd.Series."""
+    x = pd.DataFrame([1])
+    x = pd.DataFrame({"0": [10]}, index=pd.to_datetime(["2000-02-18"]))
+    y = convert_series(x, "pd.Series")
+    assert isinstance(y, pd.Series)
+    assert isinstance(y.index, pd.DatetimeIndex)
diff --git a/aeon/utils/index_functions.py b/aeon/utils/index_functions.py
@@ -3,8 +3,7 @@
 import numpy as np
 import pandas as pd
 
-from aeon.datatypes import convert_to
-from aeon.datatypes._vec_df import _VectorizedDF
+from aeon.utils.conversion import convert_collection, convert_series
 from aeon.utils.validation import (
     is_collection,
     is_hierarchical,
@@ -30,11 +29,11 @@ def get_time_index(X):
 
     Parameters
     ----------
-    X : pd.DataFrame, pd.Series, np.ndarray, or VectorizedDF
-    in one of the following aeon mtype specifications for Series, Panel, Hierarchical:
-    pd.DataFrame, pd.Series, np.ndarray, pd-multiindex, nested_univ, pd_multiindex_hier
-    assumes all time series have equal length and equal index set
-    will *not* work for list-of-df, pd-wide, pd-long, numpy2D
+    X : pd.DataFrame, pd.Series, np.ndarray
+        In one of the following data container types: pd.DataFrame, pd.Series,
+        np.ndarray, pd-multiindex, nested_univ, pd_multiindex_hier.
+        Assumes all time series have equal length and equal index set.
+        Will *not* work for list-of-df, pd-wide, pd-long, numpy2D or np-list.
 
     Returns
     -------
@@ -82,8 +81,8 @@ def get_index_for_series(obj, cutoff=0):
     Parameters
     ----------
     obj : aeon data container
-        must be of one of the following mtypes:
-            pd.Series, pd.DataFrame, np.ndarray, of Series scitype
+        must be of one of the following single series data structures:
+            pd.Series, pd.DataFrame, np.ndarray
     cutoff : int, or pd.datetime, optional, default=0
         current cutoff, used to offset index if obj is np.ndarray
 
@@ -106,7 +105,7 @@ def _get_cutoff_from_index(idx, return_index=False, reverse_order=False):
     Parameters
     ----------
     obj : pd.Index, possibly MultiIndex, with last level assumed timelike or integer,
-        e.g., as in the pd.DataFrame, pd-multiindex, or pd_multiindex_hier mtypes
+        e.g., as in the pd.DataFrame, pd-multiindex, or pd_multiindex_hier types
     return_index : bool, optional, default=False
         whether a pd.Index object should be returned (True)
             or a pandas compatible index element (False)
@@ -199,7 +198,7 @@ def get_cutoff(
     ------
     ValueError, TypeError, if check_input or convert_input are True.
     """
-    # deal with VectorizedDF
+    # deal with legacy method of wrapping a Broadcaster
     if hasattr(obj, "X"):
         obj = obj.X
 
@@ -208,7 +207,9 @@ def get_cutoff(
             idx=obj, return_index=return_index, reverse_order=reverse_order
         )
     if not (is_hierarchical(obj) or is_collection(obj) or is_single_series(obj)):
-        raise ValueError("obj must be of Series, Panel, or Hierarchical abstract type")
+        raise ValueError(
+            "obj must be of Series, Collection, or Hierarchical abstract type"
+        )
 
     if cutoff is None:
         cutoff = 0
@@ -225,7 +226,7 @@ def get_cutoff(
     if len(obj) == 0:
         return cutoff
 
-    # numpy3D (Panel) or np.npdarray (Series)
+    # numpy3D (Collection) or np.ndarray (Series)
     if isinstance(obj, np.ndarray):
         if obj.ndim == 3:
             cutoff_ind = obj.shape[-1] + cutoff - 1
@@ -259,7 +260,7 @@ def sub_idx(idx, ix, return_index=True):
     if isinstance(obj, pd.Series):
         return sub_idx(obj.index, ix, return_index)
 
-    # nested_univ (Panel) or pd.DataFrame(Series)
+    # nested_univ (Collection) or pd.DataFrame(Series)
     if isinstance(obj, pd.DataFrame) and not isinstance(obj.index, pd.MultiIndex):
         objcols = [x for x in obj.columns if obj.dtypes[x] == "object"]
         # pd.DataFrame
@@ -272,7 +273,7 @@ def sub_idx(idx, ix, return_index=True):
             ]
             return agg(idxx)
 
-    # pd-multiindex (Panel) and pd_multiindex_hier (Hierarchical)
+    # pd-multiindex (Collection) and pd_multiindex_hier (Hierarchical)
     if isinstance(obj, pd.DataFrame) and isinstance(obj.index, pd.MultiIndex):
         idx = obj.index
         series_idx = [
@@ -281,34 +282,34 @@ def sub_idx(idx, ix, return_index=True):
         cutoffs = [sub_idx(x, ix, return_index) for x in series_idx]
         return agg(cutoffs)
 
-    # df-list (Panel)
+    # df-list (Collection)
     if isinstance(obj, list):
         idxs = [sub_idx(x.index, ix, return_index) for x in obj]
         return agg(idxs)
 
 
-UPDATE_DATA_INTERNAL_MTYPES = [
+SUPPORTED_SERIES = [
     "pd.DataFrame",
-    "pd.Series",
     "np.ndarray",
+]
+SUPPORTED_COLLECTIONS = [
     "pd-multiindex",
     "numpy3D",
-    "pd_multiindex_hier",
 ]
 
 
 def update_data(X, X_new=None):
     """Update time series container with another one.
 
-    Coerces X, X_new to one of the assumed mtypes, if not already of that type.
+    Converts X, X_new to one of the valid internal types, if not one already.
 
     Parameters
     ----------
-    X : None, or aeon data container, in one of the following mtype formats
+    X : None, or aeon data container, in one of the following internal type formats
         pd.DataFrame, pd.Series, np.ndarray, pd-multiindex, numpy3D,
-        pd_multiindex_hier. If not of that format, coerced.
-    X_new : None, or aeon data container, should be same mtype as X,
-        or convert to same format when converting to format list via convert_to
+        pd_multiindex_hier. If not of that format, converted.
+    X_new : None, or aeon data container, should be same type as X,
+        or convert to same format when converting X
 
     Returns
     -------
@@ -317,17 +318,11 @@ def update_data(X, X_new=None):
         numpy based containers will always be interpreted as having new row index
         if one of X, X_new is None, returns the other; if both are None, returns None
     """
-    # if X or X_new is vectorized, unwrap it first
-    if isinstance(X, _VectorizedDF):
+    # Temporary measure to deal with legacy method of wrapping a Broadcaster
+    if hasattr(X, "X"):
         X = X.X
-    if isinstance(X_new, _VectorizedDF):
+    if hasattr(X_new, "X"):
         X_new = X_new.X
-
-    # we want to ensure that X, X_new are either numpy (1D, 2D, 3D)
-    # or in one of the long pandas formats
-    X = convert_to(X, to_type=UPDATE_DATA_INTERNAL_MTYPES)
-    X_new = convert_to(X_new, to_type=UPDATE_DATA_INTERNAL_MTYPES)
-
     # we only need to modify X if X_new is not None
     if X_new is None:
         return X
@@ -336,6 +331,13 @@ def update_data(X, X_new=None):
     if X is None:
         return X_new
 
+    # collections are converted to numpy3D before updating; any input for which
+    # is_collection is False is left in the type it was passed in
+    if is_collection(X):
+        X = convert_collection(X, output_type="numpy3D")
+    if is_collection(X_new):
+        X_new = convert_collection(X_new, output_type="numpy3D")
+
     # update X with the new rows in X_new
     #  if X is np.ndarray, we assume all rows are new
     if isinstance(X, np.ndarray):
@@ -350,29 +352,35 @@ def update_data(X, X_new=None):
         return X_new.combine_first(X)
 
 
-GET_WINDOW_SUPPORTED_MTYPES = [
-    "pd.DataFrame",
-    "pd-multiindex",
-    "pd_multiindex_hier",
-    "np.ndarray",
-    "numpy3D",
-]
+def _convert(obj, abstract_type, input_type):
+    reconvert = False
+    if abstract_type == "Series":
+        obj = convert_series(obj, SUPPORTED_SERIES)
+        if input_type == "pd.Series":
+            reconvert = True
+    elif abstract_type == "Panel":
+        if input_type not in SUPPORTED_COLLECTIONS:
+            obj = convert_collection(obj, "pd-multiindex")
+            reconvert = True
+    return obj, reconvert
 
 
 def get_window(obj, window_length=None, lag=None):
     """Slice obj to the time index window with given length and lag.
 
-    Returns time series or time series panel with time indices
-        strictly greater than cutoff - lag - window_length, and
-        equal or less than cutoff - lag.
+    Returns time series or time series collection with time indices strictly greater
+    than cutoff - lag - window_length, and equal or less than cutoff - lag.
     Cutoff if of obj, as determined by get_cutoff.
+    This function does not work with pd.Series, hence the conversion to pd.DataFrame.
+    It also does not work with unequal length collections of series.
 
     Parameters
     ----------
     obj : aeon compatible time series data container or None
-        if not None, must be of Series, Panel, or Hierarchical scitype
-        all mtypes are supported via conversion to internally supported types
-        to avoid conversions, pass data in one of GET_WINDOW_SUPPORTED_MTYPES
+        If not None, must be of Series, Collection, or Hierarchical internal types.
+        All valid internal types are supported via conversion to internally supported
+        types. To avoid conversions, pass data in one of SUPPORTED_SERIES or
+        SUPPORTED_COLLECTIONS.
     window_length : int or timedelta, optional, default=-inf
         must be int if obj is int indexed, timedelta if datetime indexed
         length of the window to slice to. Default = window of infinite size
@@ -392,10 +400,10 @@ def get_window(obj, window_length=None, lag=None):
         return obj
     valid, metadata = validate_input(obj)
     if not valid:
-        raise ValueError("obj must be of Series, Collection, or Hierarchical scitype")
-    obj_in_mtype = metadata["mtype"]
-
-    obj = convert_to(obj, GET_WINDOW_SUPPORTED_MTYPES)
+        raise ValueError("obj must be of Series, Collection, or Hierarchical type")
+    input_type = metadata["mtype"]
+    abstract_type = metadata["scitype"]
+    obj, reconvert = _convert(obj, abstract_type, input_type)
 
     # numpy3D (Collection) or np.npdarray (Series)
     if isinstance(obj, np.ndarray):
@@ -445,8 +453,13 @@ def get_window(obj, window_length=None, lag=None):
                 win_select = win_select & (time_indices > win_start_excl)
 
         obj_subset = obj.iloc[win_select]
+        if reconvert:
+            if abstract_type == "Series" and input_type == "pd.Series":
+                obj_subset = convert_series(obj_subset, input_type)
+            elif abstract_type == "Panel":
+                obj_subset = convert_collection(obj_subset, input_type)
 
-        return convert_to(obj_subset, obj_in_mtype)
+        return obj_subset
 
     raise ValueError(
         "passed get_window an object that is not of type np.ndarray or pd.DataFrame"
@@ -456,16 +469,14 @@ def get_window(obj, window_length=None, lag=None):
 def get_slice(obj, start=None, end=None):
     """Slice obj with start (inclusive) and end (exclusive) indices.
 
-    Returns time series or time series panel with time indices
-        strictly greater and equal to start index and less than
-        end index.
+    Returns time series or time series collection with time indices greater than or
+    equal to the start index and strictly less than the end index.
 
     Parameters
     ----------
     obj : aeon compatible time series data container or None
-        if not None, must be of Series, Panel, or Hierarchical scitype
-        all mtypes are supported via conversion to internally supported types
-        to avoid conversions, pass data in one of GET_WINDOW_SUPPORTED_MTYPES
+        if not None, must be of Series, Collection, or Hierarchical type
+        in one of SUPPORTED_SERIES or SUPPORTED_COLLECTIONS.
     start : int or timestamp, optional, default = None
         must be int if obj is int indexed, timestamp if datetime indexed
         Inclusive start of slice. Default = None.
@@ -485,12 +496,12 @@ def get_slice(obj, start=None, end=None):
 
     valid, metadata = validate_input(obj)
     if not valid:
-        raise ValueError("obj must be of Series, Panel, or Hierarchical scitype")
-    obj_in_mtype = metadata["mtype"]
-
-    obj = convert_to(obj, GET_WINDOW_SUPPORTED_MTYPES)
+        raise ValueError("obj must be of Series, Collection or Hierarchical type")
+    input_type = metadata["mtype"]
+    abstract_type = metadata["scitype"]
+    obj, reconvert = _convert(obj, abstract_type, input_type)
 
-    # numpy3D (Panel) or np.npdarray (Series)
+    # numpy3D (Collection) or np.ndarray (Series)
     # Assumes the index is integer so will be exclusive by default
     if isinstance(obj, np.ndarray):
         # if 2D or 3D, we need to subset by last, not first dimension
@@ -511,7 +522,8 @@ def get_slice(obj, start=None, end=None):
             obj_subset = obj_subset.swapaxes(1, -1)
         return obj_subset
 
-    # pd.DataFrame(Series), pd-multiindex (Panel) and pd_multiindex_hier (Hierarchical)
+    # pd.DataFrame (Series), pd-multiindex (Collection) and pd_multiindex_hier
+    # (Hierarchical)
     # Assumes the index is pd.Timestamp or pd.Period and ensures the end is
     # exclusive with slice_select
     if isinstance(obj, pd.DataFrame):
@@ -528,8 +540,9 @@ def get_slice(obj, start=None, end=None):
             slice_select = time_indices >= start
 
         obj_subset = obj.iloc[slice_select]
-        return convert_to(obj_subset, obj_in_mtype)
-
-    raise ValueError(
-        "bug in get_slice, unreachable condition, ifs should be exhaustive"
-    )
+        if reconvert:
+            if abstract_type == "Series" and input_type == "pd.Series":
+                obj_subset = convert_series(obj_subset, input_type)
+            elif abstract_type == "Panel":
+                obj_subset = convert_collection(obj_subset, input_type)
+        return obj_subset
diff --git a/aeon/utils/tests/test_index_functions.py b/aeon/utils/tests/test_index_functions.py
@@ -213,14 +213,17 @@ def test_get_cutoff_wrong_input(bad_inputs):
     ------
     Exception (from pytest) if the error is not raised as expected
     """
-    with pytest.raises(Exception, match="must be of Series, Panel, or Hierarchical"):
+    with pytest.raises(
+        Exception, match="must be of Series, Collection, " "or Hierarchical"
+    ):
         get_cutoff(bad_inputs)
 
 
 @pytest.mark.parametrize("window_length, lag", [(2, 0), (None, 0), (4, 1)])
-@pytest.mark.parametrize("datatype", EXAMPLE_DATA.keys())
+# @pytest.mark.parametrize("datatype", EXAMPLE_DATA.keys())
+@pytest.mark.parametrize("datatype", ["pd.Series"])
 def test_get_window_output_type(datatype, window_length, lag):
-    """Tests that get_window runs for all mtypes, and returns output of same mtype.
+    """Tests that get_window runs for all types, and returns output of same mtype.
 
     Parameters
     ----------