Skip to content

Commit

Permalink
[ENH] remove convert_to and vectorized import from index_functions (#…
Browse files Browse the repository at this point in the history
…1433)

* remove conversion

* tidy

* comments

* examples tests

* vectorise DF

* convert tests

* add list for output types and tests for convert_series

* docstring for get_cutoff

* convert_to in get window

* convert_series single element pd.Series

* remove Vectorized input from index_functions.py

* remove Vectorized input from index_functions.py

* remove Vectorized input from index_functions.py

* sort out 1 element conversion

* change get_slice

* change update

* change update

* convert_collection

* typo

* comments

* tests

* docstrings
  • Loading branch information
TonyBagnall authored Apr 18, 2024
1 parent 3e9d959 commit 5bc5c31
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 78 deletions.
26 changes: 19 additions & 7 deletions aeon/utils/conversion/_convert_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,13 +99,25 @@ def convert_series(y, output_type):
if output_type == "np.ndarray":
return y.to_numpy()
if output_type == "pd.Series":
y = y.squeeze()
if y.ndim > 1:
raise ValueError(
"DataFrame of more than one row or column, cannot convert to "
"pd.Series"
)
return pd.Series(y)
if input_type == "pd.DataFrame":
if y.shape == (1, 1): # special case of single element, cant squeeze
y = y[y.columns[0]]
else:
y = y.squeeze()
if y.ndim > 1:
raise ValueError(
"pd.DataFrame of more than one row or column, cannot convert to "
"pd.Series"
)
return y
elif input_type == "np.ndarray":
y = y.squeeze()
if y.ndim > 1:
raise ValueError(
"np.ndarray of more than one row or column, cannot convert to "
"pd.Series"
)
return pd.Series(y)
if output_type == "pd.DataFrame":
if input_type == "pd.Series":
return y.to_frame()
Expand Down
9 changes: 9 additions & 0 deletions aeon/utils/conversion/tests/test_convert_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,3 +106,12 @@ def test_convert_series_lists():
test.insert(0, "FOO")
x = convert_series(UNIVARIATE[i], test)
assert x.__class__.__name__ == test[1].split(".")[1]


def test_convert_series_single_element():
"""Test a DataFrame with a single element is correctly converted to a pd.Series."""
x = pd.DataFrame([1])
x = pd.DataFrame({"0": [10]}, index=pd.to_datetime(["2000-02-18"]))
y = convert_series(x, "pd.Series")
assert isinstance(y, pd.Series)
assert isinstance(y.index, pd.DatetimeIndex)
149 changes: 81 additions & 68 deletions aeon/utils/index_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
import numpy as np
import pandas as pd

from aeon.datatypes import convert_to
from aeon.datatypes._vec_df import _VectorizedDF
from aeon.utils.conversion import convert_collection, convert_series
from aeon.utils.validation import (
is_collection,
is_hierarchical,
Expand All @@ -30,11 +29,11 @@ def get_time_index(X):
Parameters
----------
X : pd.DataFrame, pd.Series, np.ndarray, or VectorizedDF
in one of the following aeon mtype specifications for Series, Panel, Hierarchical:
pd.DataFrame, pd.Series, np.ndarray, pd-multiindex, nested_univ, pd_multiindex_hier
assumes all time series have equal length and equal index set
will *not* work for list-of-df, pd-wide, pd-long, numpy2D
X : pd.DataFrame, pd.Series, np.ndarray
in one of the following data container types pd.DataFrame, pd.Series,
np.ndarray, pd-multiindex, nested_univ, pd_multiindex_hier
assumes all time series have equal length and equal index set
will *not* work for list-of-df, pd-wide, pd-long, numpy2D or np-list.
Returns
-------
Expand Down Expand Up @@ -82,8 +81,8 @@ def get_index_for_series(obj, cutoff=0):
Parameters
----------
obj : aeon data container
must be of one of the following mtypes:
pd.Series, pd.DataFrame, np.ndarray, of Series scitype
must be of one of the following single series data structures:
pd.Series, pd.DataFrame, np.ndarray
cutoff : int, or pd.datetime, optional, default=0
current cutoff, used to offset index if obj is np.ndarray
Expand All @@ -106,7 +105,7 @@ def _get_cutoff_from_index(idx, return_index=False, reverse_order=False):
Parameters
----------
obj : pd.Index, possibly MultiIndex, with last level assumed timelike or integer,
e.g., as in the pd.DataFrame, pd-multiindex, or pd_multiindex_hier mtypes
e.g., as in the pd.DataFrame, pd-multiindex, or pd_multiindex_hier types
return_index : bool, optional, default=False
whether a pd.Index object should be returned (True)
or a pandas compatible index element (False)
Expand Down Expand Up @@ -199,7 +198,7 @@ def get_cutoff(
------
ValueError, TypeError, if check_input or convert_input are True.
"""
# deal with VectorizedDF
# deal with legacy method of wrapping a Broadcaster
if hasattr(obj, "X"):
obj = obj.X

Expand All @@ -208,7 +207,9 @@ def get_cutoff(
idx=obj, return_index=return_index, reverse_order=reverse_order
)
if not (is_hierarchical(obj) or is_collection(obj) or is_single_series(obj)):
raise ValueError("obj must be of Series, Panel, or Hierarchical abstract type")
raise ValueError(
"obj must be of Series, Collection, or Hierarchical abstract type"
)

if cutoff is None:
cutoff = 0
Expand All @@ -225,7 +226,7 @@ def get_cutoff(
if len(obj) == 0:
return cutoff

# numpy3D (Panel) or np.npdarray (Series)
# numpy3D (Collection) or np.npdarray (Series)
if isinstance(obj, np.ndarray):
if obj.ndim == 3:
cutoff_ind = obj.shape[-1] + cutoff - 1
Expand Down Expand Up @@ -259,7 +260,7 @@ def sub_idx(idx, ix, return_index=True):
if isinstance(obj, pd.Series):
return sub_idx(obj.index, ix, return_index)

# nested_univ (Panel) or pd.DataFrame(Series)
# nested_univ (Collection) or pd.DataFrame(Series)
if isinstance(obj, pd.DataFrame) and not isinstance(obj.index, pd.MultiIndex):
objcols = [x for x in obj.columns if obj.dtypes[x] == "object"]
# pd.DataFrame
Expand All @@ -272,7 +273,7 @@ def sub_idx(idx, ix, return_index=True):
]
return agg(idxx)

# pd-multiindex (Panel) and pd_multiindex_hier (Hierarchical)
# pd-multiindex (Collection) and pd_multiindex_hier (Hierarchical)
if isinstance(obj, pd.DataFrame) and isinstance(obj.index, pd.MultiIndex):
idx = obj.index
series_idx = [
Expand All @@ -281,34 +282,34 @@ def sub_idx(idx, ix, return_index=True):
cutoffs = [sub_idx(x, ix, return_index) for x in series_idx]
return agg(cutoffs)

# df-list (Panel)
# df-list (Collection)
if isinstance(obj, list):
idxs = [sub_idx(x.index, ix, return_index) for x in obj]
return agg(idxs)


UPDATE_DATA_INTERNAL_MTYPES = [
SUPPORTED_SERIES = [
"pd.DataFrame",
"pd.Series",
"np.ndarray",
]
SUPPORTED_COLLECTIONS = [
"pd-multiindex",
"numpy3D",
"pd_multiindex_hier",
]


def update_data(X, X_new=None):
"""Update time series container with another one.
Coerces X, X_new to one of the assumed mtypes, if not already of that type.
Converts X, X_new to one of the valid internal types, if not one already.
Parameters
----------
X : None, or aeon data container, in one of the following mtype formats
X : None, or aeon data container, in one of the following internal type formats
pd.DataFrame, pd.Series, np.ndarray, pd-multiindex, numpy3D,
pd_multiindex_hier. If not of that format, coerced.
X_new : None, or aeon data container, should be same mtype as X,
or convert to same format when converting to format list via convert_to
pd_multiindex_hier. If not of that format, converted.
X_new : None, or aeon data container, should be same type as X,
or convert to same format when converting X
Returns
-------
Expand All @@ -317,17 +318,11 @@ def update_data(X, X_new=None):
numpy based containers will always be interpreted as having new row index
if one of X, X_new is None, returns the other; if both are None, returns None
"""
# if X or X_new is vectorized, unwrap it first
if isinstance(X, _VectorizedDF):
# Temporary measure to deal with legacy method of wrapping a Broadcaster
if hasattr(X, "X"):
X = X.X
if isinstance(X_new, _VectorizedDF):
if hasattr(X_new, "X"):
X_new = X_new.X

# we want to ensure that X, X_new are either numpy (1D, 2D, 3D)
# or in one of the long pandas formats
X = convert_to(X, to_type=UPDATE_DATA_INTERNAL_MTYPES)
X_new = convert_to(X_new, to_type=UPDATE_DATA_INTERNAL_MTYPES)

# we only need to modify X if X_new is not None
if X_new is None:
return X
Expand All @@ -336,6 +331,13 @@ def update_data(X, X_new=None):
if X is None:
return X_new

# we want to ensure that X, X_new are either numpy (1D, 2D, 3D)
# or in one of the long pandas formats if they are collections
if is_collection(X):
X = convert_collection(X, output_type="numpy3D")
if is_collection(X_new):
X_new = convert_collection(X_new, output_type="numpy3D")

# update X with the new rows in X_new
# if X is np.ndarray, we assume all rows are new
if isinstance(X, np.ndarray):
Expand All @@ -350,29 +352,35 @@ def update_data(X, X_new=None):
return X_new.combine_first(X)


GET_WINDOW_SUPPORTED_MTYPES = [
"pd.DataFrame",
"pd-multiindex",
"pd_multiindex_hier",
"np.ndarray",
"numpy3D",
]
def _convert(obj, abstract_type, input_type):
reconvert = False
if abstract_type == "Series":
obj = convert_series(obj, SUPPORTED_SERIES)
if input_type == "pd.Series":
reconvert = True
elif abstract_type == "Panel":
if input_type not in SUPPORTED_COLLECTIONS:
obj = convert_collection(obj, "pd-multiindex")
reconvert = True
return obj, reconvert


def get_window(obj, window_length=None, lag=None):
"""Slice obj to the time index window with given length and lag.
Returns time series or time series panel with time indices
strictly greater than cutoff - lag - window_length, and
equal or less than cutoff - lag.
Returns time series or time series collection with time indices strictly greater
than cutoff - lag - window_length, and equal or less than cutoff - lag.
Cutoff if of obj, as determined by get_cutoff.
This function does not work with pd.Series, hence the conversion to pd.DataFrame.
It also does not work with unequal length collections of series.
Parameters
----------
obj : aeon compatible time series data container or None
if not None, must be of Series, Panel, or Hierarchical scitype
all mtypes are supported via conversion to internally supported types
to avoid conversions, pass data in one of GET_WINDOW_SUPPORTED_MTYPES
if not None, must be of Series, Collection, or Hierarchical internal types.
all valid internal types are supported via conversion to internally supported
types to avoid conversions, pass data in one of SUPPORTED_SERIES or
SUPPORTED_COLLECTION
window_length : int or timedelta, optional, default=-inf
must be int if obj is int indexed, timedelta if datetime indexed
length of the window to slice to. Default = window of infinite size
Expand All @@ -392,10 +400,10 @@ def get_window(obj, window_length=None, lag=None):
return obj
valid, metadata = validate_input(obj)
if not valid:
raise ValueError("obj must be of Series, Collection, or Hierarchical scitype")
obj_in_mtype = metadata["mtype"]

obj = convert_to(obj, GET_WINDOW_SUPPORTED_MTYPES)
raise ValueError("obj must be of Series, Collection, or Hierarchical type")
input_type = metadata["mtype"]
abstract_type = metadata["scitype"]
obj, reconvert = _convert(obj, abstract_type, input_type)

# numpy3D (Collection) or np.npdarray (Series)
if isinstance(obj, np.ndarray):
Expand Down Expand Up @@ -445,8 +453,13 @@ def get_window(obj, window_length=None, lag=None):
win_select = win_select & (time_indices > win_start_excl)

obj_subset = obj.iloc[win_select]
if reconvert:
if abstract_type == "Series" and input_type == "pd.Series":
obj_subset = convert_series(obj_subset, input_type)
elif abstract_type == "Panel":
obj_subset = convert_collection(obj_subset, input_type)

return convert_to(obj_subset, obj_in_mtype)
return obj_subset

raise ValueError(
"passed get_window an object that is not of type np.ndarray or pd.DataFrame"
Expand All @@ -456,16 +469,14 @@ def get_window(obj, window_length=None, lag=None):
def get_slice(obj, start=None, end=None):
"""Slice obj with start (inclusive) and end (exclusive) indices.
Returns time series or time series panel with time indices
strictly greater and equal to start index and less than
end index.
Returns time series or time series collection with time indices strictly greater
and equal to start index and less than end index.
Parameters
----------
obj : aeon compatible time series data container or None
if not None, must be of Series, Panel, or Hierarchical scitype
all mtypes are supported via conversion to internally supported types
to avoid conversions, pass data in one of GET_WINDOW_SUPPORTED_MTYPES
if not None, must be of Series, Collection, or Hierarchical type
in one of SUPPORTED_SERIES or SUPPORTED_COLLECTION.
start : int or timestamp, optional, default = None
must be int if obj is int indexed, timestamp if datetime indexed
Inclusive start of slice. Default = None.
Expand All @@ -485,12 +496,12 @@ def get_slice(obj, start=None, end=None):

valid, metadata = validate_input(obj)
if not valid:
raise ValueError("obj must be of Series, Panel, or Hierarchical scitype")
obj_in_mtype = metadata["mtype"]

obj = convert_to(obj, GET_WINDOW_SUPPORTED_MTYPES)
raise ValueError("obj must be of Series, Collection or Hierarchical type")
input_type = metadata["mtype"]
abstract_type = metadata["scitype"]
obj, reconvert = _convert(obj, abstract_type, input_type)

# numpy3D (Panel) or np.npdarray (Series)
# numpy3D (Collection) or np.npdarray (Series)
# Assumes the index is integer so will be exclusive by default
if isinstance(obj, np.ndarray):
# if 2D or 3D, we need to subset by last, not first dimension
Expand All @@ -511,7 +522,8 @@ def get_slice(obj, start=None, end=None):
obj_subset = obj_subset.swapaxes(1, -1)
return obj_subset

# pd.DataFrame(Series), pd-multiindex (Panel) and pd_multiindex_hier (Hierarchical)
# pd.DataFrame(Series), pd-multiindex (Collection) and pd_multiindex_hier (
# Hierarchical)
# Assumes the index is pd.Timestamp or pd.Period and ensures the end is
# exclusive with slice_select
if isinstance(obj, pd.DataFrame):
Expand All @@ -528,8 +540,9 @@ def get_slice(obj, start=None, end=None):
slice_select = time_indices >= start

obj_subset = obj.iloc[slice_select]
return convert_to(obj_subset, obj_in_mtype)

raise ValueError(
"bug in get_slice, unreachable condition, ifs should be exhaustive"
)
if reconvert:
if abstract_type == "Series" and input_type == "pd.Series":
obj_subset = convert_series(obj_subset, input_type)
elif abstract_type == "Panel":
obj_subset = convert_collection(obj_subset, input_type)
return obj_subset
9 changes: 6 additions & 3 deletions aeon/utils/tests/test_index_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,14 +213,17 @@ def test_get_cutoff_wrong_input(bad_inputs):
------
Exception (from pytest) if the error is not raised as expected
"""
with pytest.raises(Exception, match="must be of Series, Panel, or Hierarchical"):
with pytest.raises(
Exception, match="must be of Series, Collection, " "or Hierarchical"
):
get_cutoff(bad_inputs)


@pytest.mark.parametrize("window_length, lag", [(2, 0), (None, 0), (4, 1)])
@pytest.mark.parametrize("datatype", EXAMPLE_DATA.keys())
# @pytest.mark.parametrize("datatype", EXAMPLE_DATA.keys())
@pytest.mark.parametrize("datatype", ["pd.Series"])
def test_get_window_output_type(datatype, window_length, lag):
"""Tests that get_window runs for all mtypes, and returns output of same mtype.
"""Tests that get_window runs for all types, and returns output of same mtype.
Parameters
----------
Expand Down

0 comments on commit 5bc5c31

Please sign in to comment.