From 2b69ad59e4cecfd061b2c42fa8f942bb683cb2fc Mon Sep 17 00:00:00 2001 From: Tyler White Date: Tue, 12 Dec 2023 14:39:06 -0500 Subject: [PATCH] feat: ExpandDateTime convenience --- docs/_quarto.yml | 165 +++++++++++++++++----------------- ibisml/steps/__init__.py | 3 +- ibisml/steps/temporal.py | 111 +++++++++++++++++++++++ ibisml/transforms/__init__.py | 3 +- ibisml/transforms/temporal.py | 53 +++++++++++ tests/test_temporal.py | 25 ++++++ 6 files changed, 276 insertions(+), 84 deletions(-) diff --git a/docs/_quarto.yml b/docs/_quarto.yml index c21e4d4..7338742 100644 --- a/docs/_quarto.yml +++ b/docs/_quarto.yml @@ -94,90 +94,91 @@ quartodoc: - title: Core package: ibisml contents: - - kind: page - path: core - summary: - name: Common - desc: Core APIs - contents: - - Recipe - - RecipeTransform - - TransformResult - - - kind: page - path: selectors - summary: - name: Selectors - desc: Select sets of columns by name, type, or other properties - contents: - - cols - - contains - - endswith - - startswith - - matches - - numeric - - nominal - - categorical - - string - - integer - - floating - - temporal - - date - - time - - timestamp - - has_type - - where - - everything - - selector + - kind: page + path: core + summary: + name: Common + desc: Core APIs + contents: + - Recipe + - RecipeTransform + - TransformResult + + - kind: page + path: selectors + summary: + name: Selectors + desc: Select sets of columns by name, type, or other properties + contents: + - cols + - contains + - endswith + - startswith + - matches + - numeric + - nominal + - categorical + - string + - integer + - floating + - temporal + - date + - time + - timestamp + - has_type + - where + - everything + - selector - title: Steps desc: Define steps in a recipe package: ibisml contents: - - kind: page - path: steps-imputation - summary: - name: Imputation - desc: Imputation and handling of missing values - contents: - - ImputeMean - - ImputeMode - - ImputeMedian - - FillNA - - - kind: page - 
path: steps-encoding - summary: - name: Encoding - desc: Encoding of categorical and string columns - contents: - - OneHotEncode - - CategoricalEncode - - - kind: page - path: steps-standardization - summary: - name: Standardization - desc: Standardization and normalization of numeric columns - contents: - - ScaleStandard - - - kind: page - path: steps-temporal - summary: - name: Temporal - desc: Feature extraction for temporal columns - contents: - - ExpandDate - - ExpandTime - - - kind: page - path: steps-other - summary: - name: Other - desc: Other common tabular operations - contents: - - Cast - - Drop - - MutateAt - - Mutate + - kind: page + path: steps-imputation + summary: + name: Imputation + desc: Imputation and handling of missing values + contents: + - ImputeMean + - ImputeMode + - ImputeMedian + - FillNA + + - kind: page + path: steps-encoding + summary: + name: Encoding + desc: Encoding of categorical and string columns + contents: + - OneHotEncode + - CategoricalEncode + + - kind: page + path: steps-standardization + summary: + name: Standardization + desc: Standardization and normalization of numeric columns + contents: + - ScaleStandard + + - kind: page + path: steps-temporal + summary: + name: Temporal + desc: Feature extraction for temporal columns + contents: + - ExpandDateTime + - ExpandDate + - ExpandTime + + - kind: page + path: steps-other + summary: + name: Other + desc: Other common tabular operations + contents: + - Cast + - Drop + - MutateAt + - Mutate diff --git a/ibisml/steps/__init__.py b/ibisml/steps/__init__.py index 69841d6..03c93f4 100644 --- a/ibisml/steps/__init__.py +++ b/ibisml/steps/__init__.py @@ -2,7 +2,7 @@ from ibisml.steps.impute import FillNA, ImputeMean, ImputeMedian, ImputeMode from ibisml.steps.standardize import ScaleStandard from ibisml.steps.encode import OneHotEncode, CategoricalEncode -from ibisml.steps.temporal import ExpandDate, ExpandTime +from ibisml.steps.temporal import ExpandDateTime, ExpandDate, 
class ExpandDateTime(Step):
    """A step for expanding date and time columns into one or more features.

    New features will be named ``{input_column}_{component}``. For example, if
    expanding a ``"year"`` component from column ``"x"``, the feature column
    would be named ``"x_year"``.

    Parameters
    ----------
    inputs
        A selection of date and time columns to expand into new features.
        Every selected column is expanded with both the date components and
        the time components.
    date_components
        A sequence of date components to expand. Options include

        - ``day``: the day of the month as a numeric value
        - ``week``: the week of the year as a numeric value
        - ``month``: the month of the year as a categorical value
        - ``year``: the year as a numeric value
        - ``dow``: the day of the week as a categorical value
        - ``doy``: the day of the year as a numeric value

        Defaults to ``["dow", "month", "year"]``.
    time_components
        A sequence of time components to expand. Options include ``hour``,
        ``minute``, ``second``, and ``millisecond``.

        Defaults to ``["hour", "minute", "second"]``.

    Examples
    --------
    >>> import ibisml as ml

    Expand timestamp columns using the default components

    >>> step = ml.ExpandDateTime(ml.timestamp())

    Expand specific columns using specific components for date and time

    >>> step = ml.ExpandDateTime(["x", "y"], ["day", "year"], ["hour", "minute"])
    """

    def __init__(
        self,
        inputs: SelectionType,
        date_components: Sequence[
            Literal["day", "week", "month", "year", "dow", "doy"]
        ] = (
            "dow",
            "month",
            "year",
        ),
        time_components: Sequence[
            Literal["hour", "minute", "second", "millisecond"]
        ] = (
            "hour",
            "minute",
            "second",
        ),
    ):
        self.inputs = selector(inputs)
        # Copy into lists so later mutation of the caller's sequences cannot
        # change this step.
        self.date_components = list(date_components)
        self.time_components = list(time_components)

    def _repr(self) -> Iterable[tuple[str, Any]]:
        yield ("", self.inputs)
        yield ("date_components", self.date_components)
        yield ("time_components", self.time_components)

    def fit(self, table: ir.Table, metadata: Metadata) -> Transform:
        """Resolve the selected columns and build the expansion transform.

        Registers the category labels for every ``{column}_month`` and
        ``{column}_dow`` feature on ``metadata`` (only when the respective
        component was requested), then returns an
        ``ml.transforms.ExpandDateTime`` configured with the selected columns.
        """
        month_names = (
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
        )
        day_names = (
            "Monday",
            "Tuesday",
            "Wednesday",
            "Thursday",
            "Friday",
            "Saturday",
            "Sunday",
        )

        # The same selection feeds both the date and the time expansion, so it
        # only needs to be resolved once.
        columns = self.inputs.select_columns(table, metadata)

        if "month" in self.date_components:
            for col in columns:
                # A fresh list per column: the metadata never shares one
                # mutable label list between features.
                metadata.set_categories(f"{col}_month", list(month_names))
        if "dow" in self.date_components:
            for col in columns:
                metadata.set_categories(f"{col}_dow", list(day_names))

        return ml.transforms.ExpandDateTime(
            columns, self.date_components, columns, self.time_components
        )
class ExpandDateTime(Transform):
    """Append date and time component features to a table.

    For every column in ``date_columns`` one feature per entry of
    ``date_components`` is added, and for every column in ``time_columns`` one
    feature per entry of ``time_components``. Each feature is named
    ``{column}_{component}``.

    Parameters
    ----------
    date_columns
        Names of the columns to expand with ``date_components``.
    date_components
        Date components to extract: ``day``, ``week``, ``month`` (emitted as
        ``month() - 1``), ``year``, ``dow`` (emitted as
        ``day_of_week.index()``) and ``doy``.
    time_columns
        Names of the columns to expand with ``time_components``.
    time_components
        Time components to extract: ``hour``, ``minute``, ``second`` and
        ``millisecond``.
    """

    def __init__(
        self,
        date_columns: list[str],
        date_components: list[Literal["day", "week", "month", "year", "dow", "doy"]],
        time_columns: list[str],
        time_components: list[Literal["hour", "minute", "second", "millisecond"]],
    ):
        self.date_columns = date_columns
        self.date_components = date_components
        self.time_columns = time_columns
        self.time_components = time_components

    @property
    def input_columns(self) -> list[str]:
        """Names of all columns read by this transform, without duplicates.

        A column listed in both ``date_columns`` and ``time_columns`` is
        reported once, at the position of its first occurrence.
        """
        return list(dict.fromkeys([*self.date_columns, *self.time_columns]))

    def transform(self, table: ir.Table) -> ir.Table:
        """Return ``table`` with the requested component columns added.

        Raises
        ------
        ValueError
            If a requested component is not one of the supported names.
        """
        date_extractors = {
            "day": lambda col: col.day(),
            "week": lambda col: col.week_of_year(),
            "month": lambda col: col.month() - 1,
            "year": lambda col: col.year(),
            "dow": lambda col: col.day_of_week.index(),
            "doy": lambda col: col.day_of_year(),
        }
        time_extractors = {
            "hour": lambda col: col.hour(),
            "minute": lambda col: col.minute(),
            "second": lambda col: col.second(),
            "millisecond": lambda col: col.millisecond(),
        }

        new_cols = []
        # Date features for all date columns come first, then time features,
        # each in the order the components were given.
        for kind, columns, components, extractors in (
            ("date", self.date_columns, self.date_components, date_extractors),
            ("time", self.time_columns, self.time_components, time_extractors),
        ):
            for name in columns:
                col = table[name]
                for comp in components:
                    try:
                        extract = extractors[comp]
                    except KeyError:
                        # Fail loudly instead of reusing the previous feature
                        # under a wrong name.
                        raise ValueError(
                            f"Unknown {kind} component {comp!r}; "
                            f"expected one of {sorted(extractors)}"
                        ) from None
                    new_cols.append(extract(col).name(f"{name}_{comp}"))

        return table.mutate(new_cols)