diff --git a/README.md b/README.md index baeccbd..aaae72b 100644 --- a/README.md +++ b/README.md @@ -27,17 +27,21 @@ pip install enfobench ## Usage -Import your dataset and make sure that the timestamp column in named 'ds' and the target values named 'y'. +Load your own data and create a dataset. ```python import pandas as pd +from enfobench.evaluation import Dataset + -# Load your dataset and make sure that the timestamp column in named 'ds' and the target values named 'y' +# Load your data with the 'timestamp' column parsed as datetimes and set as the index -data = ( - pd.read_csv("../path/to/your/data.csv") - .rename(columns={"timestamp": "ds", "value": "y"}) +data = pd.read_csv("../path/to/your/data.csv", parse_dates=['timestamp'], index_col='timestamp') +covariates = data.drop(columns=['target_column']) + +dataset = Dataset( + target=data['target_column'], + covariates=covariates, ) -y = data.set_index("ds")["y"] ``` You can perform a cross validation on any model locally that adheres to the `enfobench.Model` protocol. @@ -52,11 +56,11 @@ model = MyModel() # Run cross validation on your model cv_results = cross_validate( model, + dataset, start_date=pd.Timestamp("2018-01-01"), end_date=pd.Timestamp("2018-01-31"), horizon=pd.Timedelta("24 hours"), step=pd.Timedelta("1 day"), - y=y, ) ``` @@ -71,11 +75,11 @@ client = ForecastClient(host='localhost', port=3000) # Run cross validation on your model cv_results = cross_validate( client, + dataset, start_date=pd.Timestamp("2018-01-01"), end_date=pd.Timestamp("2018-01-31"), horizon=pd.Timedelta("24 hours"), step=pd.Timedelta("1 day"), - y=y, ) ``` @@ -89,7 +93,7 @@ from enfobench.evaluation.metrics import ( ) # Merge the cross validation results with the original data -forecasts = cv_results.merge(data, on="ds", how="left") +forecasts = cv_results.merge(dataset.target, on="ds", how="left") metrics = evaluate_metrics_on_forecasts( forecasts, diff --git a/src/enfobench/__version__.py b/src/enfobench/__version__.py index 485f44a..d3ec452 100644 --- a/src/enfobench/__version__.py +++ 
b/src/enfobench/__version__.py @@ -1 +1 @@ -__version__ = "0.1.1" +__version__ = "0.2.0" diff --git a/src/enfobench/evaluation/evaluate.py b/src/enfobench/evaluation/evaluate.py index 5025dd1..dae0171 100644 --- a/src/enfobench/evaluation/evaluate.py +++ b/src/enfobench/evaluation/evaluate.py @@ -6,7 +6,7 @@ from enfobench.evaluation.client import ForecastClient from enfobench.evaluation.protocols import Dataset, Model -from enfobench.utils import steps_in_horizon +from enfobench.evaluation.utils import steps_in_horizon def evaluate_metric_on_forecast(forecast: pd.DataFrame, metric: Callable) -> float: diff --git a/src/enfobench/evaluation/utils.py b/src/enfobench/evaluation/utils.py new file mode 100644 index 0000000..6a8f9a4 --- /dev/null +++ b/src/enfobench/evaluation/utils.py @@ -0,0 +1,102 @@ +import warnings + +import pandas as pd + + +def steps_in_horizon(horizon: pd.Timedelta, freq: str) -> int: + """Return the number of steps in a given horizon. + + Parameters + ---------- + horizon: + The horizon to be split into steps. + freq: + The frequency of the horizon. + + Returns + ------- + The number of steps in the horizon. + """ + freq = "1" + freq if not freq[0].isdigit() else freq + periods = horizon / pd.Timedelta(freq) + if not periods.is_integer(): + raise ValueError("Horizon is not a multiple of the frequency") + return int(periods) + + +def create_forecast_index(history: pd.DataFrame, horizon: int) -> pd.DatetimeIndex: + """Create time index for a forecast horizon. + + Parameters + ---------- + history: + The history of the time series. + horizon: + The forecast horizon. + + Returns + ------- + The time index for the forecast horizon. 
+ """ + last_date = history["ds"].iloc[-1] + inferred_freq = history["ds"].dt.freq + freq = "1" + inferred_freq if not inferred_freq[0].isdigit() else inferred_freq + return pd.date_range( + start=last_date + pd.Timedelta(freq), + periods=horizon, + freq=freq, + ) + + +def create_perfect_forecasts_from_covariates( + covariates: pd.DataFrame, + horizon: pd.Timedelta, + step: pd.Timedelta, + **kwargs, +) -> pd.DataFrame: + """Create forecasts from covariates. + + Sometimes external forecasts are not available for the entire horizon. This function creates + an external forecast dataframe from external covariates as a perfect forecast. + + Parameters + ---------- + covariates: + The external covariates. + horizon: + The forecast horizon. + step: + The step size between forecasts. + + Returns + ------- + The external forecast dataframe. + """ + if kwargs.get("start") is not None: + start = kwargs.get("start") + else: + start = covariates.index[0] + + last_date = covariates.index[-1] + + forecasts = [] + while start + horizon <= last_date: + forecast = covariates.loc[ + (covariates.index > start) & (covariates.index <= start + horizon) + ] + forecast.insert(0, "cutoff_date", start) + forecast.rename_axis("ds", inplace=True) + forecast.reset_index(inplace=True) + + if len(forecast) == 0: + warnings.warn( + f"Covariates not found for {start} - {start + horizon}, cannot make forecast at step {start}", + UserWarning, + stacklevel=2, + ) + + forecasts.append(forecast) + start += step + + forecast_df = pd.concat(forecasts, ignore_index=True) + return forecast_df diff --git a/src/enfobench/utils.py b/src/enfobench/utils.py deleted file mode 100644 index 702513c..0000000 --- a/src/enfobench/utils.py +++ /dev/null @@ -1,21 +0,0 @@ -import pandas as pd -from pandas import Timedelta - - -def steps_in_horizon(horizon: Timedelta, freq: str) -> int: - """Return the number of steps in a given horizon.""" - freq = "1" + freq if not freq[0].isdigit() else freq - periods = horizon / 
pd.Timedelta(freq) - assert periods.is_integer(), "Horizon is not a multiple of the frequency" - return int(periods) - - -def create_forecast_index(history: pd.DataFrame, horizon: int) -> pd.DatetimeIndex: - last_date = history["ds"].iloc[-1] - inferred_freq = history["ds"].dt.freq - freq = "1" + inferred_freq if not inferred_freq[0].isdigit() else inferred_freq - return pd.date_range( - start=last_date + pd.Timedelta(freq), - periods=horizon, - freq=freq, - ) diff --git a/tests/conftest.py b/tests/conftest.py index fc70d3f..39b0553 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,7 +3,7 @@ import pytest from enfobench.evaluation import ForecasterType, ModelInfo -from enfobench.utils import create_forecast_index +from enfobench.evaluation.utils import create_forecast_index class TestModel: @@ -43,7 +43,7 @@ def model(): @pytest.fixture(scope="session") -def target() -> pd.Series: +def target() -> pd.Series: index = pd.date_range("2020-01-01", "2020-02-01", freq="30T") y = pd.Series(np.random.random(len(index)), index=index) return y diff --git a/tests/test_evaluations/__init__.py b/tests/test_evaluations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_dataset.py b/tests/test_evaluations/test_dataset.py similarity index 100% rename from tests/test_dataset.py rename to tests/test_evaluations/test_dataset.py diff --git a/tests/test_evaluate.py b/tests/test_evaluations/test_evaluate.py similarity index 100% rename from tests/test_evaluate.py rename to tests/test_evaluations/test_evaluate.py diff --git a/tests/test_metrics.py b/tests/test_evaluations/test_metrics.py similarity index 100% rename from tests/test_metrics.py rename to tests/test_evaluations/test_metrics.py diff --git a/tests/test_server.py b/tests/test_evaluations/test_server.py similarity index 100% rename from tests/test_server.py rename to tests/test_evaluations/test_server.py diff --git a/tests/test_evaluations/test_utils.py 
b/tests/test_evaluations/test_utils.py new file mode 100644 index 0000000..c3a790b --- /dev/null +++ b/tests/test_evaluations/test_utils.py @@ -0,0 +1,49 @@ +import pandas as pd +import pytest + +from enfobench.evaluation import utils + + +@pytest.mark.parametrize( + "horizon, freq, expected", + [ + ("1 day", "15T", 96), + ("1 day", "1H", 24), + ("7 days", "1H", 7 * 24), + ("1D", "1D", 1), + ("1H", "1H", 1), + ], +) +def test_steps_in_horizon(horizon, freq, expected): + assert utils.steps_in_horizon(pd.Timedelta(horizon), freq) == expected + + +def test_steps_in_horizon_raises_with_non_multiple_horizon(): + with pytest.raises(ValueError): + utils.steps_in_horizon(pd.Timedelta("36 minutes"), "15T") + + +def test_create_forecast_index(target): + history = target.to_frame("y").rename_axis("ds").reset_index() + horizon = 96 + last_date = history["ds"].iloc[-1] + + index = utils.create_forecast_index(history, horizon) + + assert isinstance(index, pd.DatetimeIndex) + assert index.freq == target.index.freq + assert len(index) == horizon + assert all(idx > last_date for idx in index) + + +def test_create_perfect_forecasts_from_covariates(covariates): + forecasts = utils.create_perfect_forecasts_from_covariates( + covariates, + horizon=pd.Timedelta("7 days"), + step=pd.Timedelta("1D"), + ) + + assert isinstance(forecasts, pd.DataFrame) + assert "ds" in forecasts.columns + assert "cutoff_date" in forecasts.columns + assert all(col in forecasts.columns for col in covariates.columns)