Commit

Update versions of pandas, sqlalchemy, numpy, tensorflow, and scikit-learn. (#84)

* 📌 Updated versions for pandas, tensorflow, scikit-learn, sqlalchemy, and numpy.

* 📌 Updated versions for pandas, tensorflow, scikit-learn, sqlalchemy, and numpy.

* 🔖 Removed support for Python 3.8; added to CHANGELOG.md

* 🔖 Added support for Python 3.11

* 🚨 Fixed linter issues. 🐛 Fixed a bug in the UTC timezone check.

* 🚨 Fixed more linting issues

* 🚨 Fixed black compatibility

* 🔖 Removed support for Python 3.11

* 🐛 One-hot features in feature engineering now return int32

* 🐛 Used numeric_only instead of skipna in the rolling median.

* 🐛 Changed the Keras argument "lr" to "learning_rate" (see the optimizer sketch after this list)

* 🚨 Fixed linting issue

* 📝 Fixed docstring example to have the right History object.

* 🚨 Fixed warnings and made some examples work.

* 🚨 Fixed some whitespace

* 🚨 Fixed some whitespace

* 🚨 Linting?

* 🐛 Fixes the SHAP example.

* 🎨➖ Removes the dependency on eli5 and now computes its own feature importance scores (one possible replacement is sketched after the pyproject.toml diff below).

* 🎨 Changed the pandas hourly frequency alias ("H" → "h") in some files

* 🚨 Applied linter

* 📝 Added documentation and removed ELI5 comments in code.
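
For the "lr" → "learning_rate" rename above, a minimal hedged sketch (the model here is illustrative, not taken from this repo):

```python
# Recent TensorFlow/Keras releases accept only "learning_rate";
# the old "lr" alias was deprecated and later removed.
import tensorflow as tf

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)  # was: Adam(lr=1e-3)
model = tf.keras.Sequential([tf.keras.Input(shape=(4,)), tf.keras.layers.Dense(1)])
model.compile(optimizer=optimizer, loss="mse")
```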

---------

Co-authored-by: Adjorn <[email protected]>
amobular and Adjorn authored Sep 2, 2024
1 parent adfa231 commit 7963c7b
Showing 35 changed files with 206 additions and 156 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/unittest.yml
@@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10"]
python-version: ["3.9", "3.10"]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
2 changes: 1 addition & 1 deletion .readthedocs.yaml
@@ -3,7 +3,7 @@ version: 2
build:
os: "ubuntu-20.04"
tools:
python: "3.8"
python: "3.9"

# Build from the docs/ directory with Sphinx
sphinx:
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -9,6 +9,13 @@ Version X.Y.Z stands for:

-------------

## Version 3.2.0

### Changes
- Removed support for Python 3.8
- Updated pandas, sqlalchemy, tensorflow, numpy, and scikit-learn.
- Implemented the changes needed to keep behaviour unchanged across these upgrades.

## Version 3.1.11

### Changes
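The version bumps in this release rely on compatible-release pins (see pyproject.toml below). A quick sketch of what the `~=` operator accepts, using the `packaging` library (an assumption; it is the reference implementation of these specifiers):

```python
# "~=X.Y" means ">=X.Y, <X+1.0": minor and patch updates, same major series.
from packaging.specifiers import SpecifierSet

print("2.2.3" in SpecifierSet("~=2.2"))    # True  — pandas~=2.2 accepts any 2.x >= 2.2
print("3.0.0" in SpecifierSet("~=2.2"))    # False — the next major release is excluded
print("1.26.4" in SpecifierSet("~=1.26"))  # True  — numpy~=1.26 accepts 1.26+ within 1.x
```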
16 changes: 7 additions & 9 deletions pyproject.toml
@@ -22,10 +22,10 @@ packages = [

[project]
name = "sam"
version = "3.1.11"
version = "3.2.0"
description = "Time series anomaly detection and forecasting"
readme = "README.md"
requires-python = ">=3.8"
requires-python = ">=3.9"
license = {text = "MIT"}
authors = [{name = "Royal HaskoningDHV", email = "[email protected]"}]
keywords = ["python", "data-science", "time-series", "forecasting", "anomaly-detection", "asset-management"]
@@ -35,7 +35,7 @@ classifiers = [
"Operating System :: OS Independent",
]

dependencies = ["pandas~=1.3", "numpy>=1.22,<1.24", "scikit-learn~=1.1,<1.3"]
dependencies = ["pandas~=2.2", "numpy~=1.26", "scikit-learn~=1.5"]

[project.optional-dependencies]
all = [
@@ -46,8 +46,7 @@ all = [
"requests",
"scipy",
"seaborn",
"tensorflow>=2.9.1,<2.13.0",
"eli5",
"tensorflow~=2.17,<3",
"Jinja2~=3.0.3",
"shap",
"plotly",
@@ -56,12 +55,11 @@ all = [
plotting = ["matplotlib", "plotly", "seaborn"]
data-engineering = ["requests", "pymongo"]
data-science = [
"tensorflow>=2.9.1,<3",
"tensorflow~=2.17,<3",
"cloudpickle",
"nfft",
"scipy",
"shap",
"eli5",
"Jinja2~=3.0.3",
"statsmodels"
]
@@ -74,7 +72,7 @@ docs = [
"readthedocs-sphinx-search",
"sphinx-markdown-tables",
"toml",
"tensorflow>=2.9.1,<3",
"tensorflow~=2.17,<3",
]

[project.urls]
@@ -85,7 +83,7 @@ documentation = "https://sam-rhdhv.readthedocs.io/en/latest/"

[tool.black]
line-length = 99
-target-version = ['py38', 'py39', 'py310']
+target-version = ['py39', 'py310']

[tool.isort]
profile = "black"
8 changes: 4 additions & 4 deletions sam/data_sources/synthetic_data.py
@@ -224,7 +224,7 @@ def synthetic_timeseries(
def synthetic_date_range(
start="2016-01-01",
end="2017-01-01",
freq="H",
freq="h",
max_delay=0,
random_stop_freq=0,
random_stop_max_length=1,
@@ -252,7 +252,7 @@ def synthetic_date_range(
Left bound for generating dates.
end: str or datetime-like, optional (default='2017-01-01')
Right bound for generating dates. Exclusive bound.
-freq: str or DateOffset, optional (default='H') (hourly)
+freq: str or DateOffset, optional (default='h') (hourly)
Frequency strings can have multiples, e.g. '5H'. See `here for a list of frequency aliases.
<https://pandas.pydata.org/pandas-docs/stable/timeseries.html#timeseries-offset-aliases`_
max_delay: numeric, optional (default=0)
@@ -277,15 +277,15 @@
--------
>>> # Generate times with point approximately every 6 hours
>>> from sam.data_sources.synthetic_data import synthetic_date_range
->>> synthetic_date_range('2016-01-01', '2016-01-02', '6H', 600, 0, 1, seed=0)
+>>> synthetic_date_range('2016-01-01', '2016-01-02', '6h', 600, 0, 1, seed=0)
DatetimeIndex(['2016-01-01 00:05:29.288102356',
'2016-01-01 06:12:38.401722180',
'2016-01-01 12:18:40.059747823',
'2016-01-01 18:24:06.989657621'],
dtype='datetime64[ns]', freq=None)
>>> # Generate times with very likely stops of length 1
->>> synthetic_date_range('2016-01-01', '2016-01-02', 'H', 0, 0.5, 1, seed=0)
+>>> synthetic_date_range('2016-01-01', '2016-01-02', 'h', 0, 0.5, 1, seed=0)
DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 01:00:00',
'2016-01-01 02:00:00', '2016-01-01 03:00:00',
'2016-01-01 04:00:00', '2016-01-01 05:00:00',
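The "H" → "h" edits in this file track the pandas 2.2 deprecation of the uppercase hourly alias; a minimal sketch:

```python
# pandas 2.2 deprecates freq="H" in favour of "h"; the uppercase alias
# still works but emits a FutureWarning and is slated for removal.
import pandas as pd

idx = pd.date_range("2016-01-01", "2016-01-02", freq="6h")
print(idx[:2])  # DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 06:00:00'], ...)
```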
18 changes: 9 additions & 9 deletions sam/data_sources/tests/test_synthetic_date_range.py
@@ -8,19 +8,19 @@

class TestCreateSyntheticTimes(unittest.TestCase):
def test_nonoise(self):
result = synthetic_date_range(start="2016-01-01", end="2016-01-01 03:00:00", freq="H")
result = synthetic_date_range(start="2016-01-01", end="2016-01-01 03:00:00", freq="h")
expected = pd.DatetimeIndex(
np.array(["2016-01-01 00:00:00", "2016-01-01 01:00:00", "2016-01-01 02:00:00"])
)
assert_array_equal(result, expected)

def test_shortseries(self):
-result = synthetic_date_range(start="2016-01-01", end="2016-01-01 01:00:00", freq="H")
+result = synthetic_date_range(start="2016-01-01", end="2016-01-01 01:00:00", freq="h")
expected = pd.DatetimeIndex(np.array(["2016-01-01 00:00:00"]))
assert_array_equal(result, expected)

def test_emptyseries(self):
-result = synthetic_date_range(start="2016-01-01", end="2016-01-01 00:30:00", freq="2H")
+result = synthetic_date_range(start="2016-01-01", end="2016-01-01 00:30:00", freq="2h")
expected = pd.DatetimeIndex(np.array(["2016-01-01 00:00:00"]))
assert_array_equal(result, expected)

@@ -30,7 +30,7 @@ def test_incorrect_input(self):
synthetic_date_range,
"2016-01-01 02:00:00",
"2016-01-01 00:30:00",
"2H",
"2h",
)
self.assertRaises(Exception, synthetic_date_range, "2016-01-01", "2017-01-01", "1 hour")
self.assertRaises(
@@ -65,7 +65,7 @@ def test_delays(self):
result = synthetic_date_range(
start="2016-01-01 00:00:00",
end="2017-01-01 00:30:00",
freq="H",
freq="h",
max_delay=600,
)
# Delays should be between 1 hour and 1H10M (1 hour normal, 0-10 min delay)
@@ -83,7 +83,7 @@ def test_random_stops(self):
result = synthetic_date_range(
start="2016-01-01 00:00:00",
end="2016-02-01 00:30:00",
freq="H",
freq="h",
random_stop_freq=0.5,
)
# approx half of all points should have been removed.
@@ -97,7 +97,7 @@ def test_longer_stops(self):
result = synthetic_date_range(
start="2016-01-01 00:00:00",
end="2016-02-01 00:30:00",
freq="H",
freq="h",
random_stop_freq=0.1,
random_stop_max_length=3,
)
@@ -111,7 +111,7 @@ def test_seed(self):
foo = synthetic_date_range(
start="2016-01-01 00:00:00",
end="2016-02-01 00:30:00",
freq="H",
freq="h",
max_delay=100,
random_stop_freq=0.1,
random_stop_max_length=3,
@@ -120,7 +120,7 @@ def test_seed(self):
bar = synthetic_date_range(
start="2016-01-01 00:00:00",
end="2016-02-01 00:30:00",
freq="H",
freq="h",
max_delay=100,
random_stop_freq=0.1,
random_stop_max_length=3,
4 changes: 2 additions & 2 deletions sam/data_sources/tests/test_synthetic_timeseries.py
@@ -8,8 +8,8 @@

class TestSyntheticTimeseries(unittest.TestCase):
def setUp(self):
-self.dates = pd.date_range("2015-01-01", "2015-01-01 03:00:00", freq="H").to_series()
-self.many_dates = pd.date_range("2015-01-01", "2015-02-01 00:00:00", freq="H").to_series()
+self.dates = pd.date_range("2015-01-01", "2015-01-01 03:00:00", freq="h").to_series()
+self.many_dates = pd.date_range("2015-01-01", "2015-02-01 00:00:00", freq="h").to_series()

def test_nonoise(self):
result = synthetic_timeseries(self.dates)
2 changes: 1 addition & 1 deletion sam/data_sources/tests/test_weather.py
@@ -57,7 +57,7 @@ def test_read_knmi_hourly(self):
self.assertEqual(result.columns.tolist(), ["RH", "SQ", "N", "TIME"])

expected_time = pd.Series(
pd.date_range("2016-03-07 06:00:00", "2016-03-07 12:00:00", freq="H")
pd.date_range("2016-03-07 06:00:00", "2016-03-07 12:00:00", freq="h")
)
expected_time.name = "TIME"
assert_series_equal(expected_time, result["TIME"])
6 changes: 3 additions & 3 deletions sam/data_sources/weather/knmi.py
@@ -93,15 +93,15 @@ def _parse_knmi_measurements(knmi_raw, freq, start=None, end=None):
knmi.columns = columns

if freq == "hourly":
knmi["H"] = pd.to_numeric(knmi["H"]) # needs to be numeric to subtract 1
knmi["HH"] = pd.to_numeric(knmi["HH"]) # needs to be numeric to subtract 1
# Subtract 1 from H since it runs from 1 to 24, which will make datetime conversion fail
knmi["TIME"] = knmi["YYYYMMDD"].astype(str) + " " + (knmi["H"] - 1).astype(str) + ":00:00"
knmi["TIME"] = knmi["YYYYMMDD"].astype(str) + " " + (knmi["HH"] - 1).astype(str) + ":00:00"
elif freq == "daily":
knmi["TIME"] = knmi["YYYYMMDD"].astype(str) + " 00:00:00"
else:
raise ValueError('freq must be either "hourly" or "daily"')

-knmi = knmi.drop(["YYYYMMDD", "H"], axis=1, errors="ignore")
+knmi = knmi.drop(["YYYYMMDD", "HH"], axis=1, errors="ignore")
knmi["TIME"] = pd.to_datetime(knmi["TIME"], format="%Y%m%d %H:%M:%S")

if freq == "hourly":
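A self-contained sketch of the hour arithmetic above, using hypothetical sample rows rather than a real KNMI response:

```python
# KNMI hourly data labels hours 1..24, so subtract 1 before parsing;
# otherwise hour 24 cannot be converted to a datetime.
import pandas as pd

knmi = pd.DataFrame({"YYYYMMDD": ["20160307", "20160307"], "HH": ["1", "24"]})
knmi["HH"] = pd.to_numeric(knmi["HH"])  # numeric, so we can subtract 1
knmi["TIME"] = knmi["YYYYMMDD"].astype(str) + " " + (knmi["HH"] - 1).astype(str) + ":00:00"
knmi["TIME"] = pd.to_datetime(knmi["TIME"], format="%Y%m%d %H:%M:%S")
print(knmi["TIME"].tolist())  # [2016-03-07 00:00:00, 2016-03-07 23:00:00]
```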
2 changes: 1 addition & 1 deletion sam/data_sources/weather/openweathermap.py
@@ -1,7 +1,7 @@
import logging

import pandas as pd
-from pandas.io.json import json_normalize
+from pandas import json_normalize
from sam import config # Credentials file
from sam.logging_functions import log_dataframe_characteristics

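The import move above reflects pandas' API cleanup; a quick sketch with a hypothetical payload:

```python
# pandas.io.json.json_normalize was deprecated in pandas 1.0 and removed in
# pandas 2.x; the supported import is the top-level one.
from pandas import json_normalize

payload = [{"main": {"temp": 281.4, "humidity": 93}, "dt": 1600000000}]
print(json_normalize(payload))  # nested keys flatten to "main.temp", "main.humidity"
```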
8 changes: 5 additions & 3 deletions sam/exploration/find_incidents.py
@@ -279,9 +279,11 @@ def incident_curves_information(
np.where(
data.ACTUAL > data.PREDICT_HIGH,
(data.ACTUAL - data.PREDICT_HIGH) / (1 + data.PREDICT_HIGH - data.PREDICT),
-(data.PREDICT_LOW - data.ACTUAL) / (1 + data.PREDICT - data.PREDICT_LOW)
-if under_conf_interval
-else 0,
+(
+    (data.PREDICT_LOW - data.ACTUAL) / (1 + data.PREDICT - data.PREDICT_LOW)
+    if under_conf_interval
+    else 0
+),
),
0,
)
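A toy sketch of the severity expression reformatted above, with hypothetical values and without the function's outer masking of non-incident rows:

```python
import numpy as np
import pandas as pd

data = pd.DataFrame(
    {"ACTUAL": [12.0, 2.0], "PREDICT": [5.0, 5.0],
     "PREDICT_LOW": [3.0, 3.0], "PREDICT_HIGH": [7.0, 7.0]}
)
under_conf_interval = True
severity = np.where(
    data.ACTUAL > data.PREDICT_HIGH,
    # overshoot above the upper bound, scaled by the interval width
    (data.ACTUAL - data.PREDICT_HIGH) / (1 + data.PREDICT_HIGH - data.PREDICT),
    # undershoot below the lower bound, counted only when configured
    (
        (data.PREDICT_LOW - data.ACTUAL) / (1 + data.PREDICT - data.PREDICT_LOW)
        if under_conf_interval
        else 0
    ),
)
print(np.round(severity, 3))  # [1.667 0.333]
```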
8 changes: 4 additions & 4 deletions sam/feature_engineering/decompose_datetime.py
@@ -1,10 +1,10 @@
+import datetime
import logging
from dataclasses import dataclass
from typing import List, Optional, Sequence, Tuple, Union, cast

import numpy as np
import pandas as pd
-import pytz
from sam.logging_functions import log_dataframe_characteristics, log_new_columns

logger = logging.getLogger(__name__)
@@ -162,7 +162,7 @@ def decompose_datetime(
# Fix timezone
if timezone is not None:
if timecol.dt.tz is not None:
-if timecol.dt.tz != pytz.utc:
+if timecol.dt.tz != datetime.timezone.utc:
raise ValueError(
"Data should either be in UTC timezone or it should have no"
" timezone information (assumed to be in UTC)"
@@ -387,13 +387,13 @@ def recode_onehot_features(
raise ValueError(f"{col} is not in input dataframe")

# get the onehot encoded dummies
-dummies: pd.DataFrame = pd.get_dummies(df[col], prefix=col).astype(int)
+dummies: pd.DataFrame = pd.get_dummies(df[col], prefix=col)

# fill in the weekdays not in the dataset
for i in range(onehot_min, onehot_max):
if not "%s_%d" % (col, i) in dummies.columns:
dummies["%s_%d" % (col, i)] = 0
-dummies_sorted = dummies[np.sort(dummies.columns)]
+dummies_sorted = dummies[np.sort(dummies.columns)].astype("int32")
new_df = new_df.join(dummies_sorted)

# drop the original. if keep_original is False, this is unneeded: it was already removed
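A small sketch of the fixed UTC check, assuming incoming timestamps carry the stdlib UTC object:

```python
# The comparison now targets datetime.timezone.utc instead of pytz.utc,
# dropping the pytz dependency.
import datetime
import pandas as pd

timecol = pd.Series(
    pd.date_range("2021-01-01", periods=3, freq="h", tz=datetime.timezone.utc)
)
if timecol.dt.tz is not None and timecol.dt.tz != datetime.timezone.utc:
    raise ValueError("Data should be in UTC or timezone-naive")
print(timecol.dt.tz)  # UTC
```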
2 changes: 1 addition & 1 deletion sam/feature_engineering/rolling_features.py
@@ -37,7 +37,7 @@ def _nfft_helper(
# first convert time index to nanoseconds since epoch
# If a non-pandas timestamp index is used, these units aren't nanoseconds
# However, since we normalize to [-0.5, 0.5] anyway, it doesn't matter
-time = np.array(series.index.astype(int))
+time = np.array(series.index.astype("int64"))
# then make first value 0
time -= np.min(time)
# normalize to run from -0.5 to 0.5
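A short sketch of why the cast is spelled "int64": plain astype(int) resolves to a 32-bit integer on some platforms (notably Windows), which overflows for nanosecond timestamps:

```python
import numpy as np
import pandas as pd

series = pd.Series(range(3), index=pd.date_range("2021-01-01", periods=3, freq="h"))
time = np.array(series.index.astype("int64"))  # nanoseconds since the epoch
time -= np.min(time)              # shift so the first value is 0
time = time / np.max(time) - 0.5  # normalize to [-0.5, 0.5]
print(time)                       # [-0.5  0.   0.5]
```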
2 changes: 1 addition & 1 deletion sam/feature_engineering/simple_feature_engineering.py
@@ -178,7 +178,7 @@ def _get_time_features(self, X: pd.DataFrame) -> pd.DataFrame:
for value in range(comp_min, comp_max + 1):
comp_series = self._get_time_column(X, component)
colname_ = f"{colname}_{value}"
-X_out[colname_] = (comp_series == value).astype(int)
+X_out[colname_] = (comp_series == value).astype("int32")

elif type == "cyclical":
comp_series = self._get_time_column(X, component)
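A minimal sketch of the dtype pin above: boolean comparisons yield bool columns and astype(int) is platform-dependent, so the one-hot features are pinned to "int32" explicitly:

```python
import pandas as pd

hours = pd.Series([0, 1, 1, 3], name="TIME_hour")
print((hours == 1).dtype)                     # bool
print((hours == 1).astype("int32").tolist())  # [0, 1, 1, 0]
print(pd.get_dummies(pd.Series(list("aba"))).dtypes.unique())  # [dtype('bool')] since pandas 2.0
```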
7 changes: 5 additions & 2 deletions sam/feature_engineering/tests/test_decompose_datetime.py
@@ -56,6 +56,7 @@ def test_cyclicals(self):
{"TIME": daterange, "OTHER": 1, "TIME_hour": [0, 1, 2, 3, 4]},
columns=["TIME", "OTHER", "TIME_hour"],
)
+expected = expected.astype({"TIME_hour": "int32"})
assert_frame_equal(result, expected)

# add cyclical test without keeping original
@@ -90,6 +91,7 @@ def test_cyclicals(self):
},
columns=["TIME", "OTHER", "TIME_hour", "TIME_hour_sin", "TIME_hour_cos"],
)
+expected = expected.astype({"TIME_hour": "int32"})
assert_frame_equal(result, expected)

def test_incorrect_cyclical_cols(self):
@@ -175,7 +177,7 @@ def test_remove_original(self):
daterange = pd.date_range(time1, time2, freq=freq)

data = pd.DataFrame({"TIME": daterange, "OTHER": 1})
expected = pd.DataFrame({"TIME_minute": [8, 23, 38, 53, 8]})
expected = pd.DataFrame({"TIME_minute": [8, 23, 38, 53, 8]}, dtype="int32")

result = decompose_datetime(data, "TIME", ["minute"], keep_original=False)
assert_frame_equal(result, expected)
@@ -256,7 +258,7 @@ def test_recode_onehot_datetime(self):
test_dataframe = pd.DataFrame({"TIME": daterange, "OTHER": 1})

result = decompose_datetime(test_dataframe, components=["weekday"], onehots=["weekday"])

+expected_data_columns = [f"TIME_weekday_{i}" for i in range(7)]
expected = pd.DataFrame(
{
"TIME": test_dataframe["TIME"],
Expand All @@ -270,6 +272,7 @@ def test_recode_onehot_datetime(self):
"TIME_weekday_6": [0, 0, 0, 0],
}
)
+expected = expected.astype({col: "int32" for col in expected_data_columns})

assert_frame_equal(result, expected)

