Commit

Update versions of pandas, sqlalchemy, numpy, tensorflow, and scikit-learn. (#84)

* 📌 Updated versions for pandas, tensorflow, scikit-learn, sqlalchemy, and numpy.

* 📌 Updated versions for pandas, tensorflow, scikit-learn, sqlalchemy, and numpy.

* 🔖 Removed support for Python 3.8; added to CHANGELOG.md

* 🔖 Added support for Python 3.11

* 🚨 Fixed linter issues. 🐛 Fixed a bug in the UTC timezone check.

* 🚨 Fixed more linting issues

* 🚨 Fixed black compatibility

* 🔖 Removed support for Python 3.11

* 🐛 One-hot features in feature engineering now return int32

* 🐛 Used numeric_only instead of skipna in the rolling median.

* 🐛 Changed the Keras argument "lr" to "learning_rate" (see the optimizer sketch after this list)

* 🚨 Fixed linting issue

* 📝 Fixed docstring example to have the right History object.

* 🚨 Fixed warnings and made some examples work.

* 🚨 Fixed some whitespace

* 🚨 Fixed some whitespace

* 🚨 Linting?

* 🐛 Fixes the SHAP example.

* 🎨➖ Removes the dependency on eli5 and now computes its own feature importance scores (one possible replacement is sketched after the pyproject.toml diff below).

* 🎨 Changed the pandas hourly frequency alias ("H" → "h") in some files

* 🚨 Applied linter

* 📝 Added documentation and removed ELI5 comments in code.
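
For the "lr" → "learning_rate" rename above, a minimal hedged sketch (the model here is illustrative, not taken from this repo):

```python
# Recent TensorFlow/Keras releases accept only "learning_rate";
# the old "lr" alias was deprecated and later removed.
import tensorflow as tf

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)  # was: Adam(lr=1e-3)
model = tf.keras.Sequential([tf.keras.Input(shape=(4,)), tf.keras.layers.Dense(1)])
model.compile(optimizer=optimizer, loss="mse")
```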

---------

Co-authored-by: Adjorn <[email protected]>
amobular and Adjorn authored Sep 2, 2024
1 parent adfa231 commit 7963c7b
Showing 35 changed files with 206 additions and 156 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/unittest.yml
@@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10"]
python-version: ["3.9", "3.10"]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
2 changes: 1 addition & 1 deletion .readthedocs.yaml
@@ -3,7 +3,7 @@ version: 2
build:
os: "ubuntu-20.04"
tools:
python: "3.8"
python: "3.9"

# Build from the docs/ directory with Sphinx
sphinx:
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -9,6 +9,13 @@ Version X.Y.Z stands for:

-------------

## Version 3.2.0

### Changes
- Removed support for Python 3.8
- Updated pandas, sqlalchemy, tensorflow, numpy, and scikit-learn.
- Implemented the changes needed to keep behaviour unchanged across these upgrades.

## Version 3.1.11

### Changes
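The version bumps in this release rely on compatible-release pins (see pyproject.toml below). A quick sketch of what the `~=` operator accepts, using the `packaging` library (an assumption; it is the reference implementation of these specifiers):

```python
# "~=X.Y" means ">=X.Y, <X+1.0": minor and patch updates, same major series.
from packaging.specifiers import SpecifierSet

print("2.2.3" in SpecifierSet("~=2.2"))    # True  — pandas~=2.2 accepts any 2.x >= 2.2
print("3.0.0" in SpecifierSet("~=2.2"))    # False — the next major release is excluded
print("1.26.4" in SpecifierSet("~=1.26"))  # True  — numpy~=1.26 accepts 1.26+ within 1.x
```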
16 changes: 7 additions & 9 deletions pyproject.toml
@@ -22,10 +22,10 @@ packages = [

[project]
name = "sam"
version = "3.1.11"
version = "3.2.0"
description = "Time series anomaly detection and forecasting"
readme = "README.md"
requires-python = ">=3.8"
requires-python = ">=3.9"
license = {text = "MIT"}
authors = [{name = "Royal HaskoningDHV", email = "[email protected]"}]
keywords = ["python", "data-science", "time-series", "forecasting", "anomaly-detection", "asset-management"]
@@ -35,7 +35,7 @@ classifiers = [
"Operating System :: OS Independent",
]

dependencies = ["pandas~=1.3", "numpy>=1.22,<1.24", "scikit-learn~=1.1,<1.3"]
dependencies = ["pandas~=2.2", "numpy~=1.26", "scikit-learn~=1.5"]

[project.optional-dependencies]
all = [
@@ -46,8 +46,7 @@ all = [
"requests",
"scipy",
"seaborn",
"tensorflow>=2.9.1,<2.13.0",
"eli5",
"tensorflow~=2.17,<3",
"Jinja2~=3.0.3",
"shap",
"plotly",
@@ -56,12 +55,11 @@ all = [
plotting = ["matplotlib", "plotly", "seaborn"]
data-engineering = ["requests", "pymongo"]
data-science = [
"tensorflow>=2.9.1,<3",
"tensorflow~=2.17,<3",
"cloudpickle",
"nfft",
"scipy",
"shap",
"eli5",
"Jinja2~=3.0.3",
"statsmodels"
]
@@ -74,7 +72,7 @@ docs = [
"readthedocs-sphinx-search",
"sphinx-markdown-tables",
"toml",
"tensorflow>=2.9.1,<3",
"tensorflow~=2.17,<3",
]

[project.urls]
@@ -85,7 +83,7 @@ documentation = "https://sam-rhdhv.readthedocs.io/en/latest/"

[tool.black]
line-length = 99
-target-version = ['py38', 'py39', 'py310']
+target-version = ['py39', 'py310']

[tool.isort]
profile = "black"
8 changes: 4 additions & 4 deletions sam/data_sources/synthetic_data.py
@@ -224,7 +224,7 @@ def synthetic_timeseries(
def synthetic_date_range(
start="2016-01-01",
end="2017-01-01",
freq="H",
freq="h",
max_delay=0,
random_stop_freq=0,
random_stop_max_length=1,
@@ -252,7 +252,7 @@ def synthetic_date_range(
Left bound for generating dates.
end: str or datetime-like, optional (default='2017-01-01')
Right bound for generating dates. Exclusive bound.
-freq: str or DateOffset, optional (default='H') (hourly)
+freq: str or DateOffset, optional (default='h') (hourly)
Frequency strings can have multiples, e.g. '5H'. See `here for a list of frequency aliases.
<https://pandas.pydata.org/pandas-docs/stable/timeseries.html#timeseries-offset-aliases`_
max_delay: numeric, optional (default=0)
@@ -277,15 +277,15 @@
--------
>>> # Generate times with point approximately every 6 hours
>>> from sam.data_sources.synthetic_data import synthetic_date_range
->>> synthetic_date_range('2016-01-01', '2016-01-02', '6H', 600, 0, 1, seed=0)
+>>> synthetic_date_range('2016-01-01', '2016-01-02', '6h', 600, 0, 1, seed=0)
DatetimeIndex(['2016-01-01 00:05:29.288102356',
'2016-01-01 06:12:38.401722180',
'2016-01-01 12:18:40.059747823',
'2016-01-01 18:24:06.989657621'],
dtype='datetime64[ns]', freq=None)
>>> # Generate times with very likely stops of length 1
->>> synthetic_date_range('2016-01-01', '2016-01-02', 'H', 0, 0.5, 1, seed=0)
+>>> synthetic_date_range('2016-01-01', '2016-01-02', 'h', 0, 0.5, 1, seed=0)
DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 01:00:00',
'2016-01-01 02:00:00', '2016-01-01 03:00:00',
'2016-01-01 04:00:00', '2016-01-01 05:00:00',
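The "H" → "h" edits in this file track the pandas 2.2 deprecation of the uppercase hourly alias; a minimal sketch:

```python
# pandas 2.2 deprecates freq="H" in favour of "h"; the uppercase alias
# still works but emits a FutureWarning and is slated for removal.
import pandas as pd

idx = pd.date_range("2016-01-01", "2016-01-02", freq="6h")
print(idx[:2])  # DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 06:00:00'], ...)
```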
18 changes: 9 additions & 9 deletions sam/data_sources/tests/test_synthetic_date_range.py
@@ -8,19 +8,19 @@

class TestCreateSyntheticTimes(unittest.TestCase):
def test_nonoise(self):
result = synthetic_date_range(start="2016-01-01", end="2016-01-01 03:00:00", freq="H")
result = synthetic_date_range(start="2016-01-01", end="2016-01-01 03:00:00", freq="h")
expected = pd.DatetimeIndex(
np.array(["2016-01-01 00:00:00", "2016-01-01 01:00:00", "2016-01-01 02:00:00"])
)
assert_array_equal(result, expected)

def test_shortseries(self):
-result = synthetic_date_range(start="2016-01-01", end="2016-01-01 01:00:00", freq="H")
+result = synthetic_date_range(start="2016-01-01", end="2016-01-01 01:00:00", freq="h")
expected = pd.DatetimeIndex(np.array(["2016-01-01 00:00:00"]))
assert_array_equal(result, expected)

def test_emptyseries(self):
-result = synthetic_date_range(start="2016-01-01", end="2016-01-01 00:30:00", freq="2H")
+result = synthetic_date_range(start="2016-01-01", end="2016-01-01 00:30:00", freq="2h")
expected = pd.DatetimeIndex(np.array(["2016-01-01 00:00:00"]))
assert_array_equal(result, expected)

@@ -30,7 +30,7 @@ def test_incorrect_input(self):
synthetic_date_range,
"2016-01-01 02:00:00",
"2016-01-01 00:30:00",
"2H",
"2h",
)
self.assertRaises(Exception, synthetic_date_range, "2016-01-01", "2017-01-01", "1 hour")
self.assertRaises(
@@ -65,7 +65,7 @@ def test_delays(self):
result = synthetic_date_range(
start="2016-01-01 00:00:00",
end="2017-01-01 00:30:00",
freq="H",
freq="h",
max_delay=600,
)
# Delays should be between 1 hour and 1H10M (1 hour normal, 0-10 min delay)
@@ -83,7 +83,7 @@ def test_random_stops(self):
result = synthetic_date_range(
start="2016-01-01 00:00:00",
end="2016-02-01 00:30:00",
freq="H",
freq="h",
random_stop_freq=0.5,
)
# approx half of all points should have been removed.
@@ -97,7 +97,7 @@ def test_longer_stops(self):
result = synthetic_date_range(
start="2016-01-01 00:00:00",
end="2016-02-01 00:30:00",
freq="H",
freq="h",
random_stop_freq=0.1,
random_stop_max_length=3,
)
@@ -111,7 +111,7 @@ def test_seed(self):
foo = synthetic_date_range(
start="2016-01-01 00:00:00",
end="2016-02-01 00:30:00",
freq="H",
freq="h",
max_delay=100,
random_stop_freq=0.1,
random_stop_max_length=3,
@@ -120,7 +120,7 @@ def test_seed(self):
bar = synthetic_date_range(
start="2016-01-01 00:00:00",
end="2016-02-01 00:30:00",
freq="H",
freq="h",
max_delay=100,
random_stop_freq=0.1,
random_stop_max_length=3,
4 changes: 2 additions & 2 deletions sam/data_sources/tests/test_synthetic_timeseries.py
@@ -8,8 +8,8 @@

class TestSyntheticTimeseries(unittest.TestCase):
def setUp(self):
-self.dates = pd.date_range("2015-01-01", "2015-01-01 03:00:00", freq="H").to_series()
-self.many_dates = pd.date_range("2015-01-01", "2015-02-01 00:00:00", freq="H").to_series()
+self.dates = pd.date_range("2015-01-01", "2015-01-01 03:00:00", freq="h").to_series()
+self.many_dates = pd.date_range("2015-01-01", "2015-02-01 00:00:00", freq="h").to_series()

def test_nonoise(self):
result = synthetic_timeseries(self.dates)
2 changes: 1 addition & 1 deletion sam/data_sources/tests/test_weather.py
@@ -57,7 +57,7 @@ def test_read_knmi_hourly(self):
self.assertEqual(result.columns.tolist(), ["RH", "SQ", "N", "TIME"])

expected_time = pd.Series(
pd.date_range("2016-03-07 06:00:00", "2016-03-07 12:00:00", freq="H")
pd.date_range("2016-03-07 06:00:00", "2016-03-07 12:00:00", freq="h")
)
expected_time.name = "TIME"
assert_series_equal(expected_time, result["TIME"])
6 changes: 3 additions & 3 deletions sam/data_sources/weather/knmi.py
@@ -93,15 +93,15 @@ def _parse_knmi_measurements(knmi_raw, freq, start=None, end=None):
knmi.columns = columns

if freq == "hourly":
knmi["H"] = pd.to_numeric(knmi["H"]) # needs to be numeric to subtract 1
knmi["HH"] = pd.to_numeric(knmi["HH"]) # needs to be numeric to subtract 1
# Subtract 1 from H since it runs from 1 to 24, which will make datetime conversion fail
knmi["TIME"] = knmi["YYYYMMDD"].astype(str) + " " + (knmi["H"] - 1).astype(str) + ":00:00"
knmi["TIME"] = knmi["YYYYMMDD"].astype(str) + " " + (knmi["HH"] - 1).astype(str) + ":00:00"
elif freq == "daily":
knmi["TIME"] = knmi["YYYYMMDD"].astype(str) + " 00:00:00"
else:
raise ValueError('freq must be either "hourly" or "daily"')

-knmi = knmi.drop(["YYYYMMDD", "H"], axis=1, errors="ignore")
+knmi = knmi.drop(["YYYYMMDD", "HH"], axis=1, errors="ignore")
knmi["TIME"] = pd.to_datetime(knmi["TIME"], format="%Y%m%d %H:%M:%S")

if freq == "hourly":
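A self-contained sketch of the hour arithmetic above, using hypothetical sample rows rather than a real KNMI response:

```python
# KNMI hourly data labels hours 1..24, so subtract 1 before parsing;
# otherwise hour 24 cannot be converted to a datetime.
import pandas as pd

knmi = pd.DataFrame({"YYYYMMDD": ["20160307", "20160307"], "HH": ["1", "24"]})
knmi["HH"] = pd.to_numeric(knmi["HH"])  # numeric, so we can subtract 1
knmi["TIME"] = knmi["YYYYMMDD"].astype(str) + " " + (knmi["HH"] - 1).astype(str) + ":00:00"
knmi["TIME"] = pd.to_datetime(knmi["TIME"], format="%Y%m%d %H:%M:%S")
print(knmi["TIME"].tolist())  # [2016-03-07 00:00:00, 2016-03-07 23:00:00]
```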
2 changes: 1 addition & 1 deletion sam/data_sources/weather/openweathermap.py
@@ -1,7 +1,7 @@
import logging

import pandas as pd
-from pandas.io.json import json_normalize
+from pandas import json_normalize
from sam import config # Credentials file
from sam.logging_functions import log_dataframe_characteristics

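The import move above reflects pandas' API cleanup; a quick sketch with a hypothetical payload:

```python
# pandas.io.json.json_normalize was deprecated in pandas 1.0 and removed in
# pandas 2.x; the supported import is the top-level one.
from pandas import json_normalize

payload = [{"main": {"temp": 281.4, "humidity": 93}, "dt": 1600000000}]
print(json_normalize(payload))  # nested keys flatten to "main.temp", "main.humidity"
```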
8 changes: 5 additions & 3 deletions sam/exploration/find_incidents.py
@@ -279,9 +279,11 @@ def incident_curves_information(
np.where(
data.ACTUAL > data.PREDICT_HIGH,
(data.ACTUAL - data.PREDICT_HIGH) / (1 + data.PREDICT_HIGH - data.PREDICT),
-(data.PREDICT_LOW - data.ACTUAL) / (1 + data.PREDICT - data.PREDICT_LOW)
-if under_conf_interval
-else 0,
+(
+    (data.PREDICT_LOW - data.ACTUAL) / (1 + data.PREDICT - data.PREDICT_LOW)
+    if under_conf_interval
+    else 0
+),
),
0,
)
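A toy sketch of the severity expression reformatted above, with hypothetical values and without the function's outer masking of non-incident rows:

```python
import numpy as np
import pandas as pd

data = pd.DataFrame(
    {"ACTUAL": [12.0, 2.0], "PREDICT": [5.0, 5.0],
     "PREDICT_LOW": [3.0, 3.0], "PREDICT_HIGH": [7.0, 7.0]}
)
under_conf_interval = True
severity = np.where(
    data.ACTUAL > data.PREDICT_HIGH,
    # overshoot above the upper bound, scaled by the interval width
    (data.ACTUAL - data.PREDICT_HIGH) / (1 + data.PREDICT_HIGH - data.PREDICT),
    # undershoot below the lower bound, counted only when configured
    (
        (data.PREDICT_LOW - data.ACTUAL) / (1 + data.PREDICT - data.PREDICT_LOW)
        if under_conf_interval
        else 0
    ),
)
print(np.round(severity, 3))  # [1.667 0.333]
```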
8 changes: 4 additions & 4 deletions sam/feature_engineering/decompose_datetime.py
@@ -1,10 +1,10 @@
+import datetime
import logging
from dataclasses import dataclass
from typing import List, Optional, Sequence, Tuple, Union, cast

import numpy as np
import pandas as pd
-import pytz
from sam.logging_functions import log_dataframe_characteristics, log_new_columns

logger = logging.getLogger(__name__)
@@ -162,7 +162,7 @@ def decompose_datetime(
# Fix timezone
if timezone is not None:
if timecol.dt.tz is not None:
-if timecol.dt.tz != pytz.utc:
+if timecol.dt.tz != datetime.timezone.utc:
raise ValueError(
"Data should either be in UTC timezone or it should have no"
" timezone information (assumed to be in UTC)"
@@ -387,13 +387,13 @@ def recode_onehot_features(
raise ValueError(f"{col} is not in input dataframe")

# get the onehot encoded dummies
-dummies: pd.DataFrame = pd.get_dummies(df[col], prefix=col).astype(int)
+dummies: pd.DataFrame = pd.get_dummies(df[col], prefix=col)

# fill in the weekdays not in the dataset
for i in range(onehot_min, onehot_max):
if not "%s_%d" % (col, i) in dummies.columns:
dummies["%s_%d" % (col, i)] = 0
-dummies_sorted = dummies[np.sort(dummies.columns)]
+dummies_sorted = dummies[np.sort(dummies.columns)].astype("int32")
new_df = new_df.join(dummies_sorted)

# drop the original. if keep_original is False, this is unneeded: it was already removed
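A small sketch of the fixed UTC check, assuming incoming timestamps carry the stdlib UTC object:

```python
# The comparison now targets datetime.timezone.utc instead of pytz.utc,
# dropping the pytz dependency.
import datetime
import pandas as pd

timecol = pd.Series(
    pd.date_range("2021-01-01", periods=3, freq="h", tz=datetime.timezone.utc)
)
if timecol.dt.tz is not None and timecol.dt.tz != datetime.timezone.utc:
    raise ValueError("Data should be in UTC or timezone-naive")
print(timecol.dt.tz)  # UTC
```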
2 changes: 1 addition & 1 deletion sam/feature_engineering/rolling_features.py
@@ -37,7 +37,7 @@ def _nfft_helper(
# first convert time index to nanoseconds since epoch
# If a non-pandas timestamp index is used, these units aren't nanoseconds
# However, since we normalize to [-0.5, 0.5] anyway, it doesn't matter
-time = np.array(series.index.astype(int))
+time = np.array(series.index.astype("int64"))
# then make first value 0
time -= np.min(time)
# normalize to run from -0.5 to 0.5
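A short sketch of why the cast is spelled "int64": plain astype(int) resolves to a 32-bit integer on some platforms (notably Windows), which overflows for nanosecond timestamps:

```python
import numpy as np
import pandas as pd

series = pd.Series(range(3), index=pd.date_range("2021-01-01", periods=3, freq="h"))
time = np.array(series.index.astype("int64"))  # nanoseconds since the epoch
time -= np.min(time)              # shift so the first value is 0
time = time / np.max(time) - 0.5  # normalize to [-0.5, 0.5]
print(time)                       # [-0.5  0.   0.5]
```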
2 changes: 1 addition & 1 deletion sam/feature_engineering/simple_feature_engineering.py
@@ -178,7 +178,7 @@ def _get_time_features(self, X: pd.DataFrame) -> pd.DataFrame:
for value in range(comp_min, comp_max + 1):
comp_series = self._get_time_column(X, component)
colname_ = f"{colname}_{value}"
-X_out[colname_] = (comp_series == value).astype(int)
+X_out[colname_] = (comp_series == value).astype("int32")

elif type == "cyclical":
comp_series = self._get_time_column(X, component)
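A minimal sketch of the dtype pin above: boolean comparisons yield bool columns and astype(int) is platform-dependent, so the one-hot features are pinned to "int32" explicitly:

```python
import pandas as pd

hours = pd.Series([0, 1, 1, 3], name="TIME_hour")
print((hours == 1).dtype)                     # bool
print((hours == 1).astype("int32").tolist())  # [0, 1, 1, 0]
print(pd.get_dummies(pd.Series(list("aba"))).dtypes.unique())  # [dtype('bool')] since pandas 2.0
```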
7 changes: 5 additions & 2 deletions sam/feature_engineering/tests/test_decompose_datetime.py
@@ -56,6 +56,7 @@ def test_cyclicals(self):
{"TIME": daterange, "OTHER": 1, "TIME_hour": [0, 1, 2, 3, 4]},
columns=["TIME", "OTHER", "TIME_hour"],
)
+expected = expected.astype({"TIME_hour": "int32"})
assert_frame_equal(result, expected)

# add cyclical test without keeping original
@@ -90,6 +91,7 @@ def test_cyclicals(self):
},
columns=["TIME", "OTHER", "TIME_hour", "TIME_hour_sin", "TIME_hour_cos"],
)
+expected = expected.astype({"TIME_hour": "int32"})
assert_frame_equal(result, expected)

def test_incorrect_cyclical_cols(self):
@@ -175,7 +177,7 @@ def test_remove_original(self):
daterange = pd.date_range(time1, time2, freq=freq)

data = pd.DataFrame({"TIME": daterange, "OTHER": 1})
expected = pd.DataFrame({"TIME_minute": [8, 23, 38, 53, 8]})
expected = pd.DataFrame({"TIME_minute": [8, 23, 38, 53, 8]}, dtype="int32")

result = decompose_datetime(data, "TIME", ["minute"], keep_original=False)
assert_frame_equal(result, expected)
@@ -256,7 +258,7 @@ def test_recode_onehot_datetime(self):
test_dataframe = pd.DataFrame({"TIME": daterange, "OTHER": 1})

result = decompose_datetime(test_dataframe, components=["weekday"], onehots=["weekday"])

+expected_data_columns = [f"TIME_weekday_{i}" for i in range(7)]
expected = pd.DataFrame(
{
"TIME": test_dataframe["TIME"],
Expand All @@ -270,6 +272,7 @@ def test_recode_onehot_datetime(self):
"TIME_weekday_6": [0, 0, 0, 0],
}
)
+expected = expected.astype({col: "int32" for col in expected_data_columns})

assert_frame_equal(result, expected)

