linear fit for slotgemiddelde (#157)

* linear fit for slotgemiddelde (was default with nodal) * simplified slotgemiddelde code * remove with_AR option * updated tests * updated docstring
Deltares-research · Oct 15, 2024 · 10cf370 · 10cf370
1 parent 87b8820
commit 10cf370
Show file tree

Hide file tree

Showing 3 changed files with 54 additions and 60 deletions.
diff --git a/docs/whats-new.md b/docs/whats-new.md
@@ -4,6 +4,7 @@
 
 ### Feat
 - expanded physical_break_dict in [#151](https://github.com/Deltares-research/kenmerkendewaarden/pull/151)
+- linear fit for slotgemiddelden (no nodal) in [#157](https://github.com/Deltares-research/kenmerkendewaarden/pull/157)
 
 
 ## 0.3.0 (2024-10-01)

diff --git a/kenmerkendewaarden/slotgemiddelden.py b/kenmerkendewaarden/slotgemiddelden.py
@@ -27,10 +27,13 @@ def calc_slotgemiddelden(
     df_ext: pd.DataFrame = None,
     min_coverage: float = None,
     clip_physical_break: bool = False,
-    with_nodal: bool = True,
 ):
     """
     Compute slotgemiddelden from measurement timeseries and optionally also from extremes timeseries.
+    A simple linear trend is used to avoid all pretend-accuracy. However, when fitting a
+    linear trend on a limited amount of data, the nodal cycle and wind effects will cause
+    the model fit to be inaccurate. It is wise to use at least 30 years of data for 
+    a valid fit, this is >1.5 times the nodal cycle.
 
     Parameters
     ----------
@@ -42,8 +45,6 @@ def calc_slotgemiddelden(
         Set yearly means to nans for years that do not have sufficient data coverage. The default is None.
     clip_physical_break : bool, optional
         Whether to exclude the part of the timeseries before physical breaks like estuary closures. The default is False.
-    with_nodal : bool, optional
-        Whether to include a nodal cycle in the linear trend model. The default is True.
 
     Returns
     -------
@@ -71,7 +72,7 @@ def calc_slotgemiddelden(
         wl_mean_peryear = clip_timeseries_physical_break(wl_mean_peryear)
 
     # fit linear models over yearly mean values
-    pred_pd_wl = fit_models(wl_mean_peryear, with_nodal=with_nodal)
+    pred_pd_wl = predict_linear_model(wl_mean_peryear)
     slotgemiddelden_dict["wl_model_fit"] = pred_pd_wl
 
     if df_ext is not None:
@@ -103,9 +104,9 @@ def calc_slotgemiddelden(
             tidalrange_mean_peryear = clip_timeseries_physical_break(tidalrange_mean_peryear)
 
         # fit linear models over yearly mean values
-        pred_pd_HW = fit_models(HW_mean_peryear, with_nodal=with_nodal)
-        pred_pd_LW = fit_models(LW_mean_peryear, with_nodal=with_nodal)
-        pred_pd_tidalrange = fit_models(tidalrange_mean_peryear, with_nodal=with_nodal)
+        pred_pd_HW = predict_linear_model(HW_mean_peryear)
+        pred_pd_LW = predict_linear_model(LW_mean_peryear)
+        pred_pd_tidalrange = predict_linear_model(tidalrange_mean_peryear)
         slotgemiddelden_dict["HW_model_fit"] = pred_pd_HW
         slotgemiddelden_dict["LW_model_fit"] = pred_pd_LW
         slotgemiddelden_dict["tidalrange_model_fit"] = pred_pd_tidalrange
@@ -210,13 +211,13 @@ def plot_slotgemiddelden(
     return fig, ax
 
 
-def fit_models(mean_array_todate: pd.Series, with_nodal=True) -> pd.DataFrame:
+def predict_linear_model(ser: pd.Series, with_nodal=False) -> pd.DataFrame:
     """
     Fit linear model over yearly means in mean_array_todate, including five years in the future.
 
     Parameters
     ----------
-    mean_array_todate : pd.Series
+    ser : pd.Series
         DESCRIPTION.
 
     Returns
@@ -226,61 +227,52 @@ def fit_models(mean_array_todate: pd.Series, with_nodal=True) -> pd.DataFrame:
 
     """
 
-    start = mean_array_todate.index.min()
-    end = mean_array_todate.index.max() + 1
+    start = ser.index.min()
+    end = ser.index.max() + 1
 
-    logger.info(f"fit linear model (with_nodal={with_nodal}) for {start} to {end}")
+    logger.info(f"fit linear model for {start} to {end}")
 
-    # We'll just use the years. This assumes that annual waterlevels are used that are stored left-padded, the mean waterlevel for 2020 is stored as 2020-1-1. This is not logical, but common practice.
+    # generate contiguous timeseries including gaps and including slotgemiddelde year
     allyears_dt = pd.period_range(start=start, end=end)
-    mean_array_allyears = pd.Series(mean_array_todate, index=allyears_dt)
+    ser_allyears = pd.Series(ser, index=allyears_dt)
 
-    mean_array_allyears_nonans = mean_array_allyears.loc[~mean_array_allyears.isnull()]
-    if len(mean_array_allyears_nonans) < 2:
+    ser_nonans = ser_allyears.loc[~ser_allyears.isnull()]
+    if len(ser_nonans) < 2:
         raise ValueError(
-            f"nan-filtered timeseries has only one timestep, cannot perform model fit:\n{mean_array_allyears_nonans}"
+            f"nan-filtered timeseries has only one timestep, cannot perform model fit:\n{ser_nonans}"
         )
 
-    # convert to dataframe of expected format
-    # TODO: make functions accept mean_array instead of df as argument?
-    df = pd.DataFrame(
-        {"year": mean_array_allyears.index.year, "height": mean_array_allyears.values}
-    )
-
-    # below methods are copied from https://github.com/openearth/sealevel/blob/master/slr/slr/models.py
-    # TODO: install slr package as dependency or keep separate?
-    fit, _, X = linear_model(df, with_wind=False, with_nodal=with_nodal)
-    pred_linear = fit.predict(X)
+    # get model fit
+    fit, _, X = fit_linear_model(ser_allyears, with_nodal=with_nodal)
 
-    linear_fit = pd.Series(pred_linear, index=allyears_dt, name="values")
-    linear_fit.index.name = mean_array_todate.index.name
-    return linear_fit
+    # predict
+    pred_arr = fit.predict(X)
+    pred_pd = pd.Series(pred_arr, index=allyears_dt, name="values")
+    pred_pd.index.name = ser.index.name
+    return pred_pd
 
 
-# copied from https://github.com/openearth/sealevel/blob/master/slr/slr/models.py
-def linear_model(df, with_wind=True, with_ar=True, with_nodal=True, quantity="height"):
-    """Define the linear model with optional wind and autoregression.
-    See the latest report for a detailed description.
+def fit_linear_model(df, with_nodal=False):
+    """
+    Define the linear model with constant and trend, optionally with nodal.
+    simplified from https://github.com/openearth/sealevel/blob/master/slr/slr/models.py
     """
 
-    y = df[quantity]
-    X = np.c_[df["year"] - 1970,]
-    # month = np.mod(df['year'], 1) * 12.0
+    # just use the years. This assumes that annual waterlevels are used that are 
+    # stored left-padded, the mean waterlevel for 2020 is stored as 2020-1-1
+    # This is not logical, but common practice.
+    years = df.index.year
+    y = df.values
+    X = np.c_[years - 1970,]
     names = ["Constant", "Trend"]
     if with_nodal:
         X = np.c_[
             X,
-            np.cos(2 * np.pi * (df["year"] - 1970) / 18.613),
-            np.sin(2 * np.pi * (df["year"] - 1970) / 18.613),
+            np.cos(2 * np.pi * (years - 1970) / 18.613),
+            np.sin(2 * np.pi * (years - 1970) / 18.613),
         ]
         names.extend(["Nodal U", "Nodal V"])
-    if with_wind:
-        X = np.c_[X, df["u2"], df["v2"]]
-        names.extend(["Wind $u^2$", "Wind $v^2$"])
     X = sm.add_constant(X)
-    if with_ar:
-        model = sm.GLSAR(y, X, missing="drop", rho=1)
-    else:
-        model = sm.OLS(y, X, missing="drop")
+    model = sm.OLS(y, X, missing="drop")
     fit = model.fit(cov_type="HC0")
     return fit, names, X
diff --git a/tests/test_slotgemiddelden.py b/tests/test_slotgemiddelden.py
@@ -21,29 +21,30 @@ def test_calc_slotgemiddelden_outputtype(df_meas_2010_2014, df_ext_12_2010_2014)
 
 
 @pytest.mark.unittest
-def test_fit_models(df_meas_2010_2014):
+def test_predict_linear_model(df_meas_2010_2014):
     dict_wltidalindicators_valid = kw.calc_wltidalindicators(
         df_meas_2010_2014
     )  # 24*365=8760 (hourly interval), 24/3*365=2920 (3-hourly interval)
     wl_mean_peryear_valid = dict_wltidalindicators_valid["wl_mean_peryear"]
 
-    wl_model_fit_nodal = kw.slotgemiddelden.fit_models(
+    wl_model_fit_nodal = kw.slotgemiddelden.predict_linear_model(
         wl_mean_peryear_valid, with_nodal=True
     )
     nodal_expected = np.array(
-        [0.0141927, 0.08612119, 0.0853051, 0.07010864, 0.10051922, 0.23137634]
+        [0.07860955, 0.08999961, 0.07954378, 0.07398706, 0.09952146, 0.17882958]
     )
     assert np.allclose(wl_model_fit_nodal.values, nodal_expected)
-
-    wl_model_fit_linear = kw.slotgemiddelden.fit_models(
+    
+    wl_model_fit_linear = kw.slotgemiddelden.predict_linear_model(
         wl_mean_peryear_valid, with_nodal=False
     )
     linear_expected = np.array(
-        [0.07851414, 0.0813139, 0.08411366, 0.08691342, 0.08971318, 0.09251294]
+        [0.07917004, 0.08175116, 0.08433229, 0.08691342, 0.08949454, 0.09207567]
     )
     assert np.allclose(wl_model_fit_linear.values, linear_expected)
 
 
+
 @pytest.mark.unittest
 def test_calc_slotgemiddelden(df_meas_2010_2014, df_ext_12_2010_2014):
     slotgemiddelden_dict_inclext = kw.calc_slotgemiddelden(
@@ -94,14 +95,14 @@ def test_calc_slotgemiddelden(df_meas_2010_2014, df_ext_12_2010_2014):
     )
 
     # fmt: off
-    wl_model_fit_expected = np.array([0.0141927, 0.08612119, 0.0853051,
-                                      0.07010864, 0.10051922, 0.23137634])
-    hw_model_fit_expected = np.array([1.05295416, 1.12875177, 1.13988685,
-                                      1.1415461, 1.18998584, 1.336182])
-    lw_model_fit_expected = np.array([-0.67420399, -0.59089362, -0.59342291,
-                                      -0.61334278, -0.58024113, -0.42969074])
-    range_model_fit_expected = np.array([1.72715816, 1.71964539, 1.73330976,
-                                         1.75488888, 1.77022697, 1.76587273])
+    wl_model_fit_expected = np.array([0.07917004, 0.08175116, 0.08433229,
+                                      0.08691342, 0.08949454, 0.09207567])
+    hw_model_fit_expected = np.array([1.12529394, 1.13663287, 1.14797179,
+                                      1.15931071, 1.17064963, 1.18198856])
+    lw_model_fit_expected = np.array([-0.60236402, -0.59953375, -0.59670349,
+                                      -0.59387323, -0.59104297, -0.58821271])
+    range_model_fit_expected = np.array([1.72765796, 1.73616662, 1.74467528,
+                                         1.75318394, 1.7616926, 1.77020126])
     # fmt: on
     assert np.allclose(
         slotgemiddelden_dict_inclext["wl_model_fit"].values, wl_model_fit_expected