From ad933b247ff4f404da04d2f5c69a7a3b2ac0223d Mon Sep 17 00:00:00 2001 From: dbrakenhoff Date: Wed, 6 Oct 2021 11:46:59 +0200 Subject: [PATCH 1/8] up dev version --- traval/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/traval/version.py b/traval/version.py index 3ced358..66a981b 100644 --- a/traval/version.py +++ b/traval/version.py @@ -1 +1 @@ -__version__ = "0.2.1" +__version__ = "0.3b.0" From 90167b2bed470e4bcc321c8c5cfd1261fbab890f Mon Sep 17 00:00:00 2001 From: dbrakenhoff Date: Tue, 22 Mar 2022 16:13:21 +0100 Subject: [PATCH 2/8] fix dtypes selection --- traval/detector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/traval/detector.py b/traval/detector.py index 75f3c3a..7127526 100755 --- a/traval/detector.py +++ b/traval/detector.py @@ -91,7 +91,7 @@ def _validate_input_series(series): dtype = series.dtypes name = series.name elif isinstance(series, pd.DataFrame): - dtype = series.dtypes[0] + dtype = series.dtypes.values[0] name = series.columns[0] else: raise TypeError( From b28cfb21d4623a2d166a55a9931190d43acdf9af Mon Sep 17 00:00:00 2001 From: dbrakenhoff Date: Thu, 7 Apr 2022 17:17:11 +0200 Subject: [PATCH 3/8] fix mpl deprecation warning --- traval/plots.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/traval/plots.py b/traval/plots.py index 945b8ed..92f8fa8 100755 --- a/traval/plots.py +++ b/traval/plots.py @@ -140,7 +140,7 @@ def plot_series_comparison(self, mark_unique=True, mark_different=True, plot_labels = [i.get_label() for i in plot_handles] ax.legend(plot_handles, plot_labels, loc="best", ncol=int(np.ceil(len(plot_handles) / 2.))) - ax.grid(b=True) + ax.grid(visible=True) fig.tight_layout() return ax @@ -272,7 +272,7 @@ def plot_validation_result(self, ax=None): plot_labels = [i.get_label() for i in plot_handles] ax.legend(plot_handles, plot_labels, loc=(0, 1), markerscale=1.25, ncol=len(plot_handles), frameon=False) - ax.grid(b=True) + ax.grid(visible=True) fig.tight_layout() return ax @@ -336,7 +336,7 @@ def roc_plot(tpr, fpr, labels, colors=None, ax=None, ax.set_xlim(0, 1) ax.set_ylim(0, 1) - ax.grid(b=True) + ax.grid(visible=True) ax.legend(loc="lower right") ax.set_ylabel("True Positive Rate (sensitivity)") ax.set_xlabel("False Positive Rate (1-specificity)") @@ -410,7 +410,7 @@ def det_plot(fpr, fnr, labels, ax=None, **kwargs): ax.set_yticks(tick_locations) ax.set_yticklabels(tick_labels) ax.set_ylim(-3, 3) - ax.grid(b=True) + ax.grid(visible=True) ax.set_title("detection error tradeoff plot") From 63b71955f52be8ca6f0ebd2fe972ed9b3b2624eb Mon Sep 17 00:00:00 2001 From: dbrakenhoff Date: Mon, 20 Jun 2022 17:33:57 +0200 Subject: [PATCH 4/8] add method to obtain corrections comparison w truth --- traval/detector.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/traval/detector.py b/traval/detector.py index 7127526..73291f7 100755 --- a/traval/detector.py +++ b/traval/detector.py @@ -458,6 +458,49 @@ def get_corrections_dataframe(self): df.columns = list(self.ruleset.rules.keys()) return df + def get_corrections_comparison(self, truth=None): + + if truth is None and self.truth is not None: + truth = self.truth + else: + raise ValueError("Supply a time series for 'truth'!") + + comments_traval = self.get_comment_series() + comments_traval.name = "traval_comment" + + mask_truth_corrections = truth.iloc[:, 0].isna() + comments_truth = truth.loc[mask_truth_corrections] + + k = list(self.comparisons.keys())[-1] + comparison = self.comparisons[k].comparison_series() + translate = { + -1: "Value modified", + 0: "Flagged in both", + 1: "Only flagged in 'truth' series", + 2: "Only flagged in 'traval' series", + -9999: "NaN in both" + } + comparison = comparison.apply(lambda v: translate[v]) + comparison.name = "comparison_label" + + raw_index = (comments_traval.index + .union(comments_truth.index)) + + truth.columns = ["truth_series", "truth_comment"] + + traval_series = self.get_final_result() + traval_series.name = "traval_series" + + df = pd.concat([ + self.series.loc[raw_index.intersection(self.series.index)], + traval_series.loc[raw_index.intersection(traval_series.index)], + comments_traval, + truth.loc[raw_index.intersection(truth.index)], + comparison.loc[raw_index.intersection(comparison.index)] + ], axis=1) + + return df + def plot_overview(self, mark_suspects=True, **kwargs): """Plot timeseries with flagged values per applied rule. From e3521c57d8480879447afa5972cd5069dcf0c820 Mon Sep 17 00:00:00 2001 From: dbrakenhoff Date: Tue, 25 Oct 2022 14:59:03 +0200 Subject: [PATCH 5/8] add Nonetype support to TravalParameter --- traval/params.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/traval/params.py b/traval/params.py index 944e006..caea7ce 100644 --- a/traval/params.py +++ b/traval/params.py @@ -162,10 +162,12 @@ def from_csv(cls, csvfile): for i, (v, t) in params.loc[:, ["value", "dtype"]].iterrows(): if t == "float": v = float(v) - if t == "int": + elif t == "int": v = int(v) - if t == "str": + elif t == "str": continue # already str + elif t == "NoneType": + v = None params.loc[i, "value"] = v params.drop(columns=['dtype'], inplace=True) parameters, defaults = cls._split_df(params) From f0b203b3c7702c6c3bccc465e29f20eaeee31729 Mon Sep 17 00:00:00 2001 From: dbrakenhoff Date: Tue, 25 Oct 2022 15:00:09 +0200 Subject: [PATCH 6/8] improve rules - add time-shift smoothing to pastas prediction interval - add minimum ci width to pastas prediction interval - black formatting --- traval/rulelib.py | 187 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 129 insertions(+), 58 deletions(-) diff --git a/traval/rulelib.py b/traval/rulelib.py index fc96927..687253c 100755 --- a/traval/rulelib.py +++ b/traval/rulelib.py @@ -4,10 +4,15 @@ import numpy as np import pandas as pd -from .ts_utils import (diff_with_gap_awareness, - interpolate_series_to_new_index, - mask_corrections_as_nan, - resample_short_series_to_long_series, spike_finder) +from .ts_utils import ( + diff_with_gap_awareness, + interpolate_series_to_new_index, + mask_corrections_as_nan, + resample_short_series_to_long_series, + smooth_lower_bound, + smooth_upper_bound, + spike_finder, +) def rule_funcdict_to_nan(series, funcdict): @@ -65,8 +70,11 @@ def rule_max_gradient(series, max_step=0.5, max_timestep="1D"): corrections. Suspect values are set to np.nan. """ conversion = pd.Timedelta(max_timestep) / pd.Timedelta("1S") - grad = (series.diff() / - series.index.to_series().diff().dt.total_seconds() * conversion) + grad = ( + series.diff() + / series.index.to_series().diff().dt.total_seconds() + * conversion + ) mask = grad.abs() > max_step return mask_corrections_as_nan(series, mask) @@ -103,8 +111,9 @@ def rule_ufunc_threshold(series, ufunc, threshold, offset=0.0): """ ufunc = ufunc[0] if isinstance(threshold, pd.Series): - full_threshold_series = \ - resample_short_series_to_long_series(threshold, series) + full_threshold_series = resample_short_series_to_long_series( + threshold, series + ) mask = ufunc(series, full_threshold_series.add(offset)) else: mask = ufunc(series, threshold + offset) @@ -216,16 +225,16 @@ def rule_spike_detection(series, threshold=0.15, spike_tol=0.15, max_gap="7D"): a series with same index as the input timeseries containing corrections. Suspect values are set to np.nan. """ - upspikes, downspikes = spike_finder(series, - threshold=threshold, - spike_tol=spike_tol, - max_gap=max_gap) + upspikes, downspikes = spike_finder( + series, threshold=threshold, spike_tol=spike_tol, max_gap=max_gap + ) mask = upspikes.index.union(downspikes.index) return mask_corrections_as_nan(series, mask) -def rule_offset_detection(series, threshold=0.15, updown_diff=0.1, - max_gap="7D", return_df=False): +def rule_offset_detection( + series, threshold=0.15, updown_diff=0.1, max_gap="7D", return_df=False +): """Detection rule, detect periods with an offset error. This rule looks for jumps in both positive and negative direction that @@ -288,16 +297,22 @@ def rule_offset_detection(series, threshold=0.15, updown_diff=0.1, index_best_match = idiff.abs().idxmin() if idiff.loc[index_best_match] <= updown_diff: periods += [i, index_best_match] - df.loc[j] = [i, index_best_match, dh, - jump_df.loc[index_best_match], - idiff.loc[index_best_match]] + df.loc[j] = [ + i, + index_best_match, + dh, + jump_df.loc[index_best_match], + idiff.loc[index_best_match], + ] j += 1 - corrections = pd.Series(index=series.index, data=np.zeros( - series.index.size), fastpath=True) + corrections = pd.Series( + index=series.index, data=np.zeros(series.index.size), fastpath=True + ) for j in range(0, len(periods), 2): - corrections.loc[periods[j]:periods[j + 1] - - pd.Timedelta(seconds=30)] = np.nan + corrections.loc[ + periods[j] : periods[j + 1] - pd.Timedelta(seconds=30) + ] = np.nan if return_df: return corrections, df else: @@ -322,8 +337,9 @@ def rule_outside_n_sigma(series, n=2.0): """ - mask = ((series > series.mean() + n * series.std()) - | (series < series.mean() - n * series.std())) + mask = (series > series.mean() + n * series.std()) | ( + series < series.mean() - n * series.std() + ) return mask_corrections_as_nan(series, mask) @@ -386,8 +402,17 @@ def rule_outside_bandwidth(series, lowerbound, upperbound): return mask_corrections_as_nan(series, mask) -def rule_pastas_outside_pi(series, ml, ci=0.95, tmin=None, tmax=None, - savedir=None, verbose=False): +def rule_pastas_outside_pi( + series, + ml, + ci=0.95, + min_ci=None, + smoothfreq=None, + tmin=None, + tmax=None, + savedir=None, + verbose=False, +): """Detection rule, flag values based on pastas model prediction interval. Flag suspect outside prediction interval calculated by pastas timeseries @@ -403,6 +428,15 @@ def rule_pastas_outside_pi(series, ml, ci=0.95, tmin=None, tmax=None, confidence interval for calculating bandwidth, by default 0.95. Higher confidence interval means that bandwidth is wider and more observations will fall within the bounds. + min_ci : float, optional + value indicating minimum distance between upper and lower + bounds, if ci does not meet this requirement, this value is added + to the bounds. This can be used to prevent extremely narrow prediction + intervals. Default is None. + smoothfreq : str, optional + str indicating no. of periods and frequency str (i.e. "1D") for + smoothing upper and lower bounds only used if smoothbounds=True, + default is None. tmin : str or pd.Timestamp, optional set tmin for model simulation tmax : str or pd.Timestamp, optional @@ -421,14 +455,16 @@ def rule_pastas_outside_pi(series, ml, ci=0.95, tmin=None, tmax=None, if verbose: print("Warning: No Pastas model found!") corrections = mask_corrections_as_nan( - series, pd.Series(index=series.index, data=False)) + series, pd.Series(index=series.index, data=False) + ) corrections.name = "sim" # no fit elif ml.fit is None: if verbose: print("Warning: Pastas model fit attribute is None!") corrections = mask_corrections_as_nan( - series, pd.Series(index=series.index, data=False)) + series, pd.Series(index=series.index, data=False) + ) corrections.name = "sim" # calculate pi else: @@ -443,15 +479,32 @@ def rule_pastas_outside_pi(series, ml, ci=0.95, tmin=None, tmax=None, # prediction interval empty if pi.empty: if verbose: - print("Warning: calculated prediction interval with " - "Pastas model is empty!") + print( + "Warning: calculated prediction interval with " + "Pastas model is empty!" + ) corrections = mask_corrections_as_nan( - series, pd.Series(index=series.index, data=False)) + series, pd.Series(index=series.index, data=False) + ) corrections.name = "sim" else: - corrections = rule_outside_bandwidth(series, - pi.iloc[:, 0], - pi.iloc[:, 1]) + lower = pi.iloc[:, 0] + upper = pi.iloc[:, 1] + + # apply time-shift smoothing + if smoothfreq is not None: + upper = smooth_upper_bound(upper, smoothfreq=smoothfreq) + lower = smooth_lower_bound(lower, smoothfreq=smoothfreq) + + # apply minimum ci if passed + if min_ci is not None: + # check if mean of current interval is smalller than ci + if (upper - lower).mean() < min_ci: + # adjust bounds with half of min_ci each + upper = upper + min_ci / 2.0 + lower = lower - min_ci / 2.0 + + corrections = rule_outside_bandwidth(series, lower, upper) corrections.name = "sim (r^2={0:.3f})".format(ml.stats.rsq()) if savedir: @@ -459,21 +512,24 @@ def rule_pastas_outside_pi(series, ml, ci=0.95, tmin=None, tmax=None, return corrections -def rule_pastas_percentile_pi(series, ml, alpha=0.1, tmin=None, tmax=None, - verbose=False): +def rule_pastas_percentile_pi( + series, ml, alpha=0.1, tmin=None, tmax=None, verbose=False +): # no model if ml is None: if verbose: print("Warning: No Pastas model found!") corrections = mask_corrections_as_nan( - series, pd.Series(index=series.index, data=False)) + series, pd.Series(index=series.index, data=False) + ) corrections.name = "sim" # no fit elif ml.fit is None: if verbose: print("Warning: Pastas model fit attribute is None!") corrections = mask_corrections_as_nan( - series, pd.Series(index=series.index, data=False)) + series, pd.Series(index=series.index, data=False) + ) corrections.name = "sim" # calculate realizations @@ -522,8 +578,9 @@ def rule_keep_comments(series, keep_comments, comment_series, other_series): return corrections -def rule_shift_to_manual_obs(series, hseries, method="linear", - max_dt="1D", reset_dates=None): +def rule_shift_to_manual_obs( + series, hseries, method="linear", max_dt="1D", reset_dates=None +): """Adjustment rule, for shifting timeseries onto manual observations. Used for shifting timeseries based on sensor observations onto manual @@ -557,14 +614,18 @@ def rule_shift_to_manual_obs(series, hseries, method="linear", # check if time between manual obs and sensor obs # are further apart than max_dt: nearest = hseries.index.map( - lambda t: series.index.get_loc(t, method="nearest")) - mask = np.abs((series.index[nearest] - hseries.index).total_seconds() - ) <= (pd.Timedelta(max_dt) / pd.Timedelta("1S")) + lambda t: series.index.get_loc(t, method="nearest") + ) + mask = np.abs((series.index[nearest] - hseries.index).total_seconds()) <= ( + pd.Timedelta(max_dt) / pd.Timedelta("1S") + ) # interpolate raw obs to manual obs times - s_obs = series.reindex(series.index.join( - hseries.index, how="outer")).interpolate( - method="time").loc[hseries.index] + s_obs = ( + series.reindex(series.index.join(hseries.index, how="outer")) + .interpolate(method="time") + .loc[hseries.index] + ) # calculate diff diff = s_obs - hseries @@ -578,9 +639,13 @@ def rule_shift_to_manual_obs(series, hseries, method="linear", # interpolate w/ method if method == "linear": - diff_full_index = diff.reindex(series.index.join( - diff.index, how="outer"), method=None).interpolate( - method="linear").fillna(0.0) + diff_full_index = ( + diff.reindex( + series.index.join(diff.index, how="outer"), method=None + ) + .interpolate(method="linear") + .fillna(0.0) + ) else: diff_full_index = diff.reindex(series.index, method=method).fillna(0.0) @@ -614,7 +679,7 @@ def rule_combine_nan_and(*args): """Combination rule, combine NaN values for any number of timeseries. Used for combining intermediate results in branching algorithm trees to - create one final result, i.e. , i.e. (s1.isna() AND s2.isna()) + create one final result, i.e. (s1.isna() AND s2.isna()) Returns ------- @@ -633,8 +698,16 @@ def rule_combine_nan_and(*args): return result -def rule_flat_signal(series, window, min_obs, std_threshold=7.5e-3, - qbelow=None, qabove=None, hbelow=None, habove=None): +def rule_flat_signal( + series, + window, + min_obs, + std_threshold=7.5e-3, + qbelow=None, + qabove=None, + hbelow=None, + habove=None, +): """Detection rule, flag values based on dead signal in rolling window. Flag values when variation in signal within a window falls below a @@ -675,17 +748,15 @@ def rule_flat_signal(series, window, min_obs, std_threshold=7.5e-3, or dead. """ s = series.dropna() - stdfilt = s.dropna().rolling( - f'{int(window)}D', min_periods=min_obs).std() - stdmask = (stdfilt < std_threshold) + stdfilt = s.dropna().rolling(f"{int(window)}D", min_periods=min_obs).std() + stdmask = stdfilt < std_threshold if qabove is None and qbelow is not None: quantilemask = s < s.quantile(qbelow) elif qabove is not None and qbelow is None: quantilemask = s > s.quantile(qabove) elif qabove is not None and qbelow is not None: - quantilemask = ((s > s.quantile(qabove)) | - (s < s.quantile(qbelow))) + quantilemask = (s > s.quantile(qabove)) | (s < s.quantile(qbelow)) else: quantilemask = pd.Series(index=s.index, data=True, dtype=bool) @@ -694,11 +765,11 @@ def rule_flat_signal(series, window, min_obs, std_threshold=7.5e-3, elif habove is not None and hbelow is None: levelmask = s > habove elif habove is not None and hbelow is not None: - levelmask = ((s > habove) | (s < hbelow)) + levelmask = (s > habove) | (s < hbelow) else: levelmask = pd.Series(index=s.index, data=True, dtype=bool) - mask = (stdmask & quantilemask & levelmask) + mask = stdmask & quantilemask & levelmask mask = mask.reindex(series.index, fill_value=False) return mask_corrections_as_nan(series, mask) From 3f6299a5f4f8e0c3f8a21e50c78a2a1bcc3ca1b4 Mon Sep 17 00:00:00 2001 From: dbrakenhoff Date: Tue, 25 Oct 2022 15:00:39 +0200 Subject: [PATCH 7/8] add methods for time shift smoothing of bounds black formatting --- traval/ts_utils.py | 57 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 15 deletions(-) diff --git a/traval/ts_utils.py b/traval/ts_utils.py index 074c06a..5d62271 100644 --- a/traval/ts_utils.py +++ b/traval/ts_utils.py @@ -19,8 +19,12 @@ def mask_corrections_as_nan(series, mask): c : pd.Series return corrections series """ - c = pd.Series(index=series.index, data=np.zeros(series.index.size), - fastpath=True, dtype=float) + c = pd.Series( + index=series.index, + data=np.zeros(series.index.size), + fastpath=True, + dtype=float, + ) c.loc[mask] = np.nan return c @@ -51,9 +55,7 @@ def resample_short_series_to_long_series(short_series, long_series): first_date_after = long_series.loc[mask].index[0] new_series.loc[first_date_after] = short_series.iloc[i] - new_series = (new_series - .fillna(method="ffill") - .fillna(method="bfill")) + new_series = new_series.fillna(method="ffill").fillna(method="bfill") return new_series @@ -130,10 +132,16 @@ def spike_finder(series, threshold=0.15, spike_tol=0.15, max_gap="7D"): # Mask spikes to only include large ones # use spike moments from above and check whether # jump in head is larger than threshold. - upspikes = diff.loc[spike_up.dropna().index].where( - lambda s: s > threshold).dropna() - downspikes = diff.loc[spike_down.dropna().index].where( - lambda s: s < -threshold).dropna() + upspikes = ( + diff.loc[spike_up.dropna().index] + .where(lambda s: s > threshold) + .dropna() + ) + downspikes = ( + diff.loc[spike_down.dropna().index] + .where(lambda s: s < -threshold) + .dropna() + ) return upspikes, downspikes @@ -178,10 +186,10 @@ def interpolate_series_to_new_index(series, new_index): new series with new index, with interpolated values """ # interpolate to new index - s_interp = np.interp(new_index, series.index.asi8, series.values, - left=np.nan, right=np.nan) - si = pd.Series(index=new_index, data=s_interp, - dtype=float, fastpath=True) + s_interp = np.interp( + new_index, series.index.asi8, series.values, left=np.nan, right=np.nan + ) + si = pd.Series(index=new_index, data=s_interp, dtype=float, fastpath=True) return si @@ -247,7 +255,26 @@ def create_synthetic_raw_timeseries(raw_series, truth_series, comments): # create synthetic raw series synth_raw = truth_series.loc[idx_in_both].copy() - synth_raw.loc[mask_comments] = \ - raw_series.loc[idx_in_both].loc[mask_comments] + synth_raw.loc[mask_comments] = raw_series.loc[idx_in_both].loc[ + mask_comments + ] return synth_raw + + +def shift_series_forward_backward(s, freqstr="1D"): + n = int(freqstr[:-1]) if freqstr[:-1].isnumeric() else 1 + freq = freqstr[-1] if freqstr[:-1].isalpha() else "D" + shift_forward = s.shift(periods=n, freq=freq) + shift_backward = s.shift(periods=-n, freq=freq) + return pd.concat([shift_backward, s, shift_forward], axis=1) + + +def smooth_upper_bound(b, smoothfreq="1D"): + smoother = shift_series_forward_backward(b, freqstr=smoothfreq) + return smoother.max(axis=1) + + +def smooth_lower_bound(b, smoothfreq="1D"): + smoother = shift_series_forward_backward(b, freqstr=smoothfreq) + return smoother.min(axis=1) From 3d810f1cb282a9358f74f6de99d4913c6aa8d07f Mon Sep 17 00:00:00 2001 From: dbrakenhoff Date: Tue, 25 Oct 2022 15:00:56 +0200 Subject: [PATCH 8/8] up version --- traval/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/traval/version.py b/traval/version.py index 66a981b..493f741 100644 --- a/traval/version.py +++ b/traval/version.py @@ -1 +1 @@ -__version__ = "0.3b.0" +__version__ = "0.3.0"