diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 55800ae..31e010f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -11,31 +11,60 @@ on:
- master
jobs:
- build:
+ lint:
+ name: Lint
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout repo
+ uses: actions/checkout@v4
+
+ - name: Setup Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: "3.9"  # quoted so YAML keeps the version a string, like "3.10" in the test matrix
+ cache: "pip"
+ cache-dependency-path: pyproject.toml
+
+ - name: Install Python dependencies
+ run: |
+ pip install .
+
+ - name: ruff-lint
+ uses: chartboost/ruff-action@v1
+
+ - name: ruff-format
+ uses: chartboost/ruff-action@v1
+ with:
+ args: "format --check"
+
+ test:
+ needs: lint
+ name: Test
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: [3.9, "3.10", "3.11"]
+ python-version: ["3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@v4
+
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
+ cache: "pip"
+ cache-dependency-path: pyproject.toml
+
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -e .[ci]
- - name: Lint with flake8
- run: |
- # stop the build if there are Python syntax errors or undefined names
- flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
- # exit-zero treats all errors as warnings.
- flake8 . --count --exit-zero --max-complexity=10 --max-line-length=80 --statistics
+
+
- name: Run pytest
run: |
pytest
+
- name: Run codacy-coverage-reporter
uses: codacy/codacy-coverage-reporter-action@master
with:
diff --git a/.readthedocs.yml b/.readthedocs.yml
index eaf0e55..3a374e9 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -8,15 +8,12 @@ version: 2
build:
os: ubuntu-22.04
tools:
- python: "3.9"
+ python: "3.11"
# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: docs/conf.py
-# Optionally build your docs in additional formats such as PDF and ePub
-formats: all
-
# Optionally set the version of Python and requirements required to build your docs
python:
install:
diff --git a/docs/conf.py b/docs/conf.py
index ec09b5c..833df79 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -13,15 +13,14 @@
import os
import sys
-sys.path.insert(0, os.path.abspath('.'))
+sys.path.insert(0, os.path.abspath("."))
from traval import __version__
-
# -- Project information -----------------------------------------------------
-project = 'traval'
-copyright = '2021, Artesia'
-author = 'Artesia'
+project = "traval"
+copyright = "2024, Artesia"
+author = "Artesia"
# The short X.Y version
version = __version__
@@ -34,29 +33,29 @@
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
- 'sphinx.ext.autodoc',
- 'sphinx.ext.autosummary',
- 'sphinx.ext.napoleon',
- 'sphinx.ext.doctest',
- 'sphinx.ext.intersphinx',
- 'sphinx.ext.todo',
- 'sphinx.ext.coverage',
- 'sphinx.ext.mathjax',
- 'sphinx.ext.ifconfig',
- 'sphinx.ext.viewcode',
- 'IPython.sphinxext.ipython_console_highlighting', # lowercase didn't work
- 'sphinx.ext.autosectionlabel',
- 'nbsphinx',
- 'nbsphinx_link'
+ "sphinx.ext.autodoc",
+ "sphinx.ext.autosummary",
+ "sphinx.ext.napoleon",
+ "sphinx.ext.doctest",
+ "sphinx.ext.intersphinx",
+ "sphinx.ext.todo",
+ "sphinx.ext.coverage",
+ "sphinx.ext.mathjax",
+ "sphinx.ext.ifconfig",
+ "sphinx.ext.viewcode",
+ "IPython.sphinxext.ipython_console_highlighting", # lowercase didn't work
+ "sphinx.ext.autosectionlabel",
+ "nbsphinx",
+ "nbsphinx_link",
]
# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
# -- Options for HTML output -------------------------------------------------
@@ -64,24 +63,24 @@
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
-html_theme = 'sphinx_rtd_theme'
+html_theme = "sphinx_rtd_theme"
html_theme_options = {
- 'display_version': True,
- 'prev_next_buttons_location': 'bottom',
+ "display_version": True,
+ "prev_next_buttons_location": "bottom",
# 'style_external_links': False,
# 'vcs_pageview_mode': '',
# 'style_nav_header_background': 'white',
# Toc options
- 'collapse_navigation': False,
- 'sticky_navigation': False,
- 'navigation_depth': 4,
- 'includehidden': True,
- 'titles_only': False,
+ "collapse_navigation": False,
+ "sticky_navigation": False,
+ "navigation_depth": 4,
+ "includehidden": True,
+ "titles_only": False,
"github_url": "https://github.com/ArtesiaWater/traval",
}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
diff --git a/docs/examples.rst b/docs/examples.rst
index d10fa86..fc7aba7 100644
--- a/docs/examples.rst
+++ b/docs/examples.rst
@@ -3,8 +3,12 @@ Examples
The following notebooks contain examples showcasing traval.
-The first example shows how to apply the tools contained in traval to detect errors in a single timeseries.
-The second example shows how the same can be done for a full dataset with lots of timeseries.
+- The first example shows how to apply the tools contained in traval to detect errors in
+ a single time series.
+- The second example shows how the same can be done for a full
+ dataset with lots of time series.
+- The third notebook contains small examples for each of the error detection rules
+ contained in traval.
.. toctree::
:maxdepth: 1
diff --git a/docs/examples/ex03_rules.nblink b/docs/examples/ex03_rules.nblink
new file mode 100644
index 0000000..cb2b3cb
--- /dev/null
+++ b/docs/examples/ex03_rules.nblink
@@ -0,0 +1,3 @@
+{
+ "path": "../../examples/notebooks/ex03_testing_rules.ipynb"
+}
diff --git a/docs/getting_started.rst b/docs/getting_started.rst
index 25d695e..5b8ed3e 100644
--- a/docs/getting_started.rst
+++ b/docs/getting_started.rst
@@ -5,11 +5,8 @@ Getting Started
Installation
------------
-To install traval, a working version of Python 3.7 or 3.8 has to be installed on
-your computer. We recommend using the Anaconda Distribution with Python 3.7 as
-it includes most of the python package dependencies and the Jupyter Notebook
-software to run the notebooks. However, you are free to install any
-Python distribution you want.
+To install traval, a working version of Python 3.9 or higher has to be installed on
+your computer.
To install traval, use:
@@ -71,10 +68,10 @@ Take a look at the ruleset by just typing `ruleset`:
1: rule1 0
-Next define a Detector object. This object is designed to store a timeseries
+Next define a Detector object. This object is designed to store a time series
and the intermediate and final results after applying an error detection
-algorithm. Initialize the Detector object with some timeseries. In this example
-we assume there is a timeseries called `raw_series`:
+algorithm. Initialize the Detector object with some time series. In this example
+we assume there is a time series called `raw_series`:
.. code:: python
@@ -82,7 +79,7 @@ we assume there is a timeseries called `raw_series`:
detect = traval.Detector(raw_series)
-Apply our first algorithm to the timeseries.
+Apply our first algorithm to the time series.
.. code:: python
diff --git a/docs/index.rst b/docs/index.rst
index 6d602ad..801d02e 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -6,20 +6,20 @@
Welcome to traval's documentation!
==================================
-Python package for applying automatic error detection algorithms to timeseries.
+Python package for applying automatic error detection algorithms to time series.
This module is set up to provide tools for applying any error detection
-algorithm to any timeseries. The module consists of three main components:
+algorithm to any time series. The module consists of three main components:
-- `Detector`: a data management object for storing timeseries and error detection results.
+- `Detector`: a data management object for storing time series and error detection results.
- `RuleSet`: the RuleSet object is a highly flexible object for defining error detection algorithms based on (user-defined) functions.
-- `SeriesComparison*`: objects for comparing timeseries. These objects include plots for visualizing the comparisons.
+- `SeriesComparison*`: objects for comparing time series. These objects include plots for visualizing the comparisons.
The general workflow consists of the following steps:
1. Define error detection algorithm(s).
-2. Load data, i.e. raw timeseries data and optionally timeseries representing the "truth" to see how well the algorithms perform.
-3. Initialize Detector objects and apply algorithms to timeseries.
+2. Load data, i.e. raw time series data and optionally time series representing the "truth" to see how well the algorithms perform.
+3. Initialize Detector objects and apply algorithms to time series.
4. Store and analyze the results.
For more detailed information and examples, please refer to the notebooks in
diff --git a/docs/modules.rst b/docs/modules.rst
index 85a1eaa..ae111b2 100644
--- a/docs/modules.rst
+++ b/docs/modules.rst
@@ -25,15 +25,15 @@ Rule Library
:members:
-Timeseries Comparison
----------------------
+Time Series Comparison
+----------------------
.. automodule:: traval.ts_comparison
:members:
-Timeseries Utilities
---------------------
+Time Series Utilities
+---------------------
.. automodule:: traval.ts_utils
:members:
diff --git a/examples/notebooks/ex01_applying_automatic_error_detection_algorithms_to_a_timeseries.ipynb b/examples/notebooks/ex01_applying_automatic_error_detection_algorithms_to_a_timeseries.ipynb
index fd40799..a3e0e6f 100644
--- a/examples/notebooks/ex01_applying_automatic_error_detection_algorithms_to_a_timeseries.ipynb
+++ b/examples/notebooks/ex01_applying_automatic_error_detection_algorithms_to_a_timeseries.ipynb
@@ -4,12 +4,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Example 1: Applying an automatic error detection algorithm to a timeseries\n",
+ "# Example 1: Applying an automatic error detection algorithm to a time series\n",
"_Created by Davíd Brakenhoff, Artesia, May 2020_\n",
"\n",
"
\n",
"\n",
- "This notebook contains a simple example how to set up an automatic error detection algorithm based on a few simple rules and applies those rules to a groundwater timeseries.\n",
+ "This notebook contains a simple example of how to set up an automatic error detection algorithm based on a few simple rules and applies those rules to a groundwater time series.\n",
"\n",
"First import the requisite packages:"
]
@@ -21,6 +21,7 @@
"outputs": [],
"source": [
"import os\n",
+ "\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
@@ -86,7 +87,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -124,7 +125,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -138,7 +139,7 @@
" 4: combine (1, 2, 3)"
]
},
- "execution_count": 5,
+ "execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -159,7 +160,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -177,7 +178,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -186,14 +187,6 @@
"text": [
"RuleSet written to file: 'test.json'\n"
]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/home/david/Github/traval/traval/ruleset.py:436: UserWarning: Custom functions will not be preserved when storing RuleSet as JSON file!\n",
- " warnings.warn(msg)\n"
- ]
}
],
"source": [
@@ -210,7 +203,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
@@ -229,7 +222,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -238,7 +231,7 @@
"Detector: "
]
},
- "execution_count": 9,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -257,7 +250,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@@ -273,19 +266,17 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
- "image/png": "\n",
+ "image/png": "",
"text/plain": [
- "
"
+ ],
+ "text/plain": [
+ " correction_code series_values comparison_values\n",
+ "2020-01-01 99 NaN NaN\n",
+ "2020-01-02 0 NaN NaN\n",
+ "2020-01-03 0 NaN NaN\n",
+ "2020-01-04 0 NaN NaN\n",
+ "2020-01-05 0 NaN NaN\n",
+ "2020-01-06 0 NaN NaN\n",
+ "2020-01-07 0 NaN NaN\n",
+ "2020-01-08 0 NaN NaN\n",
+ "2020-01-09 0 NaN NaN\n",
+ "2020-01-10 99 NaN NaN"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# rule_combine_corrections_or\n",
+ "date_range = pd.date_range(\"2020-01-01\", freq=\"D\", periods=10)\n",
+ "s1 = pd.DataFrame(index=date_range, columns=[\"correction_code\"], data=0)\n",
+ "s2 = s1.copy()\n",
+ "s1.iloc[0] = 99\n",
+ "s2.iloc[-1] = -2\n",
+ "c11 = rlib.rule_combine_corrections_or(s1, s2)\n",
+ "assert (c11[\"correction_code\"] == 99).sum() == 2\n",
+ "c11"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## `rule_combine_corrections_and`\n",
+ "\n",
+ "Rule for combining results of any number of other rules. Observations are suspect if\n",
+ "ALL rules flag an observation as suspect."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
correction_code
\n",
+ "
series_values
\n",
+ "
comparison_values
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
2020-01-01
\n",
+ "
0
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-02
\n",
+ "
99
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-03
\n",
+ "
0
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-04
\n",
+ "
0
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-05
\n",
+ "
0
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-06
\n",
+ "
0
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-07
\n",
+ "
0
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-08
\n",
+ "
0
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-09
\n",
+ "
0
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-10
\n",
+ "
0
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " correction_code series_values comparison_values\n",
+ "2020-01-01 0 NaN NaN\n",
+ "2020-01-02 99 NaN NaN\n",
+ "2020-01-03 0 NaN NaN\n",
+ "2020-01-04 0 NaN NaN\n",
+ "2020-01-05 0 NaN NaN\n",
+ "2020-01-06 0 NaN NaN\n",
+ "2020-01-07 0 NaN NaN\n",
+ "2020-01-08 0 NaN NaN\n",
+ "2020-01-09 0 NaN NaN\n",
+ "2020-01-10 0 NaN NaN"
+ ]
+ },
+ "execution_count": 55,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# rule_combine_corrections_and\n",
+ "date_range = pd.date_range(\"2020-01-01\", freq=\"D\", periods=10)\n",
+ "s1 = pd.DataFrame(index=date_range, columns=[\"correction_code\"], data=0)\n",
+ "s2 = s1.copy()\n",
+ "s1.iloc[0:2] = 99\n",
+ "s2.iloc[1:3] = -2\n",
+ "c12 = rlib.rule_combine_corrections_and(s1, s2)\n",
+ "assert (c12[\"correction_code\"] == 99).sum() == 1\n",
+ "c12"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## `rule_funcdict`\n",
+ "\n",
+ "Rule that takes a dictionary of functions and applies those iteratively to the original\n",
+ "time series. Observations are suspect if any rule flags an observation as suspect."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
correction_code
\n",
+ "
series_values
\n",
+ "
comparison_values
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
2020-01-01
\n",
+ "
99
\n",
+ "
0.0
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-02
\n",
+ "
99
\n",
+ "
1.0
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-03
\n",
+ "
99
\n",
+ "
2.0
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-04
\n",
+ "
0
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-05
\n",
+ "
0
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-06
\n",
+ "
0
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-07
\n",
+ "
0
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-08
\n",
+ "
0
\n",
+ "
NaN
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-09
\n",
+ "
99
\n",
+ "
8.0
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-10
\n",
+ "
99
\n",
+ "
9.0
\n",
+ "
NaN
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " correction_code series_values comparison_values\n",
+ "2020-01-01 99 0.0 NaN\n",
+ "2020-01-02 99 1.0 NaN\n",
+ "2020-01-03 99 2.0 NaN\n",
+ "2020-01-04 0 NaN NaN\n",
+ "2020-01-05 0 NaN NaN\n",
+ "2020-01-06 0 NaN NaN\n",
+ "2020-01-07 0 NaN NaN\n",
+ "2020-01-08 0 NaN NaN\n",
+ "2020-01-09 99 8.0 NaN\n",
+ "2020-01-10 99 9.0 NaN"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# rule_funcdict\n",
+ "date_range = pd.date_range(\"2020-01-01\", freq=\"D\", periods=10)\n",
+ "s1 = pd.Series(index=date_range, data=np.arange(10))\n",
+ "fdict = {\"lt_3\": lambda s: s < 3.0, \"gt_7\": lambda s: s > 7.0}\n",
+ "c13 = rlib.rule_funcdict(s1, fdict)\n",
+ "assert (c13[\"correction_code\"] == 99).sum() == 5\n",
+ "c13"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## `rule_keep_comments`\n",
+ "\n",
+ "Rule that keeps observations that have some comment associated with them. Can be used to\n",
+ "filter validated time series comments to obtain specific observations."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
correction_code
\n",
+ "
series_values
\n",
+ "
comparison_values
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
2020-01-01
\n",
+ "
99
\n",
+ "
0.0
\n",
+ "
keep
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-02
\n",
+ "
99
\n",
+ "
1.0
\n",
+ "
keep
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-03
\n",
+ "
99
\n",
+ "
2.0
\n",
+ "
keep
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-04
\n",
+ "
99
\n",
+ "
3.0
\n",
+ "
keep
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-05
\n",
+ "
0
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-06
\n",
+ "
0
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-07
\n",
+ "
0
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-08
\n",
+ "
0
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-09
\n",
+ "
0
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
2020-01-10
\n",
+ "
0
\n",
+ "
NaN
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " correction_code series_values comparison_values\n",
+ "2020-01-01 99 0.0 keep\n",
+ "2020-01-02 99 1.0 keep\n",
+ "2020-01-03 99 2.0 keep\n",
+ "2020-01-04 99 3.0 keep\n",
+ "2020-01-05 0 NaN \n",
+ "2020-01-06 0 NaN \n",
+ "2020-01-07 0 NaN \n",
+ "2020-01-08 0 NaN \n",
+ "2020-01-09 0 NaN \n",
+ "2020-01-10 0 NaN "
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# rule_keep_comments\n",
+ "date_range = pd.date_range(\"2020-01-01\", freq=\"D\", periods=10)\n",
+ "raw = pd.Series(index=date_range, data=np.arange(10), dtype=float)\n",
+ "comments = [\"keep\"] * 4 + [\"\"] * 3 + [\"discard\"] * 3\n",
+ "comment_series = pd.Series(index=raw.index, data=comments)\n",
+ "c14 = rlib.rule_keep_comments(raw, [\"keep\"], comment_series)\n",
+ "assert (c14[\"correction_code\"] == 99).sum() == 4\n",
+ "assert (c14[\"comparison_values\"] == \"keep\").sum() == 4\n",
+ "c14"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "artesia",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/pyproject.toml b/pyproject.toml
index a0db3bb..0477869 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "traval"
dynamic = ["version"]
-description = "Python package for applying automatic error detection algorithms to timeseries. Create custom error detection algorithms to support data validation workflows."
+description = "Python package for applying automatic error detection algorithms to time series. Create custom error detection algorithms to support data validation workflows."
license = { file = "LICENSE" }
readme = "readme.md"
authors = [{ name = "D.A. Brakenhoff" }]
@@ -66,10 +66,23 @@ packages = ["traval"]
[tool.setuptools.dynamic]
version = { attr = "traval.version.__version__" }
-[tool.black]
+[tool.ruff]
line-length = 88
+extend-include = ["*.ipynb"]
-[tool.isort]
-profile = "black"
-src_paths = ["traval"]
-line_length = 88
+[tool.ruff.lint]
+# See: https://docs.astral.sh/ruff/rules/
+select = [
+ "C4", # flake8-comprehensions
+ "E", # pycodestyle
+ "F", # pyflakes
+ "I", # isort
+ "PT", # pytest-style
+ "D", # pydocstyle
+ "B", # flake8-bugbear
+ "NPY", # numpy
+]
+ignore = ["D100", "D102", "D103", "D401"]
+
+[tool.ruff.lint.pydocstyle]
+convention = "numpy"
diff --git a/readme.md b/readme.md
index f3e57de..6e34fa1 100644
--- a/readme.md
+++ b/readme.md
@@ -6,63 +6,71 @@
# traval
-Tools for applying automatic error detection algorithms to timeseries.
+Tools for applying automatic error detection algorithms to time series.
## Introduction
-This module is set up to provide tools for applying any error detection
-algorithm to any timeseries. The module consists of three main components:
+This module is set up to provide tools for applying any error detection
+algorithm to any time series. The module consists of three main components:
-- `RuleSet`: the RuleSet object is a highly flexible object for defining error detection algorithms based on (user-defined) functions.
-- `Detector`: a data management object for storing timeseries and error detection results.
-- `SeriesComparison*`: objects for comparing timeseries. These objects include plots for visualizing the comparisons.
+- `RuleSet`: the RuleSet object is a highly flexible object for defining error
+ detection algorithms based on (user-defined) functions.
+- `Detector`: a data management object for storing time series and error detection
+ results.
+- `SeriesComparison*`: objects for comparing time series. These objects include plots
+ for visualizing the comparisons.
The general workflow consists of the following steps:
-1. Define error detection algorithm(s).
-2. Load data, i.e. raw timeseries data and optionally timeseries representing the "truth" to see how well the algorithms perform.
-3. Initialize Detector objects and apply algorithms to timeseries.
-4. Store and analyze the results.
+1. Define error detection algorithm(s).
+2. Load data, i.e. raw time series data and optionally time series representing the
+ "truth" to see how well the algorithms perform.
+3. Initialize Detector objects and apply algorithms to time series.
+4. Store and analyze the results.
-For more detailed information and examples, please refer to the notebooks in
+For more detailed information and examples, please refer to the notebooks in
the examples directory.
## Installation
To install the traval module, follow these steps:
-1. Clone the repository from GitHub.
-2. Open a terminal and navigate to the module root directory: `/traval`
-3. Type `pip install -e .`
+1. Clone the repository from GitHub.
+2. Open a terminal and navigate to the module root directory: `/traval`
+3. Type `pip install -e .`
## Usage
-The basic usage of the module is described below. To start using the module,
+The basic usage of the module is described below. To start using the module,
import the package:
```python
->>> import traval
+import traval
```
-The first step is generally to define an error detection algorithm. This is
+The first step is generally to define an error detection algorithm. This is
done with the `RuleSet` object:
```python
->>> ruleset = traval.RuleSet("my_first_algorithm")
+ruleset = traval.RuleSet("my_first_algorithm")
```
-Add a detection rule (using a general rule from the library contained within
+Add a detection rule (using a general rule from the library contained within
the module). In this case the rule states any value above 10.0 is suspect:
```python
->>> ruleset.add_rule("rule1", traval.rulelib.rule_ufunc_threshold , apply_to=0,
- kwargs={"ufunc": (np.greater,), "threshold": 10.0})
+ruleset.add_rule(
+ "rule1",
+ traval.rulelib.rule_ufunc_threshold,
+ apply_to=0,
+ kwargs={"ufunc": (np.greater,), "threshold": 10.0}
+)
```
Take a look at the ruleset by just typing `ruleset`:
```python
->>> ruleset
+ruleset
```
```text
@@ -71,25 +79,25 @@ RuleSet: 'my_first_algorithm'
1: rule1 0
```
-Next define a Detector object. This object is designed to store a timeseries
-and the intermediate and final results after applying an error detection
-algorithm. Initialize the Detector object with some timeseries. In this example
-we assume there is a timeseries called `raw_series`:
+Next define a Detector object. This object is designed to store a time series
+and the intermediate and final results after applying an error detection
+algorithm. Initialize the Detector object with some time series. In this example
+we assume there is a time series called `raw_series`:
```python
>>> detect = traval.Detector(raw_series)
```
-Apply our first algorithm to the timeseries.
+Apply our first algorithm to the time series.
```python
>>> detect.apply_ruleset(ruleset)
```
-By default, the result of each step in the algorithm is compared to the
-original series and stored in the `detect.comparisons` attribute. Take a
-look at the comparison between the raw data and the result of the error
-detection algorithm.
+By default, the result of each step in the algorithm is compared to the
+original series and stored in the `detect.comparisons` attribute. Take a
+look at the comparison between the raw data and the result of the error
+detection algorithm.
Since we only defined one step, step 1 represents the final result.
@@ -97,7 +105,7 @@ Since we only defined one step, step 1 represents the final result.
>>> cp = detect.comparisons[1] # result of step 1 = final result
```
-The `SeriesComparison*` objects contain methods to visualize the comparison,
+The `SeriesComparison*` objects contain methods to visualize the comparison,
or summarize the number of observations in each category:
```python
@@ -105,9 +113,9 @@ or summarize the number of observations in each category:
>>> cp.summary # series containing number of observations in each category
```
-For more detailed explanation and more complex examples, see the notebook(s)
+For more detailed explanation and more complex examples, see the notebook(s)
in the examples directory.
## Author
-- D.A. Brakenhoff, Artesia, 2020
+- D.A. Brakenhoff, Artesia, 2020
diff --git a/tests/test_001.py b/tests/test_001.py
index b83ffc9..ba2dea7 100644
--- a/tests/test_001.py
+++ b/tests/test_001.py
@@ -1,2 +1,3 @@
+# ruff: noqa: D100 D103
def test_import():
- import traval
+ import traval  # noqa: F401
diff --git a/tests/test_002_ruleset.py b/tests/test_002_ruleset.py
index f95790d..0306bcc 100644
--- a/tests/test_002_ruleset.py
+++ b/tests/test_002_ruleset.py
@@ -1,19 +1,21 @@
+# ruff: noqa: D100 D103
import numpy as np
import pandas as pd
+
import traval
from traval.ruleset import RuleSet
def func1(s):
mask = s > 10
- s = pd.Series(index=s.index, data=0.0)
+ s = pd.DataFrame(index=s.index, data=0.0, columns=["correction_code"])
s.loc[mask] = np.nan
return s
def func2(s, val):
mask = s < val
- s = pd.Series(index=s.index, data=0.0)
+ s = pd.DataFrame(index=s.index, data=0.0, columns=["correction_code"])
s.loc[mask] = np.nan
return s
@@ -32,76 +34,80 @@ def func4(*args):
return result
-def test_init():
+def get_empty_ruleset():
+ return RuleSet(name="test")
+
+
+def get_filled_ruleset():
rset = traval.RuleSet(name="test")
+ rset.add_rule("gt10", func1, apply_to=0)
+ rset.add_rule("less_than_value", func2, apply_to=1, kwargs={"val": 0})
return rset
+def test_init():
+ _ = traval.RuleSet(name="test")
+
+
def test_add_rules():
- rset = test_init()
+ rset = traval.RuleSet(name="test")
rset.add_rule("gt10", func1, apply_to=0)
rset.add_rule("less_than_value", func2, apply_to=1, kwargs={"val": 0})
- return rset
def test_update_rules():
- rset = test_add_rules()
+ rset = traval.RuleSet(name="test")
+ rset.add_rule("gt10", func1, apply_to=0)
+ rset.add_rule("less_than_value", func2, apply_to=1, kwargs={"val": 0})
rset.update_rule("less_than_value", func2, apply_to=1, kwargs={"val": func3})
- return rset
def test_to_dataframe():
- rset = test_add_rules()
- rdf = rset.to_dataframe()
- return rdf
+ rset = get_filled_ruleset()
+ _ = rset.to_dataframe()
def test_applyself_static_kwargs():
series = pd.Series(index=range(10), data=range(-5, 23, 3), name="test_series")
- rset = test_add_rules()
+ rset = get_filled_ruleset()
_, _ = rset(series)
- return
def test_applyself_callable_kwargs():
series = pd.Series(index=range(10), data=range(-5, 23, 3), name="test_series")
- rset = test_update_rules()
+ rset = get_filled_ruleset()
+ rset.update_rule("less_than_value", func2, apply_to=1, kwargs={"val": func3})
_, _ = rset(series)
- return
def test_applyself_combine():
- rset = test_init()
+ rset = traval.RuleSet(name="test")
rset.add_rule("+1", lambda s: s + 1, apply_to=0)
rset.add_rule("add 0+1", func4, apply_to=(0, 1))
series = pd.Series(index=range(10), data=0.0, name="test_series")
d, _ = rset(series)
assert (d[len(d) - 1] == 1.0).all()
- return d
def test_del_rules():
- rset = test_add_rules()
+ rset = get_filled_ruleset()
rset.del_rule("gt10")
assert len(rset.rules) == 1
- return
def test_to_from_pickle():
- rset = test_add_rules()
+ rset = get_filled_ruleset()
rset.to_pickle("test.pkl")
rset = RuleSet.from_pickle("test.pkl")
import os
os.remove("test.pkl")
- return
def test_to_from_json():
- rset = test_add_rules()
+ rset = get_filled_ruleset()
rset.to_json("test.json")
rset = RuleSet.from_json("test.json")
import os
os.remove("test.json")
- return
diff --git a/tests/test_003_detector.py b/tests/test_003_detector.py
index 1b91318..e8588d1 100644
--- a/tests/test_003_detector.py
+++ b/tests/test_003_detector.py
@@ -1,22 +1,42 @@
+# ruff: noqa: D100 D103
import numpy as np
import pandas as pd
+from test_002_ruleset import get_filled_ruleset
+
import traval
-from test_002_ruleset import test_add_rules
+
+def get_detector():
+ s = pd.Series(
+ index=range(10), data=np.arange(-5, 23, 3, dtype=float), name="test_series"
+ )
+ return traval.Detector(s)
+
+
+def get_detector_with_result():
+ # Detector with a truth series set and the filled ruleset already applied.
+ rset = get_filled_ruleset()
+ t = pd.Series(
+ index=range(10), data=np.arange(-5, 23, 3, dtype=float), name="test_series"
+ )
+ t[t < 0] = np.nan
+ t[t > 10] = np.nan
+ d = get_detector()
+ d.set_truth(t)
+ d.apply_ruleset(rset)
+ return d
def test_init_detector():
s = pd.Series(
index=range(10), data=np.arange(-5, 23, 3, dtype=float), name="test_series"
)
- d = traval.Detector(s)
- return d
+ traval.Detector(s)
def test_repr():
- d = test_init_detector()
+ d = get_detector()
d.__repr__()
- return d
def test_add_truth():
@@ -25,56 +45,53 @@ def test_add_truth():
)
t[t < 0] = np.nan
t[t > 10] = np.nan
- d = test_init_detector()
+ d = get_detector()
d.set_truth(t)
- return d
def test_apply_ruleset():
- rset = test_add_rules()
- d = test_add_truth()
+ rset = get_filled_ruleset()
+ t = pd.Series(
+ index=range(10), data=np.arange(-5, 23, 3, dtype=float), name="test_series"
+ )
+ t[t < 0] = np.nan
+ t[t > 10] = np.nan
+ d = get_detector()
+ d.set_truth(t)
d.apply_ruleset(rset)
- return d
def test_reset():
- d = test_apply_ruleset()
+ d = get_detector_with_result()
d.reset()
assert not hasattr(d, "ts_result")
- return
def test_confusion_matrix():
- d = test_apply_ruleset()
+ d = get_detector_with_result()
_ = d.confusion_matrix()
- return
def test_uniqueness():
- d = test_apply_ruleset()
+ d = get_detector_with_result()
_ = d.uniqueness()
- return
def test_plot_overview():
- d = test_apply_ruleset()
+ d = get_detector_with_result()
_ = d.plot_overview()
- return
def test_get_series():
- d = test_apply_ruleset()
+ d = get_detector_with_result()
_ = d.get_series(2, category="tp")
- return
def test_get_corrections():
- d = test_apply_ruleset()
+ d = get_detector_with_result()
_ = d.get_corrections_dataframe()
- return
def test_get_final_result():
- d = test_apply_ruleset()
+ d = get_detector_with_result()
_ = d.get_final_result()
- return
diff --git a/tests/test_004_comparison.py b/tests/test_004_comparison.py
index b7199cc..082fbae 100644
--- a/tests/test_004_comparison.py
+++ b/tests/test_004_comparison.py
@@ -1,5 +1,7 @@
+# ruff: noqa: D100 D103
import numpy as np
import pandas as pd
+
import traval
@@ -12,8 +14,7 @@ def test_series_comparison():
s2 = pd.Series(index=idx2, data=2.0)
s2.loc["2020-04-01":"2020-04-30"] = np.nan
- sc = traval.SeriesComparison(s1, s2)
- return sc
+ _ = traval.SeriesComparison(s1, s2)
def test_series_relative_comparison():
@@ -72,8 +73,6 @@ def test_series_relative_comparison():
for k, v in checkresult.items():
assert summary.loc[k] == v
- return scr
-
def test_relative_comparison_stats():
base_idx = pd.date_range("2020-01-01", periods=110, freq="D")
@@ -94,9 +93,6 @@ def test_relative_comparison_stats():
assert scr.bc.false_positive_rate + scr.bc.specificity == 1
assert scr.bc.false_negative_rate + scr.bc.sensitivity == 1
- return scr
-
-def test_confusion_matrix():
- cp = test_relative_comparison_stats()
- return cp.bc.confusion_matrix()
+ # test confusion matrix
+ scr.bc.confusion_matrix()
diff --git a/tests/test_005_plots.py b/tests/test_005_plots.py
index 1181485..b8996dc 100644
--- a/tests/test_005_plots.py
+++ b/tests/test_005_plots.py
@@ -1,5 +1,7 @@
+# ruff: noqa: D100 D103
import numpy as np
import pandas as pd
+
from traval import SeriesComparison, SeriesComparisonRelative
@@ -20,10 +22,9 @@ def test_series_comparison_plot():
sc = SeriesComparison(s1, s2)
- ax = sc.plots.plot_series_comparison(
+ sc.plots.plot_series_comparison(
mark_different=True, mark_identical=True, mark_unique=True
)
- return ax
def test_relative_series_comparison_plot():
@@ -43,8 +44,6 @@ def test_relative_series_comparison_plot():
scr = SeriesComparisonRelative(s1, s2, b)
- ax = scr.plots.plot_relative_comparison(
+ scr.plots.plot_relative_comparison(
mark_unique=True, mark_different=True, mark_identical=True, mark_introduced=True
)
-
- return ax
diff --git a/tests/test_006_rulelib.py b/tests/test_006_rulelib.py
index efd945b..d37cbeb 100644
--- a/tests/test_006_rulelib.py
+++ b/tests/test_006_rulelib.py
@@ -1,6 +1,7 @@
+# ruff: noqa: D100 D103
import numpy as np
import pandas as pd
-import pytest
+
from traval import rulelib as rlib
@@ -9,8 +10,7 @@ def test_rule_ufunc_threshold_float():
date_range = pd.date_range("2020-01-01", freq="D", periods=10)
s1 = pd.Series(index=date_range, data=np.arange(10))
c1 = rlib.rule_ufunc_threshold(s1, (np.greater_equal,), 5)
- assert c1.iloc[5:].isna().sum() == 5
- return c1
+ assert (c1["correction_code"] == 2).sum() == 5
def test_rule_ufunc_threshold_series():
@@ -20,8 +20,7 @@ def test_rule_ufunc_threshold_series():
idx = date_range[:3].to_list() + date_range[-4:-1].to_list()
thresh_series = pd.Series(index=idx, data=5.0)
c2 = rlib.rule_ufunc_threshold(s1, (np.greater_equal,), thresh_series)
- assert c2.iloc[5:].isna().sum() == 5
- return c2
+ assert (c2["correction_code"] == 2).sum() == 5
def test_rule_diff_ufunc_threshold():
@@ -30,8 +29,7 @@ def test_rule_diff_ufunc_threshold():
s1 = pd.Series(index=date_range, data=np.arange(10))
s1.loc[date_range[4]] += 1
c3 = rlib.rule_diff_ufunc_threshold(s1, (np.greater_equal,), 1.1)
- assert c3.iloc[4:5].isna().all()
- return c3
+ assert (c3["correction_code"] == 2).sum() == 1
def test_rule_other_ufunc_threshold():
@@ -40,8 +38,7 @@ def test_rule_other_ufunc_threshold():
s1 = pd.Series(index=date_range, data=np.arange(10))
val = s1.copy()
c4 = rlib.rule_other_ufunc_threshold(s1, val, (np.less,), 5)
- assert c4.iloc[:5].isna().sum() == 5
- return c4
+ assert (c4["correction_code"] == -2).sum() == 5
def test_rule_max_gradient():
@@ -50,8 +47,7 @@ def test_rule_max_gradient():
s1 = pd.Series(index=date_range, data=np.arange(10))
s1.loc[date_range[4]] += 1
c5 = rlib.rule_max_gradient(s1, max_step=1.0, max_timestep="1D")
- assert c5.iloc[4:5].isna().all()
- return c5
+ assert (c5["correction_code"] == 2).sum() == 1
def test_rule_spike_detection():
@@ -60,8 +56,7 @@ def test_rule_spike_detection():
s1 = pd.Series(index=date_range, data=np.arange(10))
s1.iloc[4] += 3
c6 = rlib.rule_spike_detection(s1, threshold=2, spike_tol=2)
- assert c6.iloc[4:5].isna().all()
- return c6
+ assert (c6["correction_code"] == 99).sum() == 1
def test_offset_detection():
@@ -70,8 +65,7 @@ def test_offset_detection():
s1 = pd.Series(index=date_range, data=np.arange(10))
s1.iloc[3:7] += 10
c7 = rlib.rule_offset_detection(s1, threshold=5, updown_diff=2.0)
- assert c7.iloc[3:7].isna().sum() == 4
- return c7
+ assert (c7["correction_code"] == 99).sum() == 4
def test_rule_outside_n_sigma():
@@ -79,8 +73,8 @@ def test_rule_outside_n_sigma():
date_range = pd.date_range("2020-01-01", freq="D", periods=10)
s1 = pd.Series(index=date_range, data=np.arange(10))
c8 = rlib.rule_outside_n_sigma(s1, n=1.0)
- assert c8.iloc[[0, 1, 8, 9]].isna().sum() == 4
- return c8
+ assert (c8["correction_code"] == -2).sum() == 2
+ assert (c8["correction_code"] == 2).sum() == 2
def test_rule_diff_outside_of_n_sigma():
@@ -88,9 +82,8 @@ def test_rule_diff_outside_of_n_sigma():
date_range = pd.date_range("2020-01-01", freq="D", periods=10)
s1 = pd.Series(index=date_range, data=np.arange(10))
s1.iloc[5:] += np.arange(5)
- c9 = rlib.rule_diff_outside_of_n_sigma(s1, 1.0)
- assert c9.iloc[6:].isna().sum() == 4
- return c9
+ c9 = rlib.rule_diff_outside_of_n_sigma(s1, 2.0)
+ assert (c9["correction_code"] == 2).sum() == 4
def test_rule_outside_bandwidth():
@@ -100,8 +93,19 @@ def test_rule_outside_bandwidth():
lb = pd.Series(index=date_range[[0, -1]], data=[1, 2])
ub = pd.Series(index=date_range[[0, -1]], data=[7, 8])
c10 = rlib.rule_outside_bandwidth(s1, lb, ub)
- assert c10.iloc[[0, 1, 8, 9]].isna().sum() == 4
- return c10
+ assert (c10["correction_code"] == -2).sum() == 2
+ assert (c10["correction_code"] == 2).sum() == 2
+
+
+def test_rule_compare_to_manual_obs():
+ # rule_compare_to_manual_obs
+ date_range = pd.date_range("2020-01-01", freq="D", periods=10)
+ s1 = pd.Series(index=date_range, data=np.arange(10))
+ h = pd.Series(index=date_range[[1, -1]], data=[2, 7])
+ c11 = rlib.rule_compare_to_manual_obs(
+ s1, h, threshold=1.0, max_dt="2D", method="linear"
+ )
+ assert (c11["correction_code"] == -2).sum() == 3
def test_rule_shift_to_manual_obs():
@@ -112,7 +116,6 @@ def test_rule_shift_to_manual_obs():
a = rlib.rule_shift_to_manual_obs(s1, h, max_dt="2D")
assert (a.iloc[1:] == s1.iloc[1:] + 1).all()
assert a.iloc[0] == s1.iloc[0]
- return a
def test_rule_combine_nan_or():
@@ -122,9 +125,18 @@ def test_rule_combine_nan_or():
s2 = s1.copy()
s1.iloc[0] = np.nan
s2.iloc[-1] = np.nan
- c11 = rlib.rule_combine_nan_or(s1, s2)
- assert c11.iloc[[0, -1]].isna().sum() == 2
- return c11
+ c11a = rlib.rule_combine_nan_or(s1, s2)
+ assert c11a.iloc[[0, -1]].isna().sum() == 2
+
+
+def test_rule_combine_corrections_or():
+ date_range = pd.date_range("2020-01-01", freq="D", periods=10)
+ s1 = pd.DataFrame(index=date_range, columns=["correction_code"], data=0)
+ s2 = s1.copy()
+ s1.iloc[0] = 99
+ s2.iloc[-1] = -2
+ c11b = rlib.rule_combine_corrections_or(s1, s2)
+ assert (c11b["correction_code"] == 99).sum() == 2
def test_rule_combine_nan_and():
@@ -134,9 +146,19 @@ def test_rule_combine_nan_and():
s2 = s1.copy()
s1.iloc[0:2] = np.nan
s2.iloc[1:3] = np.nan
- c11 = rlib.rule_combine_nan_and(s1, s2)
- assert c11.isna().sum() == 2
- return c11
+ c12a = rlib.rule_combine_nan_and(s1, s2)
+ assert c12a.isna().sum() == 2
+
+
+def test_rule_combine_corrections_and():
+ # rule_combine_corrections_and
+ date_range = pd.date_range("2020-01-01", freq="D", periods=10)
+ s1 = pd.DataFrame(index=date_range, columns=["correction_code"], data=0)
+ s2 = s1.copy()
+ s1.iloc[0:2] = 99
+ s2.iloc[1:3] = -2
+ c12b = rlib.rule_combine_corrections_and(s1, s2)
+ assert (c12b["correction_code"] == 99).sum() == 1
def test_rule_funcdict_to_nan():
@@ -144,9 +166,8 @@ def test_rule_funcdict_to_nan():
date_range = pd.date_range("2020-01-01", freq="D", periods=10)
s1 = pd.Series(index=date_range, data=np.arange(10))
fdict = {"lt_3": lambda s: s < 3.0, "gt_7": lambda s: s > 7.0}
- c12 = rlib.rule_funcdict_to_nan(s1, fdict)
- assert c12.iloc[[0, 1, 2, -2, -1]].isna().sum() == 5
- return c12
+ c13 = rlib.rule_funcdict(s1, fdict)
+ assert (c13["correction_code"] == 99).sum() == 5
def test_rule_keep_comments():
@@ -155,16 +176,6 @@ def test_rule_keep_comments():
raw = pd.Series(index=date_range, data=np.arange(10), dtype=float)
comments = ["keep"] * 4 + [""] * 3 + ["discard"] * 3
comment_series = pd.Series(index=raw.index, data=comments)
- val = raw.copy()
- val += 1.0
- val.loc[comment_series == "keep"] = np.nan
- f = rlib.rule_keep_comments(raw, ["keep"], comment_series, val)
- assert (f.loc[comment_series == "keep"] == 0).all()
- assert (f.loc[comment_series != "keep"] == 1).all()
-
-
-@pytest.mark.skip
-def test_rule_pastas_outside_pi():
- # rule_pastas_outside_pi
- # skip for now
- pass
+ c14 = rlib.rule_keep_comments(raw, ["keep"], comment_series)
+ assert (c14["correction_code"] == 99).sum() == 4
+ assert (c14["comparison_values"] == "keep").sum() == 4
diff --git a/tests/test_007_binaryclassifier.py b/tests/test_007_binaryclassifier.py
index b8467c0..b548b3f 100644
--- a/tests/test_007_binaryclassifier.py
+++ b/tests/test_007_binaryclassifier.py
@@ -1,14 +1,15 @@
+# ruff: noqa: D100 D103
from pandas import Series
+
from traval import BinaryClassifier
-def test_bc():
- bc = BinaryClassifier(9, 1, 9, 1)
- return bc
+def get_bc():
+ return BinaryClassifier(9, 1, 9, 1)
def test_all_stats():
- bc = test_bc()
+ bc = get_bc()
stats = bc.get_all_statistics()
answer = {
"tp": 9.0,
@@ -31,14 +32,12 @@ def test_all_stats():
"mcc": 0.8,
}
assert (stats == Series(answer)).all()
- return
def test_add():
- bc = test_bc()
+ bc = get_bc()
bcsum = bc + bc
assert bcsum.tp == 18
assert bcsum.fp == 2
assert bcsum.tn == 18
assert bcsum.fn == 2
- return
diff --git a/tests/test_008_travalparameters.py b/tests/test_008_travalparameters.py
index 85a2e8f..4910959 100644
--- a/tests/test_008_travalparameters.py
+++ b/tests/test_008_travalparameters.py
@@ -1,6 +1,8 @@
+# ruff: noqa: D100 D103
import os
import numpy as np
+
from traval import RuleSet, TravalParameters, rulelib
@@ -51,18 +53,17 @@ def get_ruleset2():
def test_tp_from_ruleset():
rset = get_ruleset1()
- tp = TravalParameters.from_ruleset(rset)
- return tp
+ TravalParameters.from_ruleset(rset)
def test_tp_from_ruleset_w_locations():
rset = get_ruleset1()
- tp = TravalParameters.from_ruleset(rset, locations=["loc1"])
- return tp
+ TravalParameters.from_ruleset(rset, locations=["loc1"])
def test_tp_get_parameters_defaults():
- tp = test_tp_from_ruleset()
+ rset = get_ruleset1()
+ tp = TravalParameters.from_ruleset(rset)
_ = tp.get_parameters() # return all defaults
_ = tp.get_parameters(rulename="gt10") # return all params for rule
p3 = tp.get_parameters(rulename="gt10", parameter="threshold") # value
@@ -76,11 +77,11 @@ def test_tp_get_parameters_defaults():
tp.get_parameters(rulename="gt10", parameter="non-existent-param")
except KeyError:
pass
- return
def test_tp_get_parameters_location_specific():
- tp = test_tp_from_ruleset_w_locations()
+ rset = get_ruleset1()
+ tp = TravalParameters.from_ruleset(rset, locations=["loc1"])
_ = tp.get_parameters() # return all defaults
_ = tp.get_parameters(location="loc1") # return all for location
# return loc params for rule
@@ -100,7 +101,6 @@ def test_tp_get_parameters_location_specific():
)
except KeyError:
pass
- return
def test_tp_to_from_csv():
@@ -112,7 +112,6 @@ def test_tp_to_from_csv():
mask = tp.defaults["value"].apply(lambda s: tp._test_callable(s))
assert (tp.defaults.loc[~mask].index == tp2.defaults.index).all()
assert (tp.defaults.loc[~mask, "value"] == tp2.defaults.loc[~mask, "value"]).all()
- return
def test_tp_to_from_json():
@@ -124,7 +123,6 @@ def test_tp_to_from_json():
mask = tp.defaults["value"].apply(lambda s: tp._test_callable(s))
assert (tp.defaults.loc[~mask].index == tp2.defaults.index).all()
assert (tp.defaults.loc[~mask, "value"] == tp2.defaults.loc[~mask, "value"]).all()
- return
def test_tp_to_from_pickle():
@@ -135,4 +133,3 @@ def test_tp_to_from_pickle():
os.remove("test.pkl")
assert (tp.defaults.index == tp2.defaults.index).all()
assert (tp.defaults["value"] == tp2.defaults["value"]).all()
- return
diff --git a/traval/binary_classifier.py b/traval/binary_classifier.py
index bb653e7..563b3d8 100644
--- a/traval/binary_classifier.py
+++ b/traval/binary_classifier.py
@@ -53,7 +53,7 @@ def from_series_comparison_relative(cls, comparison):
Parameters
----------
comparison : traval.SeriesComparisonRelative
- object comparing two timeseries with base timeseries
+ object comparing two time series with base time series
Returns
-------
@@ -89,7 +89,7 @@ def from_confusion_matrix(cls, cmat):
BinaryClassifier
BinaryClassifier object based on values in confusion matrix.
- See also
+ See Also
--------
BinaryClassifier.confusion_matrix : for explanation (of abbreviations)
"""
@@ -154,7 +154,6 @@ def confusion_matrix(self, as_array=False):
data : pd.DataFrame or np.array
confusion matrix
"""
-
# create array with data
data = np.zeros((2, 2), dtype=int)
# true positives = errors correctly identified
@@ -192,7 +191,7 @@ def matthews_correlation_coefficient(self):
phi : float
the Matthews correlation coefficient
- See also
+ See Also
--------
mcc : convenience method for calculating MCC
"""
@@ -218,7 +217,7 @@ def mcc(self):
phi : float
the Matthews correlation coefficient
- See also
+ See Also
--------
matthews_correlation_coefficient : more information about the statistic
"""
@@ -268,7 +267,7 @@ def specificity(self):
def true_positive_rate(self):
"""True Positive Rate. Synonym for sensitivity.
- See sensitiviy for description.
+ See sensitivity for description.
"""
return self.sensitivity
@@ -349,7 +348,7 @@ def accuracy(self):
@property
def prevalence(self):
- """Prevalance of true errors in total population.
+ """Prevalence of true errors in total population.
Prevalence = (TP + FN) / (TP + FP + FN + TN)
@@ -434,7 +433,6 @@ def get_all_statistics(self, use_abbreviations=True):
s : pandas.Series
series containing all statistics
"""
-
sdict = {}
for k, v in self.stats_abbreviations.items():
if use_abbreviations:
diff --git a/traval/detector.py b/traval/detector.py
index 8762bf6..005b9e7 100755
--- a/traval/detector.py
+++ b/traval/detector.py
@@ -5,23 +5,29 @@
import pandas as pd
from .ts_comparison import SeriesComparison, SeriesComparisonRelative
-from .ts_utils import unique_nans_in_series
+from .ts_utils import (
+ corrections_as_float,
+ corrections_as_nan,
+ mask_corrections_modified_value,
+ mask_corrections_no_comparison_value,
+ unique_nans_in_series,
+)
class Detector:
- """Detector object for applying error detection algorithms to timeseries.
+ """Detector object for applying error detection algorithms to time series.
- The Detector is used to apply error detection algorithms to a timeseries
+ The Detector is used to apply error detection algorithms to a time series
and optionally contains a 'truth' series, to which the error detection
result can be compared. An example of a 'truth' series is a manually
- validated timeseries. Custom error detection algorithms can be defined
+ validated time series. Custom error detection algorithms can be defined
using the RuleSet object.
Parameters
----------
series : pd.Series or pd.DataFrame
- timeseries to check
+ time series to check
truth : pd.Series or pd.DataFrame, optional
series that represents the 'truth', i.e. a benchmark to which
the error detection result can be compared, by default None
@@ -29,15 +35,14 @@ class Detector:
Examples
--------
-
- Given a timeseries 'series' and some ruleset 'rset':
+ Given a time series 'series' and some ruleset 'rset':
>>> d = Detector(series)
>>> d.apply_ruleset(rset)
>>> d.plot_overview()
- See also
+ See Also
--------
traval.RuleSet : object for defining detection algorithms
"""
@@ -48,7 +53,7 @@ def __init__(self, series, truth=None):
Parameters
----------
series : pd.Series or pd.DataFrame
- timeseries to check
+ time series to check
truth : pd.Series or pd.DataFrame, optional
series that represents the 'truth', i.e. a benchmark to which
the error detection result can be compared, by default None
@@ -77,7 +82,7 @@ def _validate_input_series(series):
Parameters
----------
series : object
- timeseries to check, must be pd.Series or pd.DataFrame. Datatype
+ time series to check, must be pd.Series or pd.DataFrame. Datatype
of series or first column of DataFrame must be float.
Raises
@@ -85,7 +90,6 @@ def _validate_input_series(series):
TypeError
if series or dtype of series does not comply
"""
-
# check pd.Series or pd.DataFrame
if isinstance(series, pd.Series):
dtype = series.dtypes
@@ -123,7 +127,7 @@ def apply_ruleset(self, ruleset, compare=True):
for convenience.
- See also
+ See Also
--------
traval.RuleSet : object for defining detection algorithms
"""
@@ -238,8 +242,8 @@ def confusion_matrix(self, steps=None, truth=None):
def uniqueness(self, truth=None):
"""Calculate unique contribution per rule to stats.
- Note: the calculated statistics per rule contain an undercount,
- i.e. when multiple rules mark the same observatin as suspect it is
+ Note: the calculated statistics per rule are undercounted,
+ i.e. when multiple rules mark the same observation as suspect it is
not contained in this result.
Parameters
@@ -388,7 +392,7 @@ def get_comment_series(self, steps=None):
rulenames = [self.ruleset.get_step_name(i) for i in steps]
# get corrections
- corr = self.get_corrections_dataframe()
+ corr = self.get_corrections_dataframe(as_correction_codes=True)
if corr.empty:
corr = pd.DataFrame(index=self.series.index, columns=rulenames, data=0.0)
@@ -397,8 +401,8 @@ def get_comment_series(self, steps=None):
comments = []
for col in corr.columns:
- s = corr[col].copy()
- s = s.replace(0.0, "").replace(np.nan, col)
+ s = pd.Series(index=corr.index, data=col)
+ s.loc[corr[col] == 0] = ""
comments.append(s)
comments = pd.concat(comments, axis=1).apply(
@@ -422,12 +426,12 @@ def get_results_dataframe(self):
return df
def get_final_result(self):
- """Get final timeseries with flagged values set to NaN.
+ """Get final time series with flagged values set to NaN.
Returns
-------
series : pandas.Series
- Timeseries produced by final step in RuleSet with flagged
+ time series produced by final step in RuleSet with flagged
values set to NaN.
"""
key = len(self.results.keys()) - 1
@@ -435,29 +439,54 @@ def get_final_result(self):
s.name = self.name
return s
- def get_corrections_dataframe(self):
+ def get_corrections_dataframe(self, as_correction_codes=False, as_addable_df=False):
"""Get DataFrame containing corrections.
+ Parameters
+ ----------
+ as_correction_codes : bool, optional
+ return DataFrame with correction codes, by default False
+ as_addable_df : bool, optional
+ return corrections as a DataFrame that can be added to the original
+ time series to obtain the final result. Corrections are NaN where errors are
+ detected, nonzero where observations are shifted, and zero everywhere
+ else.
+
Returns
-------
df : pandas.DataFrame
- DataFrame containing corrections. NaN means value is flagged
- as suspicious, 0.0 means no correction.
+ DataFrame containing corrections.
"""
+ if as_correction_codes and as_addable_df:
+ raise ValueError(
+ "Only one of 'as_correction_codes' and 'as_addable_df' can be True!"
+ )
clist = []
for s in self.corrections.values():
if isinstance(s, np.ndarray):
- s = pd.Series(dtype=float)
- clist.append(s.fillna(-9999))
-
- # corrections are nan, 0.0 means nothing is changed
- df = (
- pd.concat(clist, axis=1)
- .isna()
- .astype(float)
- .replace(0.0, np.nan)
- .replace(1.0, 0.0)
- )
+ if as_addable_df:
+ s = pd.Series()
+ else:
+ s = pd.Series(name="correction_code")
+ elif isinstance(s, pd.DataFrame) and "correction_code" in s.columns:
+ if as_addable_df:
+ s = corrections_as_nan(s) + corrections_as_float(s)
+ else:
+ s = s["correction_code"]
+ elif isinstance(s, pd.Series):
+ if as_correction_codes:
+ s = mask_corrections_no_comparison_value(s, s.isna()).add(
+ mask_corrections_modified_value(s, s, (s.notnull() & (s != 0.0))),
+ fill_value=0,
+ )
+ s = s["correction_code"]
+
+ clist.append(s)
+
+ # corrections, 0 means nothing is changed, nan means value is missing
+ df = pd.concat(clist, axis=1)
+ if as_correction_codes:
+ df = df.infer_objects(copy=False).fillna(0).astype(int)
df.columns = list(self.ruleset.rules.keys())
return df
@@ -506,7 +535,7 @@ def get_corrections_comparison(self, truth=None):
return df
def plot_overview(self, mark_suspects=True, **kwargs):
- """Plot timeseries with flagged values per applied rule.
+ """Plot time series with flagged values per applied rule.
Parameters
----------
@@ -518,8 +547,6 @@ def plot_overview(self, mark_suspects=True, **kwargs):
ax : list of matplotlib.pyplot.Axes
axes objects
"""
- resultsdf = self.get_results_dataframe()
-
if "figsize" in kwargs:
figsize = kwargs.pop("figsize")
else:
@@ -534,16 +561,17 @@ def plot_overview(self, mark_suspects=True, **kwargs):
**kwargs,
)
- for iax, icol in zip(axes, resultsdf):
- iax.plot(resultsdf.index, resultsdf[icol], label=icol)
+ for icol, iax in enumerate(axes):
+ iresult = self.results[icol]
+ iax.plot(iresult.index, iresult, label=self.ruleset.get_step_name(icol))
if mark_suspects:
- if icol != resultsdf.columns[0]:
- corr = self.corrections[resultsdf.columns.get_loc(icol)]
- if isinstance(corr, pd.Series):
+ if icol != 0:
+ icorr = self.corrections[icol]
+ if isinstance(icorr, pd.DataFrame):
iax.plot(
- corr.index,
- resultsdf.loc[corr.index].iloc[:, 0],
+ icorr.index,
+ self.results[0].loc[icorr.index],
marker="x",
c="C3",
ls="none",
diff --git a/traval/params.py b/traval/params.py
index d61e2a3..90a2749 100644
--- a/traval/params.py
+++ b/traval/params.py
@@ -409,7 +409,7 @@ def _combine_parameter_dfs(self):
@staticmethod
def _test_callable(f):
- """Method to test whether parameter value is a callable.
+ """Test whether parameter value is a callable.
Also returns True if callable is stored in a tuple.
diff --git a/traval/plots.py b/traval/plots.py
index 4d761fd..7edf91f 100755
--- a/traval/plots.py
+++ b/traval/plots.py
@@ -6,7 +6,7 @@
class ComparisonPlots:
- """Mix-in class for plots for comparing timeseries."""
+ """Mix-in class for plots for comparing time series."""
color_dict = {
"only_in_s1": {"color": "orange"},
@@ -60,16 +60,16 @@ def reset_color_dict(self):
def plot_series_comparison(
self, mark_unique=True, mark_different=True, mark_identical=True, ax=None
):
- """Plot comparison between two timeseries.
+ """Plot comparison between two time series.
Parameters
----------
mark_unique : bool, optional
mark unique values with colored X's, by default True
mark_different : bool, optional
- highlight where timeseries differ with red, by default True
+ highlight where time series differ with red, by default True
mark_identical : bool, optional
- highlight where timeseries are identical with green,
+ highlight where time series are identical with green,
by default True
ax : axis, optional
axis object to plot on, by default None
@@ -79,7 +79,6 @@ def plot_series_comparison(
ax : axis
axis object
"""
-
if ax is None:
fig, ax = plt.subplots(1, 1, figsize=(12, 5))
else:
@@ -174,7 +173,7 @@ def plot_relative_comparison(
mark_introduced=False,
ax=None,
):
- """Plot comparison between two timeseries relative to base timeseries.
+ """Plot comparison between two time series relative to base time series.
Parameters
----------
@@ -185,7 +184,7 @@ def plot_relative_comparison(
mark_identical : bool, optional
highlight where series are identical with green, by default True
mark_introduced : bool, optional
- mark observations that are not in the base timeseries with X's,
+ mark observations that are not in the base time series with X's,
by default False
ax : axis, optional
axis to plot on, by default None
@@ -195,7 +194,6 @@ def plot_relative_comparison(
ax : axis
axis handle
"""
-
ax = self.plot_series_comparison(
mark_unique=mark_unique,
mark_different=mark_different,
@@ -479,7 +477,6 @@ def det_plot(fpr, fnr, labels, ax=None, **kwargs):
ax : matplotlib.pyplot.Axes
axes handle
"""
-
if not isinstance(fpr, list):
fpr = [fpr]
if not isinstance(fnr, list):
diff --git a/traval/rulelib.py b/traval/rulelib.py
index 04024cb..6e12c9d 100755
--- a/traval/rulelib.py
+++ b/traval/rulelib.py
@@ -5,9 +5,16 @@
import pandas as pd
from .ts_utils import (
+ CorrectionCode,
diff_with_gap_awareness,
+ get_empty_corrections_df,
interpolate_series_to_new_index,
- mask_corrections_as_nan,
+ mask_corrections_above_below,
+ mask_corrections_above_threshold,
+ mask_corrections_below_threshold,
+ mask_corrections_equal_value,
+ mask_corrections_no_comparison_value,
+ mask_corrections_not_equal_value,
resample_short_series_to_long_series,
smooth_lower_bound,
smooth_upper_bound,
@@ -15,7 +22,18 @@
)
-def rule_funcdict_to_nan(series, funcdict):
+def _ufunc_corrections(series, ufunc, threshold, mask):
+ if "greater" in ufunc.__name__:
+ return mask_corrections_above_threshold(series, threshold, mask)
+ elif "less" in ufunc.__name__:
+ return mask_corrections_below_threshold(series, threshold, mask)
+ elif ufunc.__name__ == "equal":
+ return mask_corrections_equal_value(series, threshold, mask)
+ else:
+ return mask_corrections_not_equal_value(series, threshold, mask)
+
+
+def rule_funcdict(series, funcdict):
"""Detection rule, flag values with dictionary of functions.
Use dictionary of functions to identify suspect values and set
@@ -24,17 +42,17 @@ def rule_funcdict_to_nan(series, funcdict):
Parameters
----------
series : pd.Series
- timeseries in which suspect values are identified
+ time series in which suspect values are identified
funcdict : dict
dictionary with function names as keys and functions/methods as
- values. Each function is applied to each value in the timeseries
+ values. Each function is applied to each value in the time series
using `series.apply(func)`. Suspect values are those where
the function evaluates to True.
Returns
-------
corrections: pd.Series
- a series with same index as the input timeseries containing
+ a series with same index as the input time series containing
corrections. Suspect values (according to the provided functions)
are set to np.nan.
"""
@@ -43,19 +61,19 @@ def rule_funcdict_to_nan(series, funcdict):
mask = series.apply(func)
else:
mask = or_(mask, series.apply(func))
- return mask_corrections_as_nan(series, mask)
+ return mask_corrections_no_comparison_value(series, mask)
def rule_max_gradient(series, max_step=0.5, max_timestep="1D"):
"""Detection rule, flag values when maximum gradient exceeded.
- Set values tot NaN when maximum gradient between two
- observations is exceeded.
+ Flag values when maximum gradient between two observations is exceeded.
+ Pass a negative max_step to instead flag gradients below that (negative) value.
Parameters
----------
series : pd.Series
- timeseries in which suspect values are identified
+ time series in which suspect values are identified
max_step : float, optional
max jump between two observations within given timestep,
by default 0.5
@@ -66,15 +84,19 @@ def rule_max_gradient(series, max_step=0.5, max_timestep="1D"):
Returns
-------
corrections: pd.Series
- a series with same index as the input timeseries containing
+ a series with same index as the input time series containing
corrections. Suspect values are set to np.nan.
"""
- conversion = pd.Timedelta(max_timestep) / pd.Timedelta("1S")
+ conversion = pd.Timedelta(max_timestep) / pd.Timedelta("1s")
grad = (
series.diff() / series.index.to_series().diff().dt.total_seconds() * conversion
)
- mask = grad.abs() > max_step
- return mask_corrections_as_nan(series, mask)
+ if max_step > 0.0:
+ mask = grad > max_step
+ return mask_corrections_above_threshold(series, max_step, mask)
+ else:
+ mask = grad < max_step
+ return mask_corrections_below_threshold(series, max_step, mask)
def rule_hardmax(series, threshold, offset=0.0):
@@ -100,13 +122,13 @@ def rule_ufunc_threshold(series, ufunc, threshold, offset=0.0):
Parameters
----------
series : pd.Series
- timeseries in which suspect values are identified
+ time series in which suspect values are identified
ufunc : tuple
tuple containing ufunc (i.e. (numpy.greater_equal,) ). The function
must be callable according to `ufunc(series, threshold)`. The function
is passed as a tuple to bypass RuleSet logic.
threshold : float or pd.Series
- value or timeseries to compare series with
+ value or time series to compare series with
offset : float, optional
value that is added to the threshold, e.g. if some extra tolerance is
allowable. Default value is 0.0.
@@ -114,16 +136,18 @@ def rule_ufunc_threshold(series, ufunc, threshold, offset=0.0):
Returns
-------
corrections: pd.Series
- a series with same index as the input timeseries containing
+ a series with same index as the input time series containing
corrections. Suspect values are set to np.nan.
"""
ufunc = ufunc[0]
if isinstance(threshold, pd.Series):
full_threshold_series = resample_short_series_to_long_series(threshold, series)
+ threshold = full_threshold_series.add(offset)
mask = ufunc(series, full_threshold_series.add(offset))
else:
- mask = ufunc(series, threshold + offset)
- return mask_corrections_as_nan(series, mask)
+ threshold = threshold + offset
+ mask = ufunc(series, threshold)
+ return _ufunc_corrections(series, ufunc, threshold, mask)
def rule_diff_ufunc_threshold(series, ufunc, threshold, max_gap="7D"):
@@ -142,13 +166,13 @@ def rule_diff_ufunc_threshold(series, ufunc, threshold, max_gap="7D"):
Parameters
----------
series : pd.Series
- timeseries in which suspect values are identified
+ time series in which suspect values are identified
ufunc : tuple
tuple containing ufunc (i.e. (numpy.greater_equal,) ). The function
must be callable according to `ufunc(series, threshold)`. The function
is passed as a tuple to bypass RuleSet logic.
threshold : float
- value to compare diff of timeseries to
+ value to compare diff of time series to
max_gap : str, optional
only considers observations within this maximum gap
between measurements to calculate diff, by default "7D".
@@ -156,21 +180,20 @@ def rule_diff_ufunc_threshold(series, ufunc, threshold, max_gap="7D"):
Returns
-------
corrections: pd.Series
- a series with same index as the input timeseries containing
+ a series with same index as the input time series containing
corrections. Suspect values are set to np.nan.
"""
ufunc = ufunc[0]
# identify gaps and set diff value after gap to nan
diff = diff_with_gap_awareness(series, max_gap=max_gap)
- mask = ufunc(diff.abs(), threshold)
- return mask_corrections_as_nan(series, mask)
+ mask = ufunc(diff, threshold)
+ return _ufunc_corrections(series, ufunc, threshold, mask)
def rule_other_ufunc_threshold(series, other, ufunc, threshold):
"""Detection rule, flag values based on other series and threshold.
- Set values to Nan based on comparison of another timeseries with a
- threshold value.
+ Correct values based on comparison of another time series with a threshold value.
The argument ufunc is a tuple containing an operator function (i.e. '>',
'<', '>=', '<='). These are passed using their named equivalents, e.g. in
@@ -181,44 +204,45 @@ def rule_other_ufunc_threshold(series, other, ufunc, threshold):
Parameters
----------
series : pd.Series
- timeseries in which suspect values are identified, only used
+ time series in which suspect values are identified, only used
to test if index of other overlaps
other : pd.Series
- other timeseries based on which suspect values are identified
+ other time series based on which suspect values are identified
ufunc : tuple
tuple containing ufunc (i.e. (numpy.greater_equal,) ). The function
must be callable according to `ufunc(series, threshold)`. The function
is passed as a tuple to bypass RuleSet logic.
threshold : float
- value to compare timeseries to
+ value to compare time series to
Returns
-------
corrections: pd.Series
- a series with same index as the input timeseries containing
+ a series with same index as the input time series containing
corrections. Suspect values are set to np.nan.
"""
ufunc = ufunc[0]
mask = ufunc(other, threshold)
shared_idx = series.index.intersection(other.loc[mask].index)
- return mask_corrections_as_nan(series, shared_idx)
+ other_values = other.reindex(series.index).loc[series.index]
+ return _ufunc_corrections(other_values, ufunc, threshold, shared_idx)
def rule_spike_detection(series, threshold=0.15, spike_tol=0.15, max_gap="7D"):
- """Detection rule, identify spikes in timeseries and set to NaN.
+ """Detection rule, identify spikes in time series and set to NaN.
- Spikes are sudden jumps in the value of a timeseries that last 1 timestep.
+ Spikes are sudden jumps in the value of a time series that last 1 timestep.
They can be both negative or positive.
Parameters
----------
series : pd.Series
- timeseries in which suspect values are identified
+ time series in which suspect values are identified
threshold : float, optional
the minimum size of the jump to qualify as a spike, by default 0.15
spike_tol : float, optional
- offset between value of timeseries before spike and after spike,
- by default 0.15. After a spike, the value of the timeseries is usually
+ offset between value of time series before spike and after spike,
+ by default 0.15. After a spike, the value of the time series is usually
close to but not identical to the value that preceded the spike. Use
this parameter to control how close the value has to be.
max_gap : str, optional
@@ -228,14 +252,14 @@ def rule_spike_detection(series, threshold=0.15, spike_tol=0.15, max_gap="7D"):
Returns
-------
corrections: pd.Series
- a series with same index as the input timeseries containing
+ a series with same index as the input time series containing
corrections. Suspect values are set to np.nan.
"""
upspikes, downspikes = spike_finder(
series, threshold=threshold, spike_tol=spike_tol, max_gap=max_gap
)
mask = upspikes.index.union(downspikes.index)
- return mask_corrections_as_nan(series, mask)
+ return mask_corrections_no_comparison_value(series, mask)
def rule_offset_detection(
@@ -257,7 +281,7 @@ def rule_offset_detection(
Parameters
----------
series : pd.Series
- timeseries in which to look for offset errors
+ time series in which to look for offset errors
threshold : float, optional
minimum jump to consider as offset error, by default 0.35
updown_diff : float, optional
@@ -278,7 +302,7 @@ def rule_offset_detection(
Returns
-------
corrections: pd.Series
- a series with same index as the input timeseries containing
+ a series with same index as the input time series containing
corrections. Suspect values are set to np.nan.
"""
verbose = False
@@ -356,11 +380,19 @@ def rule_offset_detection(
]
periods = [jump_df.index[0], series.index[-1]]
- corrections = pd.Series(
- index=series.index, data=np.zeros(series.index.size), fastpath=True
+ # manually compute corrections dataframe
+ corrections = pd.DataFrame(
+ index=series.index,
+ data={
+ "correction_code": np.zeros(series.size, dtype=float),
+ "series_values": np.full(series.size, np.nan),
+ "comparison_values": np.full(series.size, np.nan),
+ },
)
for j in range(0, len(periods), 2):
- corrections.loc[periods[j] : periods[j + 1] - pd.Timedelta(seconds=30)] = np.nan
+ corrections.loc[
+ periods[j] : periods[j + 1] - pd.Timedelta(seconds=30), "correction_code"
+ ] = 99
if return_df:
return corrections, df, jump_df
else:
@@ -368,27 +400,34 @@ def rule_offset_detection(
def rule_outside_n_sigma(series, n=2.0):
- """Detection rule, set values outside of n * standard deviation to NaN
+ """Detection rule, set values outside of n * standard deviation to NaN.
Parameters
----------
series : pd.Series
- timeseries in which suspect values are identified
+ time series in which suspect values are identified
n : float, optional
number of standard deviations to use, by default 2
Returns
-------
corrections: pd.Series
- a series with same index as the input timeseries containing
+ a series with same index as the input time series containing
corrections. Suspect values are set to np.nan.
"""
-
- mask = (series > series.mean() + n * series.std()) | (
- series < series.mean() - n * series.std()
+ threshold_above = series.mean() + n * series.std()
+ mask_above = series > threshold_above
+ threshold_below = series.mean() - n * series.std()
+ mask_below = series < threshold_below
+
+ return mask_corrections_above_below(
+ series,
+ mask_above,
+ threshold_above,
+ mask_below,
+ threshold_below,
)
- return mask_corrections_as_nan(series, mask)
def rule_diff_outside_of_n_sigma(series, n=2.0, max_gap="7D"):
@@ -400,7 +439,7 @@ def rule_diff_outside_of_n_sigma(series, n=2.0, max_gap="7D"):
Parameters
----------
series : pd.Series
- timeseries in which suspect values are identified
+ time series in which suspect values are identified
n : float, optional
number of standard deviations to use, by default 2
max_gap : str, optional
@@ -410,15 +449,14 @@ def rule_diff_outside_of_n_sigma(series, n=2.0, max_gap="7D"):
Returns
-------
corrections: pd.Series
- a series with same index as the input timeseries containing
+ a series with same index as the input time series containing
corrections. Suspect values are set to np.nan.
"""
-
# identify gaps and set diff value after gap to nan
diff = diff_with_gap_awareness(series, max_gap=max_gap)
nsigma = n * diff.std()
- mask = (diff.abs() - diff.mean()) > nsigma
- return mask_corrections_as_nan(series, mask)
+ mask = diff.abs() > nsigma
+ return mask_corrections_above_threshold(diff, nsigma, mask)
def rule_outside_bandwidth(series, lowerbound, upperbound):
@@ -427,18 +465,18 @@ def rule_outside_bandwidth(series, lowerbound, upperbound):
Parameters
----------
series : pd.Series
- timeseries in which suspect values are identified
+ time series in which suspect values are identified
lowerbound : pd.Series
- timeseries containing the lower bound, if bound values are less
+ time series containing the lower bound, if bound values are less
frequent than series, bound is interpolated to series.index
upperbound : pd.Series
- timeseries containing the upper bound, if bound values are less
+ time series containing the upper bound, if bound values are less
frequent than series, bound is interpolated to series.index
Returns
-------
corrections : pd.Series
- a series with same index as the input timeseries containing
+ a series with same index as the input time series containing
corrections. Suspect values are set to np.nan.
"""
if series.index.symmetric_difference(lowerbound.index).size > 0:
@@ -446,8 +484,11 @@ def rule_outside_bandwidth(series, lowerbound, upperbound):
if series.index.symmetric_difference(upperbound.index).size > 0:
upperbound = interpolate_series_to_new_index(upperbound, series.index)
- mask = (series > upperbound) | (series < lowerbound)
- return mask_corrections_as_nan(series, mask)
+ mask_above = series > upperbound
+ mask_below = series < lowerbound
+ return mask_corrections_above_below(
+ series, mask_above, upperbound, mask_below, lowerbound
+ )
def rule_pastas_outside_pi(
@@ -463,15 +504,15 @@ def rule_pastas_outside_pi(
):
"""Detection rule, flag values based on pastas model prediction interval.
- Flag suspect outside prediction interval calculated by pastas timeseries
+ Flag suspect outside prediction interval calculated by pastas time series
model. Uses a pastas.Model and a confidence interval as input.
Parameters
----------
series : pd.Series
- timeseries to identify suspect observations in
+ time series to identify suspect observations in
ml : pastas.Model
- timeseries model for series
+ time series model for series
ci : float, optional
confidence interval for calculating bandwidth, by default 0.95.
Higher confidence interval means that bandwidth is wider and more
@@ -495,34 +536,30 @@ def rule_pastas_outside_pi(
Returns
-------
corrections : pd.Series
- a series with same index as the input timeseries containing
+ a series with same index as the input time series containing
corrections. Suspect values are set to np.nan.
"""
# no model
if ml is None:
if verbose:
print("Warning: No Pastas model found!")
- corrections = mask_corrections_as_nan(
- series, pd.Series(index=series.index, data=False)
- )
- corrections.name = "sim"
- # no fit
- elif ml.fit is None:
+ corrections = get_empty_corrections_df(series)
+ corrections.columns = ["sim", "series_values", "comparison_values"]
+ # no solver
+ elif ml.solver is None:
if verbose:
- print("Warning: Pastas model fit attribute is None!")
- corrections = mask_corrections_as_nan(
- series, pd.Series(index=series.index, data=False)
- )
- corrections.name = "sim"
+ print("Warning: Model has no attribute solver!")
+ corrections = get_empty_corrections_df(series)
+ corrections.columns = ["sim", "series_values", "comparison_values"]
# calculate pi
else:
- if tmin is not None:
- ml.settings["tmin"] = tmin
- if tmax is not None:
- ml.settings["tmax"] = tmax
+ if tmin is None:
+ tmin = series.first_valid_index()
+ if tmax is None:
+ tmax = series.last_valid_index()
# calculate prediction interval
- pi = ml.fit.prediction_interval(alpha=(1 - ci))
+ pi = ml.solver.prediction_interval(alpha=(1 - ci), tmin=tmin, tmax=tmax)
# prediction interval empty
if pi.empty:
@@ -531,10 +568,8 @@ def rule_pastas_outside_pi(
"Warning: calculated prediction interval with "
"Pastas model is empty!"
)
- corrections = mask_corrections_as_nan(
- series, pd.Series(index=series.index, data=False)
- )
- corrections.name = "sim"
+ corrections = get_empty_corrections_df(series)
+ corrections.columns = ["sim", "series_values", "comparison_values"]
else:
lower = pi.iloc[:, 0]
upper = pi.iloc[:, 1]
@@ -553,9 +588,15 @@ def rule_pastas_outside_pi(
lower = lower - min_ci / 2.0
corrections = rule_outside_bandwidth(series, lower, upper)
- corrections.name = "sim (r^2={0:.3f})".format(ml.stats.rsq())
+ corrections.columns = [
+ "correction_code",
+ "series_values",
+ "comparison_values",
+ ]
+ corrections.index.name = f"sim (r^2={ml.stats.rsq():.3f})"
if savedir:
+ savedir.mkdir(exist_ok=True)
pi.to_pickle(os.path.join(savedir, f"pi_{ml.name}.pkl"))
return corrections
@@ -567,80 +608,117 @@ def rule_pastas_percentile_pi(
if ml is None:
if verbose:
print("Warning: No Pastas model found!")
- corrections = mask_corrections_as_nan(
- series, pd.Series(index=series.index, data=False)
- )
- corrections.name = "sim"
- # no fit
- elif ml.fit is None:
+ corrections = get_empty_corrections_df(series)
+ corrections.columns = ["sim", "series_values", "comparison_values"]
+ # no solver
+ elif ml.solver is None:
if verbose:
- print("Warning: Pastas model fit attribute is None!")
- corrections = mask_corrections_as_nan(
- series, pd.Series(index=series.index, data=False)
- )
- corrections.name = "sim"
+ print("Warning: Model has no solver attribute!")
+ corrections = get_empty_corrections_df(series)
+ corrections.columns = ["sim", "series_values", "comparison_values"]
# calculate realizations
# TODO: work in progress
-def rule_keep_comments(series, keep_comments, comment_series, other_series):
- """Filter rule, modify timeseries to keep data with certain comments.
+def rule_keep_comments(series, keep_comments, comment_series):
+ """Filter rule, modify time series to keep data with certain comments.
- This rule was invented to extract timeseries only containing certain
+ This rule was invented to extract time series only containing certain
types of errors, based on labeled data. For example, to get only erroneous
observations caused by sensors above the groundwater level:
- - series: the raw timeseries
+ - series: the raw time series
- keep_comments: list of comments to keep, e.g. ['dry sensor']
- - comment_series: timeseries containing the comments for erroneous obs
- - other_series: the validated timeseries where the commented observations
- were removed (set to NaN).
+ - comment_series: time series containing the comments for erroneous obs
Parameters
----------
series : pd.Series
- timeseries to filter
+ time series to filter
keep_comments : list of str
list of comments to keep
comment_series : pd.Series
- timeseries containing comments, should have same index as series
- other_series : pd.Series
- timeseries containing corrected/adjusted values corresponding
- to the commmented entries.
+ time series containing comments, should have same index as series
Returns
-------
- corrections : pd.Series
- timeseries containing NaN values where comment is in keep_comments
+ corrections : pd.DataFrame
+ dataframe containing correction code 99 where comment is in keep_comments
and 0 otherwise.
"""
- new_series = series.copy()
- for c in keep_comments:
- mask = comment_series.str.startswith(c)
- new_series.where(mask, other=other_series, inplace=True)
+ c = get_empty_corrections_df(series)
+ c["comparison_values"] = ""
+ for comment in keep_comments:
+ mask = comment_series.str.contains(comment)
+ c.loc[mask, "correction_code"] = CorrectionCode.UNKNOWN_COMPARISON_VALUE
+ c.loc[mask, "series_values"] = series.loc[mask]
+ c.loc[mask, "comparison_values"] = comment
- corrections = new_series - series
- corrections.name = "_".join(keep_comments)
+ return c
- return corrections
+
+def rule_compare_to_manual_obs(
+ series, manual_obs, threshold=0.05, method="linear", max_dt="1D"
+):
+ # check if time between manual obs and sensor obs
+ # are further apart than max_dt:
+ nearest = series.index.get_indexer(manual_obs.index, method="nearest")
+ mask = np.abs((series.index[nearest] - manual_obs.index).total_seconds()) <= (
+ pd.Timedelta(max_dt) / pd.Timedelta("1s")
+ )
+
+ # interpolate raw obs to manual obs times
+ s_obs = (
+ series.reindex(series.index.join(manual_obs.index, how="outer"))
+ .interpolate(method="time")
+ .loc[manual_obs.index]
+ )
+
+ # calculate diff (manual - sensor, i.e. positive value means
+ # manual observation is higher)
+ diff = -(s_obs - manual_obs)
+
+ # use only diff where mask is True (= time between obs < max_dt)
+ diff = diff.loc[mask]
+
+ # interpolate w/ method
+ if method == "linear":
+ diff_full_index = (
+ diff.reindex(series.index.join(diff.index, how="outer"), method=None)
+ .interpolate(method="linear")
+ .fillna(0.0)
+ )
+ else:
+ diff_full_index = diff.reindex(series.index, method=method).fillna(0.0)
+
+ mask_above = diff_full_index.loc[series.index] > threshold
+ mask_below = diff_full_index.loc[series.index] < -threshold
+
+ return mask_corrections_above_below(
+ diff_full_index.loc[series.index],
+ mask_above,
+ threshold,
+ mask_below,
+ -threshold,
+ )
def rule_shift_to_manual_obs(
series, hseries, method="linear", max_dt="1D", reset_dates=None
):
- """Adjustment rule, for shifting timeseries onto manual observations.
+ """Adjustment rule, for shifting time series onto manual observations.
- Used for shifting timeseries based on sensor observations onto manual
+ Used for shifting time series based on sensor observations onto manual
verification measurements. By default uses linear interpolation between
two manual verification observations.
Parameters
----------
series : pd.Series
- timeseries to adjust
+ time series to adjust
hseries : pd.Series
- timeseries containing manual observations
+ time series containing manual observations
method : str, optional
method to use for interpolating between two manual observations,
by default "linear". Other options are those that are accepted by
@@ -656,15 +734,14 @@ def rule_shift_to_manual_obs(
Returns
-------
adjusted_series : pd.Series
- timeseries containing adjustments to shift series onto manual
+ time series containing adjustments to shift series onto manual
observations.
"""
# check if time between manual obs and sensor obs
# are further apart than max_dt:
- # nearest = hseries.index.map(lambda t: series.index[series.index.get_indexer([t], method="nearest")])
nearest = series.index.get_indexer(hseries.index, method="nearest")
mask = np.abs((series.index[nearest] - hseries.index).total_seconds()) <= (
- pd.Timedelta(max_dt) / pd.Timedelta("1S")
+ pd.Timedelta(max_dt) / pd.Timedelta("1s")
)
# interpolate raw obs to manual obs times
@@ -674,8 +751,9 @@ def rule_shift_to_manual_obs(
.loc[hseries.index]
)
- # calculate diff
- diff = s_obs - hseries
+ # calculate diff (manual - sensor, i.e. positive value means
+ # manual observation is higher)
+ diff = -(s_obs - hseries)
# use only diff where mask is True (= time between obs < max_dt)
diff = diff.loc[mask]
@@ -694,13 +772,13 @@ def rule_shift_to_manual_obs(
else:
diff_full_index = diff.reindex(series.index, method=method).fillna(0.0)
- adjusted_series = series - diff_full_index
+ adjusted_series = series + diff_full_index
return adjusted_series
def rule_combine_nan_or(*args):
- """Combination rule, combine NaN values for any number of timeseries.
+ """Combination rule, combine NaN values for any number of time series.
Used for combining intermediate results in branching algorithm trees to
create one final result, i.e. (s1.isna() OR s2.isna())
@@ -708,7 +786,7 @@ def rule_combine_nan_or(*args):
Returns
-------
corrections : pd.Series
- a series with same index as the input timeseries containing
+ a series with same index as the input time series containing
corrections. Contains NaNs where any of the input series
values is NaN.
"""
@@ -720,8 +798,28 @@ def rule_combine_nan_or(*args):
return result
+def rule_combine_corrections_or(*args):
+ """Combination rule, combine corrections for any number of time series.
+
+ Used for combining intermediate results in branching algorithm trees to
+ create one final result, i.e. (corr_s1 OR corr_s2)
+
+ Returns
+ -------
+ corrections : pd.DataFrame
+ a dataframe with same index as the input time series containing
+ corrections. Contains corrections where any of the input series
+ values contain corrections.
+ """
+ for i, series in enumerate(args):
+ if i == 0:
+ c = get_empty_corrections_df(series)
+ c.loc[series["correction_code"] != 0, "correction_code"] = 99
+ return c
+
+
def rule_combine_nan_and(*args):
- """Combination rule, combine NaN values for any number of timeseries.
+ """Combination rule, combine NaN values for any number of time series.
Used for combining intermediate results in branching algorithm trees to
create one final result, i.e. (s1.isna() AND s2.isna())
@@ -729,7 +827,7 @@ def rule_combine_nan_and(*args):
Returns
-------
corrections : pd.Series
- a series with same index as the input timeseries containing
+ a series with same index as the input time series containing
corrections. Contains NaNs where any of the input series
values is NaN.
"""
@@ -743,6 +841,29 @@ def rule_combine_nan_and(*args):
return result
+def rule_combine_corrections_and(*args):
+ """Combination rule, combine corrections for any number of time series.
+
+ Used for combining intermediate results in branching algorithm trees to
+ create one final result, i.e. (corr_s1 AND corr_s2)
+
+ Returns
+ -------
+ corrections : pd.DataFrame
+ a dataframe with same index as the input time series containing
+ corrections. Contains corrections where all of the input series
+ values contain corrections.
+ """
+ for i, series in enumerate(args):
+ if i == 0:
+ mask = series["correction_code"] != 0
+ else:
+ mask = mask & (series["correction_code"] != 0)
+ c = get_empty_corrections_df(args[0])
+ c.loc[mask, "correction_code"] = 99
+ return c
+
+
def rule_flat_signal(
series,
window,
@@ -762,7 +883,7 @@ def rule_flat_signal(
Parameters
----------
series : pd.Series
- timeseries to analyse
+ time series to analyse
window : int
number of days in window
min_obs : int
@@ -779,16 +900,16 @@ def rule_flat_signal(
limit. Only search for flat signals above this limit.
By default None.
hbelow : float, optional
- absolute value in units of timeseries signifying an upper limit.
+ absolute value in units of time series signifying an upper limit.
Only search for flat signals below this limit. By default None.
habove : float, optional
- absolute value in units of timeseries signifying a lower limit.
+ absolute value in units of time series signifying a lower limit.
Only search for flat signals above this limit. By default None.
Returns
-------
corrections : pd.Series
- a series with same index as the input timeseries containing
+ a series with same index as the input time series containing
corrections. Contains NaNs where the signal is considered flat
or dead.
"""
@@ -817,4 +938,4 @@ def rule_flat_signal(
mask = stdmask & quantilemask & levelmask
mask = mask.reindex(series.index, fill_value=False)
- return mask_corrections_as_nan(series, mask)
+ return mask_corrections_no_comparison_value(series, mask)
diff --git a/traval/ruleset.py b/traval/ruleset.py
index 6839c90..d4e766b 100755
--- a/traval/ruleset.py
+++ b/traval/ruleset.py
@@ -12,6 +12,8 @@
class RuleSetEncoder(json.JSONEncoder):
+ """Encode values in RuleSet to JSON."""
+
def default(self, o):
if callable(o):
return "func:" + o.__name__
@@ -39,7 +41,8 @@ def ruleset_hook(obj):
val = getattr(rulelib, funcname)
except AttributeError:
warnings.warn(
- f"Could not load function {funcname} " "from `traval.rulelib`!"
+ f"Could not load function {funcname} " "from `traval.rulelib`!",
+ stacklevel=1,
)
val = funcname
obj[key] = val
@@ -49,7 +52,9 @@ def ruleset_hook(obj):
try:
val = getattr(np, funcname)
except AttributeError:
- warnings.warn(f"Could not load function {funcname} " "from `numpy`!")
+ warnings.warn(
+ f"Could not load function {funcname} " "from `numpy`!", stacklevel=1
+ )
val = (funcname,)
obj[key] = (val,)
elif str(value).startswith("series:"):
@@ -83,7 +88,7 @@ class RuleSet:
The RuleSet object stores detection rules and other relevant information
in a dictionary. The order in which rules are carried out, the functions
- that parse the timeseries, the extra arguments required by those functions
+ that parse the time series, the extra arguments required by those functions
are all stored together.
The detection functions must take a series as the first argument, and
@@ -103,7 +108,6 @@ class RuleSet:
Examples
--------
-
Given two detection functions 'foo' and 'bar':
>>> rset = RuleSet(name="foobar")
@@ -144,21 +148,20 @@ def __call__(self, series):
Parameters
----------
series : pandas.Series or pandas.DataFrame
- timeseries to apply rules to
+ time series to apply rules to
Returns
-------
d : OrderedDict
- Dictionary containing resulting timeseries after applying rules.
+ Dictionary containing resulting time series after applying rules.
Keys represent step numbers (0 is the original series, 1 the
outcome of rule #1, etc.)
c : OrderedDict
- Dictionary containing corrections to timeseries based on rules
+ Dictionary containing corrections to time series based on rules
Keys represent step numbers (1 contains the corrections based on
rule #1, etc.). When no correction is available, step contains
the value 0.
"""
-
return self._applyself(series)
def add_rule(self, name, func, apply_to=None, kwargs=None):
@@ -272,7 +275,7 @@ def get_parameters(self, name=None):
@staticmethod
def _parse_kwargs(kwargs, name=None):
- """Internal method, parse keyword arguments dictionary.
+ """Internal method to parse keyword arguments dictionary.
Iterates over keys, values in kwargs dictionary. If value is callable,
calls value with 'name' as function argument. The result is stored
@@ -290,7 +293,7 @@ def _parse_kwargs(kwargs, name=None):
dict
dictionary of parsed arguments
"""
- new_args = dict()
+ new_args = {}
if kwargs is not None:
for k, v in kwargs.items():
if callable(v):
@@ -300,21 +303,21 @@ def _parse_kwargs(kwargs, name=None):
return new_args
def _applyself(self, series):
- """Internal method, apply ruleset to series.
+ """Internal method to apply ruleset to series.
Parameters
----------
series: pandas.Series or pandas.DataFrame
- timeseries to apply rules to
+ time series to apply rules to
Returns
-------
d: OrderedDict
- Dictionary containing resulting timeseries after applying rules.
+ Dictionary containing resulting time series after applying rules.
Keys represent step numbers (0 is the original series, 1 the
outcome of rule # 1, etc.)
c: OrderedDict
- Dictionary containing corrections to timeseries based on rules
+ Dictionary containing corrections to time series based on rules
Keys represent step numbers(1 contains the corrections based on
rule # 1, etc.). When no correction is available, step contains
the value 0.
@@ -329,8 +332,21 @@ def _applyself(self, series):
arg_dict = self._parse_kwargs(irule["kwargs"], name)
corr = irule["func"](d[int(irule["apply_to"])], **arg_dict)
# store both correction and result
- d[i] = d[int(irule["apply_to"])] + corr
- c[i] = corr.loc[corr != 0.0].copy()
+ # support correction code based corrections
+ if isinstance(corr, pd.DataFrame) and "correction_code" in corr.columns:
+ d[i] = d[int(irule["apply_to"])].where(
+ corr["correction_code"] == 0, np.nan
+ )
+ c[i] = corr.loc[corr["correction_code"] != 0.0].copy()
+ elif isinstance(corr, pd.Series):
+ # support nan-based corrections
+ d[i] = d[int(irule["apply_to"])] + corr
+ c[i] = corr.loc[corr != 0.0]
+ else:
+ raise TypeError(
+ "Corrections computed by rules must be pd.Series containing "
+ "NaNs or DataFrame containing a column named 'correction_code'."
+ )
# if apply_to is tuple, collect series as kwargs to func
elif isinstance(irule["apply_to"], tuple):
# collect results
@@ -383,7 +399,7 @@ def to_pickle(self, fname, verbose=True):
verbose : bool, optional
prints message when operation complete, default is True
- See also
+ See Also
--------
from_pickle : load RuleSet from pickle file
to_json : store RuleSet as json file (does not support custom functions)
@@ -412,7 +428,7 @@ def from_pickle(cls, fname):
RuleSet
RuleSet object, including custom functions and parameters
- See also
+ See Also
--------
to_pickle : store RuleSet as pickle (supports custom functions)
to_json : store RuleSet as json file (does not support custom functions)
@@ -443,7 +459,7 @@ def to_json(self, fname=None, verbose=True):
prints message when operation complete, default is True
- See also
+ See Also
--------
from_json : load RuleSet from json file
to_pickle : store RuleSet as pickle (supports custom functions)
@@ -453,7 +469,7 @@ def to_json(self, fname=None, verbose=True):
"Custom functions will not be preserved when storing "
"RuleSet as JSON file!"
)
- warnings.warn(msg)
+ warnings.warn(msg, stacklevel=1)
rules = deepcopy(self.rules)
rules["name"] = self.name
if fname is not None:
@@ -486,7 +502,7 @@ def from_json(cls, fname):
RuleSet:
RuleSet object
- See also
+ See Also
--------
to_json : store RuleSet as JSON file (does not support custom functions)
to_pickle : store RuleSet as pickle (supports custom functions)
@@ -520,9 +536,9 @@ def get_resolved_ruleset(self, name):
new_ruleset = deepcopy(self.rules)
for rule in new_ruleset.values():
rule["kwargs"] = self._parse_kwargs(rule["kwargs"], name=name)
-
+
# create new object with resolved parameters
rset = RuleSet(name)
rset.rules = new_ruleset
-
+
return rset
diff --git a/traval/ts_comparison.py b/traval/ts_comparison.py
index 9a4689f..6c5a833 100755
--- a/traval/ts_comparison.py
+++ b/traval/ts_comparison.py
@@ -53,7 +53,7 @@ def idx_in_idx2(self):
class SeriesComparison:
- """Object for comparing two timeseries.
+ """Object for comparing two time series.
Comparison yields the following categories:
@@ -77,7 +77,7 @@ class SeriesComparison:
"""
def __init__(self, s1, s2, names=None, diff_threshold=0.0):
- """Compare two timeseries.
+ """Compare two time series.
Parameters
----------
@@ -86,7 +86,7 @@ def __init__(self, s1, s2, names=None, diff_threshold=0.0):
s2 : pd.Series or pd.DataFrame
second series to compare
names : list of str, optional
- list of names of timeseries, by default None, which
+ list of names of time series, by default None, which
uses series name, or dataframe column name
diff_threshold : float, optional
value beyond which a difference is considered significant, by
@@ -127,7 +127,7 @@ def __init__(self, s1, s2, names=None, diff_threshold=0.0):
@staticmethod
def _parse_series(series):
- """Internal method to parse timeseries input.
+ """Internal method to parse time series input.
Parameters
----------
@@ -138,7 +138,7 @@ def _parse_series(series):
Returns
-------
series, comments : pd.Series, pd.Series
- returns timeseries and comment series. Comment series is empty
+ returns time series and comment series. Comment series is empty
series if no comments are included in input
Raises
@@ -248,7 +248,6 @@ def compare_by_comment(self):
ValueError
if no comment series is found
"""
-
if self.c2n.empty:
raise ValueError("No comment series!")
@@ -339,64 +338,63 @@ def _check_idx_comparison(self, return_missing=False):
class SeriesComparisonRelative(SeriesComparison):
- """Object for comparing two timeseries relative to a third timeseries.
+ """Object for comparing two time series relative to a third time series.
Extends the SeriesComparison object to include a comparison between
- two timeseries and a third base timeseries. This is used for example, when
+ two time series and a third base time series. This is used for example, when
comparing the results of two error detection outcomes to the original
- raw timeseries.
+ raw time series.
Comparison yields both the results from SeriesComparison as well as the
- following categories for the relative comparison to the base timeseries:
+ following categories for the relative comparison to the base time series:
- - kept_in_both: both timeseries and the base timeseries contain values
+ - kept_in_both: both time series and the base time series contain values
- flagged_in_s1: value is NaN/missing in series #1
- flagged_in_s2: value is NaN/missing in series #2
- flagged_in_both: value is NaN/missing in both series #1 and series #2
- - in_all_nan: value is NaN in all timeseries (series #1, #2 and base)
+ - in_all_nan: value is NaN in all time series (series #1, #2 and base)
- introduced_in_s1: value is NaN/missing in base but has value in series #1
- introduced_in_s2: value is NaN/missing in base but has value in series #2
- introduced_in_both: value is NaN/missing in base but has value in both
- timeseries
+ time series
Parameters
----------
s1 : pd.Series or pd.DataFrame
first series to compare
truth : pd.Series or pd.DataFrame
- second series to compare, if a "truth" timeseries is available
- pass it as the second timeseries. Stored in object as 's2'.
+ second series to compare, if a "truth" time series is available
+ pass it as the second time series. Stored in object as 's2'.
base : pd.Series or pd.DataFrame
- timeseries to compare other two series with
+ time series to compare other two series with
diff_threshold : float, optional
value beyond which a difference is considered significant, by
default 0.0. Two values whose difference is smaller than threshold
are considered identical.
- See also
+ See Also
--------
- SeriesComparison : Comparison of two timeseries relative to each other
+ SeriesComparison : Comparison of two time series relative to each other
"""
def __init__(self, s1, truth, base, diff_threshold=0.0):
- """Compare two timeseries relative to a base timeseries.
+ """Compare two time series relative to a base time series.
Parameters
----------
s1 : pd.Series or pd.DataFrame
first series to compare
truth : pd.Series or pd.DataFrame
- second series to compare, if a "truth" timeseries is available
- pass it as the second timeseries. Stored in object as 's2'.
+ second series to compare, if a "truth" time series is available
+ pass it as the second time series. Stored in object as 's2'.
base : pd.Series or pd.DataFrame
- timeseries to compare other two series with
+ time series to compare other two series with
diff_threshold : float, optional
value beyond which a difference is considered significant, by
default 0.0. Two values whose difference is smaller than threshold
are considered identical.
"""
-
# Do the original comparison between s1 and s2
super().__init__(s1, truth, diff_threshold=diff_threshold)
@@ -417,9 +415,8 @@ def __init__(self, s1, truth, base, diff_threshold=0.0):
self.bc = BinaryClassifier.from_series_comparison_relative(self)
def _compare_series_to_base(self):
- """Internal method for comparing two timseries to base timeseries."""
-
- # where Nans in base timeseries
+ """Internal method for comparing two time series to base time series."""
+ # where Nans in base time series
nanmask = self.basen.isna()
# prepare some indices
@@ -439,7 +436,7 @@ def _compare_series_to_base(self):
self.idx_r_in_all_nan = self.basen.loc[nanmask].index.difference(s1s2_union)
# self.idx_r_in_all_nan = self.basen.loc[nanmask].index.intersection(
# self.idx_in_both_nan) # only where all are NaN
- # counts for both NaNs and missing in base timeseries
+ # counts for both NaNs and missing in base time series
self.idx_r_introduced_in_s1 = (
self.basen.loc[nanmask]
.index.intersection(only_in_s1)
@@ -457,13 +454,13 @@ def _compare_series_to_base(self):
)
def _summarize_comparison_to_base(self):
- """Internal method for summarizing comparison with base timeseries.
+ """Internal method for summarizing comparison with base time series.
Returns
-------
summary : pandas.Series
Series summarizing the series comparison relative to base
- timeseries, containing counts per category
+ time series, containing counts per category
"""
categories = [
"kept_in_both",
@@ -496,7 +493,6 @@ def compare_to_base_by_comment(self):
ValueError
if no comment series is available.
"""
-
if self.c2n.empty:
raise ValueError("No comment series!")
diff --git a/traval/ts_utils.py b/traval/ts_utils.py
index abf26fb..9243182 100644
--- a/traval/ts_utils.py
+++ b/traval/ts_utils.py
@@ -1,45 +1,247 @@
+from enum import IntEnum
+
import numpy as np
import pandas as pd
-def mask_corrections_as_nan(series, mask):
- """Get corrections series with NaNs where mask == True.
+class CorrectionCode(IntEnum):
+ """Codes and labels for labeling error detection results."""
+
+ NO_CORRECTION = 0
+ BELOW_THRESHOLD = -2
+ NOT_EQUAL_VALUE = -1
+ EQUAL_VALUE = 1
+ ABOVE_THRESHOLD = 2
+ MODIFIED_VALUE = 4
+ UNKNOWN_COMPARISON_VALUE = 99
+
+
+def get_empty_corrections_df(series):
+ """Get an empty corrections dataframe indexed like series.
+
+ Parameters
+ ----------
+ series : pd.Series
+ time series to apply corrections to
+ """
+ c = pd.DataFrame(
+ index=series.index,
+ data={
+ "correction_code": CorrectionCode.NO_CORRECTION,
+ "series_values": np.full(series.size, np.nan),
+ "comparison_values": np.full(series.size, np.nan),
+ },
+ )
+ return c
+
+
+def _mask_corrections(series, values, mask, correction_code):
+ c = get_empty_corrections_df(series)
+ c.loc[mask, "series_values"] = series
+ if values is not None:
+ if isinstance(values, pd.Series):
+ c.loc[mask, "comparison_values"] = values.loc[mask]
+ else:
+ c.loc[mask, "comparison_values"] = values
+ c.loc[mask, "correction_code"] = correction_code
+ return c
+
+
+def mask_corrections_above_below(
+ series,
+ mask_above,
+ threshold_above,
+ mask_below,
+ threshold_below,
+):
+ """Get corrections where above or below thresholds.
+
+ Parameters
+ ----------
+ series : pd.Series
+ time series to apply corrections to
+ threshold_above : pd.Series
+ time series with values to compare with
+ mask_above : DateTimeIndex or boolean np.array
+ DateTimeIndex containing timestamps where value should be set to NaN,
+ or boolean array with same length as series set to True where
+ value should be set to NaN. (Uses pandas .loc[mask] to set values.)
+ threshold_below : pd.Series
+ time series with values to compare with
+ mask_below : DateTimeIndex or boolean np.array
+ DateTimeIndex containing timestamps where value should be set to NaN,
+ or boolean array with same length as series set to True where
+ value should be set to NaN. (Uses pandas .loc[mask] to set values.)
+ """
+ c_above = mask_corrections_above_threshold(series, threshold_above, mask_above)
+ c_below = mask_corrections_below_threshold(series, threshold_below, mask_below)
+ return c_above.add(c_below, fill_value=0)
+
+
+def mask_corrections_above_threshold(series, threshold, mask):
+ """Get corrections where above threshold.
Parameters
----------
series : pd.Series
- timeseries to provide corrections for
+ time series to apply corrections to
+ threshold : pd.Series
+ time series with values to compare with
mask : DateTimeIndex or boolean np.array
DateTimeIndex containing timestamps where value should be set to NaN,
or boolean array with same length as series set to True where
value should be set to NaN. (Uses pandas .loc[mask] to set values.)
+ """
+ return _mask_corrections(series, threshold, mask, CorrectionCode.ABOVE_THRESHOLD)
+
+
+def mask_corrections_below_threshold(series, threshold, mask):
+ """Get corrections where below threshold.
+
+ Parameters
+ ----------
+ series : pd.Series
+ time series to apply corrections to
+ threshold : pd.Series
+ time series with values to compare with
+ mask : DateTimeIndex or boolean np.array
+ DateTimeIndex containing timestamps where value should be set to NaN,
+ or boolean array with same length as series set to True where
+ value should be set to NaN. (Uses pandas .loc[mask] to set values.)
+ """
+ return _mask_corrections(series, threshold, mask, CorrectionCode.BELOW_THRESHOLD)
+
+
+def mask_corrections_equal_value(series, values, mask):
+ """Get corrections where equal to value.
+
+ Parameters
+ ----------
+ series : pd.Series
+ time series to apply corrections to
+ values : pd.Series
+ time series with values to compare with
+ mask : DateTimeIndex or boolean np.array
+ DateTimeIndex containing timestamps where value should be set to NaN,
+ or boolean array with same length as series set to True where
+ value should be set to NaN. (Uses pandas .loc[mask] to set values.)
+ """
+ return _mask_corrections(series, values, mask, CorrectionCode.EQUAL_VALUE)
+
+
+def mask_corrections_modified_value(series, values, mask):
+ """Get corrections where value was modified.
+
+ Parameters
+ ----------
+ series : pd.Series
+ time series to apply corrections to
+ values : pd.Series
+ time series with values to compare with
+ mask : DateTimeIndex or boolean np.array
+ DateTimeIndex containing timestamps where value should be set to NaN,
+ or boolean array with same length as series set to True where
+ value should be set to NaN. (Uses pandas .loc[mask] to set values.)
+ """
+ return _mask_corrections(series, values, mask, CorrectionCode.MODIFIED_VALUE)
+
+
+def mask_corrections_not_equal_value(series, values, mask):
+ """Get corrections where not equal to value.
+
+ Parameters
+ ----------
+ series : pd.Series
+ time series to apply corrections to
+ values : pd.Series
+ time series with values to compare with
+ mask : DateTimeIndex or boolean np.array
+ DateTimeIndex containing timestamps where value should be set to NaN,
+ or boolean array with same length as series set to True where
+ value should be set to NaN. (Uses pandas .loc[mask] to set values.)
+ """
+ return _mask_corrections(series, values, mask, CorrectionCode.NOT_EQUAL_VALUE)
+
+
+def mask_corrections_no_comparison_value(series, mask):
+ """Get corrections where no comparison value is available.
+
+ Parameters
+ ----------
+ series : pd.Series
+ time series to apply corrections to
+ mask : DateTimeIndex or boolean np.array
+ DateTimeIndex containing timestamps where value should be set to NaN,
+ or boolean array with same length as series set to True where
+ value should be set to NaN. (Uses pandas .loc[mask] to set values.)
+ """
+ return _mask_corrections(
+ series, None, mask, CorrectionCode.UNKNOWN_COMPARISON_VALUE
+ )
+
+
+def corrections_as_nan(corrections):
+ """Convert correction code series to NaNs.
+
+ Excludes codes 0 and 4, which are used to indicate no correction and a modification
+ of the value, respectively.
+
+ Parameters
+ ----------
+ corrections : pd.Series or pd.DataFrame
+ series or dataframe with correction code
Returns
-------
c : pd.Series
- return corrections series
+ return corrections series with nans where value is corrected
"""
- c = pd.Series(
- index=series.index,
- data=np.zeros(series.index.size),
- fastpath=True,
- dtype=float,
+ if isinstance(corrections, pd.DataFrame):
+ corrections = corrections["correction_code"]
+ c = pd.Series(index=corrections.index, data=0.0)
+ # set values where correction code is neither 0 nor 4 to NaN (a correction
+ # was applied); both conditions must hold, so combine with `&` (`|` is always True)
+ c.loc[(corrections != 0) & (corrections != 4)] = np.nan
+ return c
+
+
+def corrections_as_float(corrections):
+ """Convert modified values in corrections to float differences.
+
+ Only rows with code 4 (modified value) are filled, with the comparison value minus
+ the series value. All other entries are 0.0.
+
+ Parameters
+ ----------
+ corrections : pd.DataFrame
+ dataframe with correction code and original + modified values
+
+ Returns
+ -------
+ c : pd.Series
+ return corrections series with floats where value is modified
+ """
+ c = pd.Series(index=corrections.index, data=0.0)
+ # set values where correction code is 4 to difference between original and modified
+ mask = corrections["correction_code"] == 4
+ c.loc[mask] = (
+ corrections.loc[mask, "comparison_values"]
+ - corrections.loc[mask, "series_values"]
)
- c.loc[mask] = np.nan
return c
def resample_short_series_to_long_series(short_series, long_series):
- """Resample a short timeseries to index from a longer timeseries.
+ """Resample a short time series to index from a longer time series.
First uses 'ffill' then 'bfill' to fill new series.
Parameters
----------
short_series : pd.Series
- short timeseries
+ short time series
long_series : pd.Series
- long timeseries
+ long time series
Returns
-------
@@ -55,17 +257,17 @@ def resample_short_series_to_long_series(short_series, long_series):
first_date_after = long_series.loc[mask].index[0]
new_series.loc[first_date_after] = short_series.iloc[i]
- new_series = new_series.fillna(method="ffill").fillna(method="bfill")
+ new_series = new_series.ffill().bfill()
return new_series
def diff_with_gap_awareness(series, max_gap="7D"):
- """Get diff of timeseries with a limit on gap between two values.
+ """Get diff of time series with a limit on gap between two values.
Parameters
----------
series : pd.Series
- timeseries to calculate diff for
+ time series to calculate diff for
max_gap : str, optional
maximum period between two observations for calculating diff, otherwise
set value to NaN, by default "7D"
@@ -73,7 +275,7 @@ def diff_with_gap_awareness(series, max_gap="7D"):
Returns
-------
diff : pd.Series
- timeseries with diff, with NaNs whenever two values are farther apart
+ time series with diff, with NaNs whenever two values are farther apart
than max_gap.
"""
diff = series.diff()
@@ -86,20 +288,20 @@ def diff_with_gap_awareness(series, max_gap="7D"):
def spike_finder(series, threshold=0.15, spike_tol=0.15, max_gap="7D"):
- """Find spikes in timeseries.
+ """Find spikes in time series.
- Spikes are sudden jumps in the value of a timeseries that last 1 timestep.
+ Spikes are sudden jumps in the value of a time series that last 1 timestep.
They can be both negative or positive.
Parameters
----------
series : pd.Series
- timeseries to find spikes in
+ time series to find spikes in
threshold : float, optional
the minimum size of the jump to qualify as a spike, by default 0.15
spike_tol : float, optional
- offset between value of timeseries before spike and after spike,
- by default 0.15. After a spike, the value of the timeseries is usually
+ offset between value of time series before spike and after spike,
+ by default 0.15. After a spike, the value of the time series is usually
close to but not identical to the value that preceded the spike. Use
this parameter to control how close the value has to be.
max_gap : str, optional
@@ -112,7 +314,6 @@ def spike_finder(series, threshold=0.15, spike_tol=0.15, max_gap="7D"):
pandas DateTimeIndex objects containing timestamps of upward and
downward spikes.
"""
-
# identify gaps and set diff value after gap to nan
diff = diff_with_gap_awareness(series, max_gap=max_gap)
@@ -140,7 +341,7 @@ def spike_finder(series, threshold=0.15, spike_tol=0.15, max_gap="7D"):
def bandwidth_moving_avg_n_sigma(series, window, n):
- """Calculate bandwidth around timeseries based moving average + n * std.
+ """Calculate bandwidth around time series based moving average + n * std.
Parameters
----------
@@ -165,7 +366,7 @@ def bandwidth_moving_avg_n_sigma(series, window, n):
def interpolate_series_to_new_index(series, new_index):
- """Interpolate timeseries to new DateTimeIndex.
+ """Interpolate time series to new DateTimeIndex.
Parameters
----------
@@ -183,7 +384,7 @@ def interpolate_series_to_new_index(series, new_index):
s_interp = np.interp(
new_index, series.index.asi8, series.values, left=np.nan, right=np.nan
)
- si = pd.Series(index=new_index, data=s_interp, dtype=float, fastpath=True)
+ si = pd.Series(index=new_index, data=s_interp, dtype=float)
return si
@@ -202,7 +403,6 @@ def unique_nans_in_series(series, *args):
mask : pd.Series
mask with value True where NaN is unique to series
"""
-
mask = series.isna()
for s in args:
@@ -214,30 +414,29 @@ def unique_nans_in_series(series, *args):
return mask
-def create_synthetic_raw_timeseries(raw_series, truth_series, comments):
- """Create synthetic raw timeseries.
+def create_synthetic_raw_time_series(raw_series, truth_series, comments):
+ """Create synthetic raw time series.
Updates 'truth_series' (where values are labelled with a comment)
with values from raw_series. Used for removing unlabeled changes between
- a raw and validated timeseries.
+ a raw and validated time series.
Parameters
----------
raw_series : pd.Series
- timeseries with raw data
+ time series with raw data
truth_series : pd.Series
- timeseries with validated data
+ time series with validated data
comments : pd.Series
- timeseries with comments. Index must be same as 'truth_series'.
+ time series with comments. Index must be same as 'truth_series'.
When value does not have a comment it must be an empty string: ''.
Returns
-------
s : pd.Series
- synthetic raw timeseries, same as truth_series but updated with
+ synthetic raw time series, same as truth_series but updated with
raw_series where value has been commented.
"""
-
if truth_series.index.symmetric_difference(comments.index).size > 0:
raise ValueError("'truth_series' and 'comments' must have same index!")
@@ -265,7 +464,7 @@ def shift_series_forward_backward(s, freqstr="1D"):
def smooth_upper_bound(b, smoothfreq="1D"):
smoother = shift_series_forward_backward(b, freqstr=smoothfreq)
smoother.iloc[:, 0] = smoother.iloc[:, 0].interpolate(method="linear")
- smoother.iloc[:, 2] = smoother.iloc[:, 1].interpolate(method="linear")
+ smoother.iloc[:, 2] = smoother.iloc[:, 2].interpolate(method="linear")
return smoother.max(axis=1).loc[smoother.iloc[:, 1].dropna().index]
@@ -274,3 +473,19 @@ def smooth_lower_bound(b, smoothfreq="1D"):
smoother.iloc[:, 0] = smoother.iloc[:, 0].interpolate(method="linear")
smoother.iloc[:, 2] = smoother.iloc[:, 2].interpolate(method="linear")
return smoother.min(axis=1).loc[smoother.iloc[:, 1].dropna().index]
+
+
+def get_correction_status_name(corrections):
+ """Get correction status name from correction codes.
+
+ Parameters
+ ----------
+ corrections : pd.DataFrame or pd.Series
+ dataframe or series containing corrections codes
+
+ Returns
+ -------
+ pd.DataFrame or pd.Series
+ dataframe or series filled with correction status name
+ """
+ return corrections.fillna(0).map(lambda c: CorrectionCode(c).name)
diff --git a/traval/version.py b/traval/version.py
index 3d26edf..3d18726 100644
--- a/traval/version.py
+++ b/traval/version.py
@@ -1 +1 @@
-__version__ = "0.4.1"
+__version__ = "0.5.0"