diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 55800ae..31e010f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -11,31 +11,60 @@ on: - master jobs: - build: + lint: + name: Lint + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: 3.9 + cache: "pip" + cache-dependency-path: pyproject.toml + + - name: Install Python dependencies + run: | + pip install . + + - name: ruff-lint + uses: chartboost/ruff-action@v1 + + - name: ruff-format + uses: chartboost/ruff-action@v1 + with: + args: "format --check" + + test: + needs: lint + name: Test runs-on: ubuntu-latest strategy: matrix: - python-version: [3.9, "3.10", "3.11"] + python-version: [3.9, "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + cache: "pip" + cache-dependency-path: pyproject.toml + - name: Install dependencies run: | python -m pip install --upgrade pip pip install -e .[ci] - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. - flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=80 --statistics + + - name: Run pytest run: | pytest + - name: Run codacy-coverage-reporter uses: codacy/codacy-coverage-reporter-action@master with: diff --git a/.readthedocs.yml b/.readthedocs.yml index eaf0e55..3a374e9 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -8,15 +8,12 @@ version: 2 build: os: ubuntu-22.04 tools: - python: "3.9" + python: "3.11" # Build documentation in the docs/ directory with Sphinx sphinx: configuration: docs/conf.py -# Optionally build your docs in additional formats such as PDF and ePub -formats: all - # Optionally set the version of Python and requirements required to build your docs python: install: diff --git a/docs/conf.py b/docs/conf.py index ec09b5c..833df79 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -13,15 +13,14 @@ import os import sys -sys.path.insert(0, os.path.abspath('.')) +sys.path.insert(0, os.path.abspath(".")) from traval import __version__ - # -- Project information ----------------------------------------------------- -project = 'traval' -copyright = '2021, Artesia' -author = 'Artesia' +project = "traval" +copyright = "2024, Artesia" +author = "Artesia" # The short X.Y version version = __version__ @@ -34,29 +33,29 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.napoleon', - 'sphinx.ext.doctest', - 'sphinx.ext.intersphinx', - 'sphinx.ext.todo', - 'sphinx.ext.coverage', - 'sphinx.ext.mathjax', - 'sphinx.ext.ifconfig', - 'sphinx.ext.viewcode', - 'IPython.sphinxext.ipython_console_highlighting', # lowercase didn't work - 'sphinx.ext.autosectionlabel', - 'nbsphinx', - 'nbsphinx_link' + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.napoleon", + "sphinx.ext.doctest", + "sphinx.ext.intersphinx", + "sphinx.ext.todo", + "sphinx.ext.coverage", + "sphinx.ext.mathjax", + "sphinx.ext.ifconfig", + "sphinx.ext.viewcode", + "IPython.sphinxext.ipython_console_highlighting", # lowercase didn't work + "sphinx.ext.autosectionlabel", + "nbsphinx", + "nbsphinx_link", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # -- Options for HTML output ------------------------------------------------- @@ -64,24 +63,24 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. 
-html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" html_theme_options = { - 'display_version': True, - 'prev_next_buttons_location': 'bottom', + "display_version": True, + "prev_next_buttons_location": "bottom", # 'style_external_links': False, # 'vcs_pageview_mode': '', # 'style_nav_header_background': 'white', # Toc options - 'collapse_navigation': False, - 'sticky_navigation': False, - 'navigation_depth': 4, - 'includehidden': True, - 'titles_only': False, + "collapse_navigation": False, + "sticky_navigation": False, + "navigation_depth": 4, + "includehidden": True, + "titles_only": False, "github_url": "https://github.com/ArtesiaWater/traval", } # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] diff --git a/docs/examples.rst b/docs/examples.rst index d10fa86..fc7aba7 100644 --- a/docs/examples.rst +++ b/docs/examples.rst @@ -3,8 +3,12 @@ Examples The following notebooks contain examples showcasing traval. -The first example shows how to apply the tools contained in traval to detect errors in a single timeseries. -The second example shows how the same can be done for a full dataset with lots of timeseries. +- The first example shows how to apply the tools contained in traval to detect errors in + a single time series. +- The second example shows how the same can be done for a full + dataset with lots of time series. +- The third notebook contains small examples for each of the error detection rules + contained in traval. .. 
toctree:: :maxdepth: 1 diff --git a/docs/examples/ex03_rules.nblink b/docs/examples/ex03_rules.nblink new file mode 100644 index 0000000..cb2b3cb --- /dev/null +++ b/docs/examples/ex03_rules.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../examples/notebooks/ex03_testing_rules.ipynb" +} diff --git a/docs/getting_started.rst b/docs/getting_started.rst index 25d695e..5b8ed3e 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -5,11 +5,8 @@ Getting Started Installation ------------ -To install traval, a working version of Python 3.7 or 3.8 has to be installed on -your computer. We recommend using the Anaconda Distribution with Python 3.7 as -it includes most of the python package dependencies and the Jupyter Notebook -software to run the notebooks. However, you are free to install any -Python distribution you want. +To install traval, a working version of Python 3.9 or higher has to be installed on +your computer. To install traval, use: @@ -71,10 +68,10 @@ Take a look at the ruleset by just typing `ruleset`: 1: rule1 0 -Next define a Detector object. This object is designed to store a timeseries +Next define a Detector object. This object is designed to store a time series and the intermediate and final results after applying an error detection -algorithm. Initialize the Detector object with some timeseries. In this example -we assume there is a timeseries called `raw_series`: +algorithm. Initialize the Detector object with some time series. In this example +we assume there is a time series called `raw_series`: .. code:: python @@ -82,7 +79,7 @@ we assume there is a timeseries called `raw_series`: detect = traval.Detector(raw_series) -Apply our first algorithm to the timeseries. +Apply our first algorithm to the time series. .. code:: python diff --git a/docs/index.rst b/docs/index.rst index 6d602ad..801d02e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -6,20 +6,20 @@ Welcome to traval's documentation! 
================================== -Python package for applying automatic error detection algorithms to timeseries. +Python package for applying automatic error detection algorithms to time series. This module is set up to provide tools for applying any error detection -algorithm to any timeseries. The module consists of three main components: +algorithm to any time series. The module consists of three main components: -- `Detector`: a data management object for storing timeseries and error detection results. +- `Detector`: a data management object for storing time series and error detection results. - `RuleSet`: the RuleSet object is a highly flexible object for defining error detection algorithms based on (user-defined) functions. -- `SeriesComparison*`: objects for comparing timeseries. These objects include plots for visualizing the comparisons. +- `SeriesComparison*`: objects for comparing time series. These objects include plots for visualizing the comparisons. The general workflow consists of the following steps: 1. Define error detection algorithm(s). -2. Load data, i.e. raw timeseries data and optionally timeseries representing the "truth" to see how well the algorithms perform. -3. Initialize Detector objects and apply algorithms to timeseries. +2. Load data, i.e. raw time series data and optionally time series representing the "truth" to see how well the algorithms perform. +3. Initialize Detector objects and apply algorithms to time series. 4. Store and analyze the results. For more detailed information and examples, please refer to the notebooks in diff --git a/docs/modules.rst b/docs/modules.rst index 85a1eaa..ae111b2 100644 --- a/docs/modules.rst +++ b/docs/modules.rst @@ -25,15 +25,15 @@ Rule Library :members: -Timeseries Comparison ---------------------- +Time Series Comparison +---------------------- .. automodule:: traval.ts_comparison :members: -Timeseries Utilities --------------------- +Time Series Utilities +--------------------- .. 
automodule:: traval.ts_utils :members: diff --git a/examples/notebooks/ex01_applying_automatic_error_detection_algorithms_to_a_timeseries.ipynb b/examples/notebooks/ex01_applying_automatic_error_detection_algorithms_to_a_timeseries.ipynb index fd40799..a3e0e6f 100644 --- a/examples/notebooks/ex01_applying_automatic_error_detection_algorithms_to_a_timeseries.ipynb +++ b/examples/notebooks/ex01_applying_automatic_error_detection_algorithms_to_a_timeseries.ipynb @@ -4,12 +4,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Example 1: Applying an automatic error detection algorithm to a timeseries\n", + "# Example 1: Applying an automatic error detection algorithm to a time series\n", "_Created by Davíd Brakenhoff, Artesia, May 2020_\n", "\n", "
\n", "\n", - "This notebook contains a simple example how to set up an automatic error detection algorithm based on a few simple rules and applies those rules to a groundwater timeseries.\n", + "This notebook contains a simple example how to set up an automatic error detection algorithm based on a few simple rules and applies those rules to a groundwater time series.\n", "\n", "First import the requisite packages:" ] @@ -21,6 +21,7 @@ "outputs": [], "source": [ "import os\n", + "\n", "import numpy as np\n", "import pandas as pd\n", "\n", @@ -86,7 +87,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -124,7 +125,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -138,7 +139,7 @@ " 4: combine (1, 2, 3)" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -159,7 +160,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -177,7 +178,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -186,14 +187,6 @@ "text": [ "RuleSet written to file: 'test.json'\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/david/Github/traval/traval/ruleset.py:436: UserWarning: Custom functions will not be preserved when storing RuleSet as JSON file!\n", - " warnings.warn(msg)\n" - ] } ], "source": [ @@ -210,7 +203,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -229,7 +222,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -238,7 +231,7 @@ "Detector: " ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -257,7 +250,7 @@ }, { "cell_type": "code", - "execution_count": 
10, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -273,19 +266,17 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ - "
" + "
" ] }, - "metadata": { - "needs_background": "light" - }, + "metadata": {}, "output_type": "display_data" } ], @@ -302,7 +293,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -396,7 +387,7 @@ "2012-09-24 19:00:00 29.6158 29.6158 29.6158 29.6158 29.6158" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -415,7 +406,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -447,179 +438,179 @@ " \n", " \n", " \n", - " 2012-09-24 15:00:00\n", - " 0.0\n", - " NaN\n", - " 0.0\n", - " 0.0\n", + " 2015-06-27 14:30:41\n", + " 99\n", + " -2\n", + " 0\n", + " 0\n", " \n", " \n", - " 2015-06-27 14:30:41\n", - " NaN\n", - " NaN\n", - " 0.0\n", - " 0.0\n", + " 2012-09-24 15:00:00\n", + " 0\n", + " -2\n", + " 0\n", + " 0\n", " \n", " \n", " 2016-12-23 10:00:00\n", - " 0.0\n", - " NaN\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " -2\n", + " 0\n", + " 0\n", " \n", " \n", " 2016-12-23 11:00:00\n", - " 0.0\n", - " NaN\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " -2\n", + " 0\n", + " 0\n", " \n", " \n", " 2016-12-23 12:00:00\n", - " 0.0\n", - " NaN\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " -2\n", + " 0\n", + " 0\n", " \n", " \n", " 2016-12-23 13:00:00\n", - " 0.0\n", - " NaN\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " -2\n", + " 0\n", + " 0\n", " \n", " \n", " 2016-12-23 14:00:00\n", - " 0.0\n", - " NaN\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " -2\n", + " 0\n", + " 0\n", " \n", " \n", " 2016-12-23 15:00:00\n", - " 0.0\n", - " NaN\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " -2\n", + " 0\n", + " 0\n", " \n", " \n", " 2016-12-23 16:00:00\n", - " 0.0\n", - " NaN\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " -2\n", + " 0\n", + " 0\n", " \n", " \n", " 2016-12-23 17:00:00\n", - " 0.0\n", - " NaN\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " -2\n", + " 0\n", + " 0\n", " \n", " \n", " 2016-12-23 18:00:00\n", - " 0.0\n", - " NaN\n", - 
" 0.0\n", - " 0.0\n", + " 0\n", + " -2\n", + " 0\n", + " 0\n", " \n", " \n", " 2016-12-23 19:00:00\n", - " 0.0\n", - " NaN\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " -2\n", + " 0\n", + " 0\n", " \n", " \n", " 2016-12-23 20:00:00\n", - " 0.0\n", - " NaN\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " -2\n", + " 0\n", + " 0\n", " \n", " \n", " 2016-12-23 21:00:00\n", - " 0.0\n", - " NaN\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " -2\n", + " 0\n", + " 0\n", " \n", " \n", " 2016-12-23 22:00:00\n", - " 0.0\n", - " NaN\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " -2\n", + " 0\n", + " 0\n", " \n", " \n", " 2016-12-23 23:00:00\n", - " 0.0\n", - " NaN\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " -2\n", + " 0\n", + " 0\n", " \n", " \n", " 2016-12-24 00:00:00\n", - " 0.0\n", - " NaN\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " -2\n", + " 0\n", + " 0\n", " \n", " \n", " 2016-12-24 01:00:00\n", - " 0.0\n", - " NaN\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " -2\n", + " 0\n", + " 0\n", " \n", " \n", " 2016-12-24 02:00:00\n", - " 0.0\n", - " NaN\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " -2\n", + " 0\n", + " 0\n", " \n", " \n", " 2016-12-24 03:00:00\n", - " 0.0\n", - " NaN\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " -2\n", + " 0\n", + " 0\n", " \n", " \n", " 2016-12-24 04:00:00\n", - " 0.0\n", - " NaN\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " -2\n", + " 0\n", + " 0\n", " \n", " \n", " 2016-12-24 05:00:00\n", - " 0.0\n", - " NaN\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " -2\n", + " 0\n", + " 0\n", " \n", " \n", " 2016-12-24 06:00:00\n", - " 0.0\n", - " NaN\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " -2\n", + " 0\n", + " 0\n", " \n", " \n", " 2016-12-24 07:00:00\n", - " 0.0\n", - " NaN\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " -2\n", + " 0\n", + " 0\n", " \n", " \n", " 2016-12-24 08:00:00\n", - " 0.0\n", - " NaN\n", - " 0.0\n", - " 0.0\n", + " 0\n", + " -2\n", + " 0\n", + " 0\n", " \n", " \n", "\n", @@ -627,40 +618,40 @@ ], "text/plain": [ " spikes dry hardmax combine\n", - "2012-09-24 15:00:00 0.0 NaN 0.0 
0.0\n", - "2015-06-27 14:30:41 NaN NaN 0.0 0.0\n", - "2016-12-23 10:00:00 0.0 NaN 0.0 0.0\n", - "2016-12-23 11:00:00 0.0 NaN 0.0 0.0\n", - "2016-12-23 12:00:00 0.0 NaN 0.0 0.0\n", - "2016-12-23 13:00:00 0.0 NaN 0.0 0.0\n", - "2016-12-23 14:00:00 0.0 NaN 0.0 0.0\n", - "2016-12-23 15:00:00 0.0 NaN 0.0 0.0\n", - "2016-12-23 16:00:00 0.0 NaN 0.0 0.0\n", - "2016-12-23 17:00:00 0.0 NaN 0.0 0.0\n", - "2016-12-23 18:00:00 0.0 NaN 0.0 0.0\n", - "2016-12-23 19:00:00 0.0 NaN 0.0 0.0\n", - "2016-12-23 20:00:00 0.0 NaN 0.0 0.0\n", - "2016-12-23 21:00:00 0.0 NaN 0.0 0.0\n", - "2016-12-23 22:00:00 0.0 NaN 0.0 0.0\n", - "2016-12-23 23:00:00 0.0 NaN 0.0 0.0\n", - "2016-12-24 00:00:00 0.0 NaN 0.0 0.0\n", - "2016-12-24 01:00:00 0.0 NaN 0.0 0.0\n", - "2016-12-24 02:00:00 0.0 NaN 0.0 0.0\n", - "2016-12-24 03:00:00 0.0 NaN 0.0 0.0\n", - "2016-12-24 04:00:00 0.0 NaN 0.0 0.0\n", - "2016-12-24 05:00:00 0.0 NaN 0.0 0.0\n", - "2016-12-24 06:00:00 0.0 NaN 0.0 0.0\n", - "2016-12-24 07:00:00 0.0 NaN 0.0 0.0\n", - "2016-12-24 08:00:00 0.0 NaN 0.0 0.0" + "2015-06-27 14:30:41 99 -2 0 0\n", + "2012-09-24 15:00:00 0 -2 0 0\n", + "2016-12-23 10:00:00 0 -2 0 0\n", + "2016-12-23 11:00:00 0 -2 0 0\n", + "2016-12-23 12:00:00 0 -2 0 0\n", + "2016-12-23 13:00:00 0 -2 0 0\n", + "2016-12-23 14:00:00 0 -2 0 0\n", + "2016-12-23 15:00:00 0 -2 0 0\n", + "2016-12-23 16:00:00 0 -2 0 0\n", + "2016-12-23 17:00:00 0 -2 0 0\n", + "2016-12-23 18:00:00 0 -2 0 0\n", + "2016-12-23 19:00:00 0 -2 0 0\n", + "2016-12-23 20:00:00 0 -2 0 0\n", + "2016-12-23 21:00:00 0 -2 0 0\n", + "2016-12-23 22:00:00 0 -2 0 0\n", + "2016-12-23 23:00:00 0 -2 0 0\n", + "2016-12-24 00:00:00 0 -2 0 0\n", + "2016-12-24 01:00:00 0 -2 0 0\n", + "2016-12-24 02:00:00 0 -2 0 0\n", + "2016-12-24 03:00:00 0 -2 0 0\n", + "2016-12-24 04:00:00 0 -2 0 0\n", + "2016-12-24 05:00:00 0 -2 0 0\n", + "2016-12-24 06:00:00 0 -2 0 0\n", + "2016-12-24 07:00:00 0 -2 0 0\n", + "2016-12-24 08:00:00 0 -2 0 0" ] }, - "execution_count": 13, + "execution_count": 27, 
"metadata": {}, "output_type": "execute_result" } ], "source": [ - "corrections = detector.get_corrections_dataframe()\n", + "corrections = detector.get_corrections_dataframe(as_correction_codes=True)\n", "corrections" ] }, @@ -668,13 +659,325 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The comparison objects are stored as a dictionary under `detector.comparisons`:" + "Getting the status code names is more informative and can be done with\n", + "`traval.ts_utils.get_correction_status_name`. Note that NaNs mean that a specific rule\n", + "did not flag that particular observation as suspect. The status codes and their meanings can be derived from `traval.ts_utils.CorrectionCode`: " + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(traval.ts_utils.CorrectionCode)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
spikesdryhardmaxcombine
2015-06-27 14:30:41UNKNOWN_COMPARISON_VALUEBELOW_THRESHOLDNO_CORRECTIONNO_CORRECTION
2012-09-24 15:00:00NO_CORRECTIONBELOW_THRESHOLDNO_CORRECTIONNO_CORRECTION
2016-12-23 10:00:00NO_CORRECTIONBELOW_THRESHOLDNO_CORRECTIONNO_CORRECTION
2016-12-23 11:00:00NO_CORRECTIONBELOW_THRESHOLDNO_CORRECTIONNO_CORRECTION
2016-12-23 12:00:00NO_CORRECTIONBELOW_THRESHOLDNO_CORRECTIONNO_CORRECTION
2016-12-23 13:00:00NO_CORRECTIONBELOW_THRESHOLDNO_CORRECTIONNO_CORRECTION
2016-12-23 14:00:00NO_CORRECTIONBELOW_THRESHOLDNO_CORRECTIONNO_CORRECTION
2016-12-23 15:00:00NO_CORRECTIONBELOW_THRESHOLDNO_CORRECTIONNO_CORRECTION
2016-12-23 16:00:00NO_CORRECTIONBELOW_THRESHOLDNO_CORRECTIONNO_CORRECTION
2016-12-23 17:00:00NO_CORRECTIONBELOW_THRESHOLDNO_CORRECTIONNO_CORRECTION
2016-12-23 18:00:00NO_CORRECTIONBELOW_THRESHOLDNO_CORRECTIONNO_CORRECTION
2016-12-23 19:00:00NO_CORRECTIONBELOW_THRESHOLDNO_CORRECTIONNO_CORRECTION
2016-12-23 20:00:00NO_CORRECTIONBELOW_THRESHOLDNO_CORRECTIONNO_CORRECTION
2016-12-23 21:00:00NO_CORRECTIONBELOW_THRESHOLDNO_CORRECTIONNO_CORRECTION
2016-12-23 22:00:00NO_CORRECTIONBELOW_THRESHOLDNO_CORRECTIONNO_CORRECTION
2016-12-23 23:00:00NO_CORRECTIONBELOW_THRESHOLDNO_CORRECTIONNO_CORRECTION
2016-12-24 00:00:00NO_CORRECTIONBELOW_THRESHOLDNO_CORRECTIONNO_CORRECTION
2016-12-24 01:00:00NO_CORRECTIONBELOW_THRESHOLDNO_CORRECTIONNO_CORRECTION
2016-12-24 02:00:00NO_CORRECTIONBELOW_THRESHOLDNO_CORRECTIONNO_CORRECTION
2016-12-24 03:00:00NO_CORRECTIONBELOW_THRESHOLDNO_CORRECTIONNO_CORRECTION
2016-12-24 04:00:00NO_CORRECTIONBELOW_THRESHOLDNO_CORRECTIONNO_CORRECTION
2016-12-24 05:00:00NO_CORRECTIONBELOW_THRESHOLDNO_CORRECTIONNO_CORRECTION
2016-12-24 06:00:00NO_CORRECTIONBELOW_THRESHOLDNO_CORRECTIONNO_CORRECTION
2016-12-24 07:00:00NO_CORRECTIONBELOW_THRESHOLDNO_CORRECTIONNO_CORRECTION
2016-12-24 08:00:00NO_CORRECTIONBELOW_THRESHOLDNO_CORRECTIONNO_CORRECTION
\n", + "
" + ], + "text/plain": [ + " spikes dry hardmax \\\n", + "2015-06-27 14:30:41 UNKNOWN_COMPARISON_VALUE BELOW_THRESHOLD NO_CORRECTION \n", + "2012-09-24 15:00:00 NO_CORRECTION BELOW_THRESHOLD NO_CORRECTION \n", + "2016-12-23 10:00:00 NO_CORRECTION BELOW_THRESHOLD NO_CORRECTION \n", + "2016-12-23 11:00:00 NO_CORRECTION BELOW_THRESHOLD NO_CORRECTION \n", + "2016-12-23 12:00:00 NO_CORRECTION BELOW_THRESHOLD NO_CORRECTION \n", + "2016-12-23 13:00:00 NO_CORRECTION BELOW_THRESHOLD NO_CORRECTION \n", + "2016-12-23 14:00:00 NO_CORRECTION BELOW_THRESHOLD NO_CORRECTION \n", + "2016-12-23 15:00:00 NO_CORRECTION BELOW_THRESHOLD NO_CORRECTION \n", + "2016-12-23 16:00:00 NO_CORRECTION BELOW_THRESHOLD NO_CORRECTION \n", + "2016-12-23 17:00:00 NO_CORRECTION BELOW_THRESHOLD NO_CORRECTION \n", + "2016-12-23 18:00:00 NO_CORRECTION BELOW_THRESHOLD NO_CORRECTION \n", + "2016-12-23 19:00:00 NO_CORRECTION BELOW_THRESHOLD NO_CORRECTION \n", + "2016-12-23 20:00:00 NO_CORRECTION BELOW_THRESHOLD NO_CORRECTION \n", + "2016-12-23 21:00:00 NO_CORRECTION BELOW_THRESHOLD NO_CORRECTION \n", + "2016-12-23 22:00:00 NO_CORRECTION BELOW_THRESHOLD NO_CORRECTION \n", + "2016-12-23 23:00:00 NO_CORRECTION BELOW_THRESHOLD NO_CORRECTION \n", + "2016-12-24 00:00:00 NO_CORRECTION BELOW_THRESHOLD NO_CORRECTION \n", + "2016-12-24 01:00:00 NO_CORRECTION BELOW_THRESHOLD NO_CORRECTION \n", + "2016-12-24 02:00:00 NO_CORRECTION BELOW_THRESHOLD NO_CORRECTION \n", + "2016-12-24 03:00:00 NO_CORRECTION BELOW_THRESHOLD NO_CORRECTION \n", + "2016-12-24 04:00:00 NO_CORRECTION BELOW_THRESHOLD NO_CORRECTION \n", + "2016-12-24 05:00:00 NO_CORRECTION BELOW_THRESHOLD NO_CORRECTION \n", + "2016-12-24 06:00:00 NO_CORRECTION BELOW_THRESHOLD NO_CORRECTION \n", + "2016-12-24 07:00:00 NO_CORRECTION BELOW_THRESHOLD NO_CORRECTION \n", + "2016-12-24 08:00:00 NO_CORRECTION BELOW_THRESHOLD NO_CORRECTION \n", + "\n", + " combine \n", + "2015-06-27 14:30:41 NO_CORRECTION \n", + "2012-09-24 15:00:00 NO_CORRECTION \n", + "2016-12-23 10:00:00 
NO_CORRECTION \n", + "2016-12-23 11:00:00 NO_CORRECTION \n", + "2016-12-23 12:00:00 NO_CORRECTION \n", + "2016-12-23 13:00:00 NO_CORRECTION \n", + "2016-12-23 14:00:00 NO_CORRECTION \n", + "2016-12-23 15:00:00 NO_CORRECTION \n", + "2016-12-23 16:00:00 NO_CORRECTION \n", + "2016-12-23 17:00:00 NO_CORRECTION \n", + "2016-12-23 18:00:00 NO_CORRECTION \n", + "2016-12-23 19:00:00 NO_CORRECTION \n", + "2016-12-23 20:00:00 NO_CORRECTION \n", + "2016-12-23 21:00:00 NO_CORRECTION \n", + "2016-12-23 22:00:00 NO_CORRECTION \n", + "2016-12-23 23:00:00 NO_CORRECTION \n", + "2016-12-24 00:00:00 NO_CORRECTION \n", + "2016-12-24 01:00:00 NO_CORRECTION \n", + "2016-12-24 02:00:00 NO_CORRECTION \n", + "2016-12-24 03:00:00 NO_CORRECTION \n", + "2016-12-24 04:00:00 NO_CORRECTION \n", + "2016-12-24 05:00:00 NO_CORRECTION \n", + "2016-12-24 06:00:00 NO_CORRECTION \n", + "2016-12-24 07:00:00 NO_CORRECTION \n", + "2016-12-24 08:00:00 NO_CORRECTION " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "traval.ts_utils.get_correction_status_name(corrections)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The comparison objects are stored as a dictionary under `detector.comparisons`:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, "outputs": [], "source": [ "detect = detector" @@ -682,19 +985,19 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{1: ,\n", - " 2: ,\n", - " 3: ,\n", - " 4: }" + "{1: ,\n", + " 2: ,\n", + " 3: ,\n", + " 4: }" ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -712,7 +1015,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -728,24 +1031,24 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": 
{}, "outputs": [ { "data": { "text/plain": [ - "kept_in_both 37212\n", - "flagged_in_s1 0\n", - "flagged_in_s2 0\n", - "flagged_in_both 25\n", - "in_all_nan 0\n", - "introduced_in_s1 0\n", - "introduced_in_s2 0\n", - "introduced_in_both 0\n", - "Name: N_obs, dtype: int64" + "kept_in_both 37212.0\n", + "flagged_in_s1 0.0\n", + "flagged_in_s2 0.0\n", + "flagged_in_both 25.0\n", + "in_all_nan 0.0\n", + "introduced_in_s1 0.0\n", + "introduced_in_s2 0.0\n", + "introduced_in_both 0.0\n", + "Name: N_obs, dtype: float64" ] }, - "execution_count": 17, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -763,19 +1066,17 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ - "
" + "
" ] }, - "metadata": { - "needs_background": "light" - }, + "metadata": {}, "output_type": "display_data" } ], @@ -794,37 +1095,35 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "\u001b[0;31mType:\u001b[0m BinaryClassifier\n", - "\u001b[0;31mString form:\u001b[0m \n", - "\u001b[0;31mFile:\u001b[0m ~/Github/traval/traval/binary_classifier.py\n", - "\u001b[0;31mDocstring:\u001b[0m Class for calculating binary classification statistics.\n", - "\u001b[0;31mInit docstring:\u001b[0m\n", - "Initialize class for calculating binary classification statistics.\n", - "\n", - "Parameters\n", - "----------\n", - "tp : int\n", - " number of True Positives (TP)\n", - "fp : int\n", - " number of False Positives (FP)\n", - "tn : int\n", - " number of True Negatives (TN)\n", - "fn : int\n", - " number of False Negatives (FN)\n" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0;31mType:\u001b[0m BinaryClassifier\n", + "\u001b[0;31mString form:\u001b[0m \n", + "\u001b[0;31mFile:\u001b[0m ~/github/traval/traval/binary_classifier.py\n", + "\u001b[0;31mDocstring:\u001b[0m Class for calculating binary classification statistics.\n", + "\u001b[0;31mInit docstring:\u001b[0m\n", + "Initialize class for calculating binary classification statistics.\n", + "\n", + "Parameters\n", + "----------\n", + "tp : int\n", + " number of True Positives (TP)\n", + "fp : int\n", + " number of False Positives (FP)\n", + "tn : int\n", + " number of True Negatives (TN)\n", + "fn : int\n", + " number of False Negatives (FN)" + ] } ], "source": [ - "comp.bc?" 
+ "?comp.bc" ] }, { @@ -836,58 +1135,56 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "\u001b[0;31mSignature:\u001b[0m \u001b[0mcomp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfusion_matrix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mas_array\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mDocstring:\u001b[0m\n", - "Calculate confusion matrix.\n", - "\n", - "Confusion matrix shows the performance of the algorithm given a\n", - "certain truth. An abstract example of the confusion matrix:\n", - "\n", - " | Algorithm |\n", - " |-------------------|\n", - " | error | correct |\n", - "------|---------|---------|---------|\n", - " | error | TP | FN |\n", - "Truth |---------|---------|---------|\n", - " | correct | FP | TN |\n", - "------|---------|---------|---------|\n", - "\n", - "where:\n", - "- TP: True Positives = errors correctly detected by algorithm\n", - "- TN: True Negatives = correct values correctly not flagged by algorithm\n", - "- FP: False Positives = correct values marked as errors by algorithm\n", - "- FN: False Negatives = errors not detected by algorithm\n", - "\n", - "Parameters\n", - "----------\n", - "as_array : bool, optional\n", - " return data as array instead of DataFrame, by default False\n", - "\n", - "Returns\n", - "-------\n", - "data : pd.DataFrame or np.array\n", - " confusion matrix\n", - "\u001b[0;31mFile:\u001b[0m ~/Github/traval/traval/binary_classifier.py\n", - "\u001b[0;31mType:\u001b[0m method\n" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0;31mSignature:\u001b[0m 
\u001b[0mcomp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfusion_matrix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mas_array\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mDocstring:\u001b[0m\n", + "Calculate confusion matrix.\n", + "\n", + "Confusion matrix shows the performance of the algorithm given a\n", + "certain truth. An abstract example of the confusion matrix:\n", + "\n", + " | Algorithm |\n", + " |-------------------|\n", + " | error | correct |\n", + "------|---------|---------|---------|\n", + " | error | TP | FN |\n", + "Truth |---------|---------|---------|\n", + " | correct | FP | TN |\n", + "------|---------|---------|---------|\n", + "\n", + "where:\n", + "- TP: True Positives = errors correctly detected by algorithm\n", + "- TN: True Negatives = correct values correctly not flagged by algorithm\n", + "- FP: False Positives = correct values marked as errors by algorithm\n", + "- FN: False Negatives = errors not detected by algorithm\n", + "\n", + "Parameters\n", + "----------\n", + "as_array : bool, optional\n", + " return data as array instead of DataFrame, by default False\n", + "\n", + "Returns\n", + "-------\n", + "data : pd.DataFrame or np.array\n", + " confusion matrix\n", + "\u001b[0;31mFile:\u001b[0m ~/github/traval/traval/binary_classifier.py\n", + "\u001b[0;31mType:\u001b[0m method" + ] } ], "source": [ - "comp.bc.confusion_matrix?" 
+ "?comp.bc.confusion_matrix" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -944,7 +1241,7 @@ " correct 0 37212" ] }, - "execution_count": 21, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -969,38 +1266,36 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "\u001b[0;31mType:\u001b[0m property\n", - "\u001b[0;31mString form:\u001b[0m \n", - "\u001b[0;31mDocstring:\u001b[0m \n", - "Specificity or True Negative Rate.\n", - "\n", - "Statistic describing ratio of true negatives identified,\n", - "which also says something about the avoidance of false positives.\n", - "\n", - " Specificity = TN / (TN + FP)\n", - "\n", - "where\n", - "- TN : True Negatives\n", - "- FP : False Positives\n" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[0;31mType:\u001b[0m property\n", + "\u001b[0;31mString form:\u001b[0m \n", + "\u001b[0;31mDocstring:\u001b[0m \n", + "Specificity or True Negative Rate.\n", + "\n", + "Statistic describing ratio of true negatives identified,\n", + "which also says something about the avoidance of false positives.\n", + "\n", + " Specificity = TN / (TN + FP)\n", + "\n", + "where\n", + "- TN : True Negatives\n", + "- FP : False Positives" + ] } ], "source": [ - "comp.bc.specificity?" 
+ "?comp.bc.specificity" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -1016,16 +1311,20 @@ ], "source": [ "print(\n", - " f\"- True Positive Rate = {comp.bc.true_positive_rate} = Sensitivity = {comp.bc.sensitivity}\"\n", + " f\"- True Positive Rate = {comp.bc.true_positive_rate} = \"\n", + " f\" Sensitivity = {comp.bc.sensitivity}\"\n", ")\n", "print(\n", - " f\"- True Negative Rate = {comp.bc.true_negative_rate} = Specificity = {comp.bc.specificity}\"\n", + " f\"- True Negative Rate = {comp.bc.true_negative_rate} = \"\n", + " f\" Specificity = {comp.bc.specificity}\"\n", ")\n", "print(\n", - " f\"- False Positive Rate = {comp.bc.false_positive_rate} = (1 - Specificity) = {1 - comp.bc.specificity}\"\n", + " f\"- False Positive Rate = {comp.bc.false_positive_rate} = \"\n", + " f\"(1 - Specificity) = {1 - comp.bc.specificity}\"\n", ")\n", "print(\n", - " f\"- False Negative Rate = {comp.bc.false_negative_rate} = (1 - Sensitivity) = {1 - comp.bc.sensitivity}\"\n", + " f\"- False Negative Rate = {comp.bc.false_negative_rate} = \"\n", + " f\"(1 - Sensitivity) = {1 - comp.bc.sensitivity}\"\n", ")" ] }, @@ -1070,19 +1369,17 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 26, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ - "
" + "
" ] }, - "metadata": { - "needs_background": "light" - }, + "metadata": {}, "output_type": "display_data" } ], @@ -1090,8 +1387,17 @@ "label = \"my first error \\ndetection algorithm\"\n", "ax = traval.plots.roc_plot(\n", " comp.bc.true_positive_rate, comp.bc.false_positive_rate, labels=label\n", - ")" + ")\n", + "ax.axis([-0.01, 1.01, -0.01, 1.01])\n", + "ax.legend(loc=\"lower right\");" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -1110,7 +1416,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/examples/notebooks/ex02_full_dataset_test.ipynb b/examples/notebooks/ex02_full_dataset_test.ipynb index 265c52d..1bf1851 100644 --- a/examples/notebooks/ex02_full_dataset_test.ipynb +++ b/examples/notebooks/ex02_full_dataset_test.ipynb @@ -21,12 +21,12 @@ "metadata": {}, "outputs": [], "source": [ - "import numpy as np\n", - "import pandas as pd\n", + "import hydropandas as hpd\n", "import matplotlib.pyplot as plt\n", + "import numpy as np\n", "import pystore\n", "from tqdm.notebook import tqdm\n", - "import hydropandas as hpd\n", + "\n", "import traval\n", "from traval import rulelib as rlib" ] @@ -116,9 +116,9 @@ " \"\"\"Get level below which sensor is dry from pystore.\"\"\"\n", " coll = store.collection(name)\n", " inhangdiepte_df = coll.item(\"Inhang.diepte\").to_pandas()\n", - " inhangdiepte = inhangdiepte_df.value.iloc[0]\n", + " # inhangdiepte = inhangdiepte_df.value.iloc[0]\n", " meetpuntNAP_df = coll.item(\"Meetpunt.hoogte\").to_pandas()\n", - " meetpuntNAP = meetpuntNAP_df.value.iloc[0]\n", + " # meetpuntNAP = meetpuntNAP_df.value.iloc[0]\n", " threshold_series = meetpuntNAP_df.value - inhangdiepte_df.value\n", " return threshold_series" ] @@ -265,7 +265,7 @@ "# initialize empty BinaryClassifier\n", "bc_sum = traval.BinaryClassifier(0, 0, 0, 0)\n", "\n", - "for k, 
dct in dlist.items():\n", + "for _, dct in dlist.items():\n", " # get TPR and FPR\n", " itpr = dct.comparisons[4].bc.true_positive_rate\n", " ifpr = dct.comparisons[4].bc.false_positive_rate\n", diff --git a/examples/notebooks/ex03_testing_rules.ipynb b/examples/notebooks/ex03_testing_rules.ipynb new file mode 100644 index 0000000..f2061e5 --- /dev/null +++ b/examples/notebooks/ex03_testing_rules.ipynb @@ -0,0 +1,2171 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Error detection rules included in traval\n", + "\n", + "This notebook shows simple examples of the error detection rules in `traval`." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "import traval\n", + "from traval import rulelib as rlib" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a very simple time series:" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2020-01-01 0\n", + "2020-01-02 1\n", + "2020-01-03 2\n", + "2020-01-04 3\n", + "2020-01-05 4\n", + "2020-01-06 5\n", + "2020-01-07 6\n", + "2020-01-08 7\n", + "2020-01-09 8\n", + "2020-01-10 9\n", + "Freq: D, dtype: int64" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "date_range = pd.date_range(\"2020-01-01\", freq=\"D\", periods=10)\n", + "s1 = pd.Series(index=date_range, data=np.arange(10))\n", + "s1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `rule_ufunc_threshold`: float threshold\n", + "\n", + "Rule comparing series to threshold value." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
correction_codeseries_valuescomparison_values
2020-01-010NaNNaN
2020-01-020NaNNaN
2020-01-030NaNNaN
2020-01-040NaNNaN
2020-01-050NaNNaN
2020-01-0625.05.0
2020-01-0726.05.0
2020-01-0827.05.0
2020-01-0928.05.0
2020-01-1029.05.0
\n", + "
" + ], + "text/plain": [ + " correction_code series_values comparison_values\n", + "2020-01-01 0 NaN NaN\n", + "2020-01-02 0 NaN NaN\n", + "2020-01-03 0 NaN NaN\n", + "2020-01-04 0 NaN NaN\n", + "2020-01-05 0 NaN NaN\n", + "2020-01-06 2 5.0 5.0\n", + "2020-01-07 2 6.0 5.0\n", + "2020-01-08 2 7.0 5.0\n", + "2020-01-09 2 8.0 5.0\n", + "2020-01-10 2 9.0 5.0" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c1 = rlib.rule_ufunc_threshold(s1, (np.greater_equal,), 5)\n", + "assert (c1[\"correction_code\"] == 2).sum() == 5\n", + "c1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `rule_ufunc_threshold`: threshold series\n", + "\n", + "Rule comparing series to threshold series." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
correction_codeseries_valuescomparison_values
2020-01-010NaNNaN
2020-01-020NaNNaN
2020-01-030NaNNaN
2020-01-040NaNNaN
2020-01-050NaNNaN
2020-01-0625.05.0
2020-01-0726.05.0
2020-01-0827.05.0
2020-01-0928.05.0
2020-01-1029.05.0
\n", + "
" + ], + "text/plain": [ + " correction_code series_values comparison_values\n", + "2020-01-01 0 NaN NaN\n", + "2020-01-02 0 NaN NaN\n", + "2020-01-03 0 NaN NaN\n", + "2020-01-04 0 NaN NaN\n", + "2020-01-05 0 NaN NaN\n", + "2020-01-06 2 5.0 5.0\n", + "2020-01-07 2 6.0 5.0\n", + "2020-01-08 2 7.0 5.0\n", + "2020-01-09 2 8.0 5.0\n", + "2020-01-10 2 9.0 5.0" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# rule_ufunc_threshold: series\n", + "idx = date_range[:3].to_list() + date_range[-4:-1].to_list()\n", + "thresh_series = pd.Series(index=idx, data=5.0)\n", + "full_threshold_series = traval.ts_utils.resample_short_series_to_long_series(\n", + " thresh_series, s1\n", + ")\n", + "c2 = rlib.rule_ufunc_threshold(s1, (np.greater_equal,), thresh_series)\n", + "assert (c2[\"correction_code\"] == 2).sum() == 5\n", + "c2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `rule_diff_ufunc_threshold`\n", + "\n", + "Rule comparing diff of series to threshold value." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
correction_codeseries_valuescomparison_values
2020-01-010NaNNaN
2020-01-020NaNNaN
2020-01-030NaNNaN
2020-01-040NaNNaN
2020-01-0525.01.1
2020-01-060NaNNaN
2020-01-070NaNNaN
2020-01-080NaNNaN
2020-01-090NaNNaN
2020-01-100NaNNaN
\n", + "
" + ], + "text/plain": [ + " correction_code series_values comparison_values\n", + "2020-01-01 0 NaN NaN\n", + "2020-01-02 0 NaN NaN\n", + "2020-01-03 0 NaN NaN\n", + "2020-01-04 0 NaN NaN\n", + "2020-01-05 2 5.0 1.1\n", + "2020-01-06 0 NaN NaN\n", + "2020-01-07 0 NaN NaN\n", + "2020-01-08 0 NaN NaN\n", + "2020-01-09 0 NaN NaN\n", + "2020-01-10 0 NaN NaN" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# rule_diff_ufunc_threshold\n", + "s1.loc[date_range[4]] += 1\n", + "c3 = rlib.rule_diff_ufunc_threshold(s1, (np.greater_equal,), 1.1)\n", + "assert (c3[\"correction_code\"] == 2).sum() == 1\n", + "c3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `rule_other_ufunc_threshold`\n", + "\n", + "Rule comparing other series to threshold." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
correction_codeseries_valuescomparison_values
2020-01-01-20.05.0
2020-01-02-21.05.0
2020-01-03-22.05.0
2020-01-04-23.05.0
2020-01-05-24.05.0
2020-01-060NaNNaN
2020-01-070NaNNaN
2020-01-080NaNNaN
2020-01-090NaNNaN
2020-01-100NaNNaN
\n", + "
" + ], + "text/plain": [ + " correction_code series_values comparison_values\n", + "2020-01-01 -2 0.0 5.0\n", + "2020-01-02 -2 1.0 5.0\n", + "2020-01-03 -2 2.0 5.0\n", + "2020-01-04 -2 3.0 5.0\n", + "2020-01-05 -2 4.0 5.0\n", + "2020-01-06 0 NaN NaN\n", + "2020-01-07 0 NaN NaN\n", + "2020-01-08 0 NaN NaN\n", + "2020-01-09 0 NaN NaN\n", + "2020-01-10 0 NaN NaN" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# rule_other_ufunc_threshold\n", + "date_range = pd.date_range(\"2020-01-01\", freq=\"D\", periods=10)\n", + "s1 = pd.Series(index=date_range, data=np.arange(10))\n", + "val = s1.copy()\n", + "c4 = rlib.rule_other_ufunc_threshold(s1, val, (np.less,), 5)\n", + "assert (c4[\"correction_code\"] == -2).sum() == 5\n", + "c4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `rule_max_gradient`\n", + "\n", + "Rule that checks the maximum gradient between two values." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
correction_codeseries_valuescomparison_values
2020-01-010NaNNaN
2020-01-020NaNNaN
2020-01-030NaNNaN
2020-01-040NaNNaN
2020-01-0525.01.0
2020-01-060NaNNaN
2020-01-070NaNNaN
2020-01-080NaNNaN
2020-01-090NaNNaN
2020-01-100NaNNaN
\n", + "
" + ], + "text/plain": [ + " correction_code series_values comparison_values\n", + "2020-01-01 0 NaN NaN\n", + "2020-01-02 0 NaN NaN\n", + "2020-01-03 0 NaN NaN\n", + "2020-01-04 0 NaN NaN\n", + "2020-01-05 2 5.0 1.0\n", + "2020-01-06 0 NaN NaN\n", + "2020-01-07 0 NaN NaN\n", + "2020-01-08 0 NaN NaN\n", + "2020-01-09 0 NaN NaN\n", + "2020-01-10 0 NaN NaN" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# rule_max_gradient\n", + "date_range = pd.date_range(\"2020-01-01\", freq=\"D\", periods=10)\n", + "s1 = pd.Series(index=date_range, data=np.arange(10))\n", + "s1.loc[date_range[4]] += 1\n", + "c5 = rlib.rule_max_gradient(s1, max_step=1.0, max_timestep=\"1D\")\n", + "assert (c5[\"correction_code\"] == 2).sum() == 1\n", + "c5" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `rule_spike_detection`\n", + "\n", + "Rule that detects spikes, single observations that differ significantly from both\n", + "neighbors." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
correction_codeseries_valuescomparison_values
2020-01-010NaNNaN
2020-01-020NaNNaN
2020-01-030NaNNaN
2020-01-040NaNNaN
2020-01-05997.0NaN
2020-01-060NaNNaN
2020-01-070NaNNaN
2020-01-080NaNNaN
2020-01-090NaNNaN
2020-01-100NaNNaN
\n", + "
" + ], + "text/plain": [ + " correction_code series_values comparison_values\n", + "2020-01-01 0 NaN NaN\n", + "2020-01-02 0 NaN NaN\n", + "2020-01-03 0 NaN NaN\n", + "2020-01-04 0 NaN NaN\n", + "2020-01-05 99 7.0 NaN\n", + "2020-01-06 0 NaN NaN\n", + "2020-01-07 0 NaN NaN\n", + "2020-01-08 0 NaN NaN\n", + "2020-01-09 0 NaN NaN\n", + "2020-01-10 0 NaN NaN" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# rule_spike_detection\n", + "date_range = pd.date_range(\"2020-01-01\", freq=\"D\", periods=10)\n", + "s1 = pd.Series(index=date_range, data=np.arange(10))\n", + "s1.iloc[4] += 3\n", + "c6 = rlib.rule_spike_detection(s1, threshold=2, spike_tol=2)\n", + "assert (c6[\"correction_code\"] == 99).sum() == 1\n", + "c6" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `rule_offset_detection`\n", + "\n", + "Rule that looks for periods that are offset relative to the rest of the time series." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
correction_codeseries_valuescomparison_values
2020-01-010.0NaNNaN
2020-01-020.0NaNNaN
2020-01-030.0NaNNaN
2020-01-0499.0NaNNaN
2020-01-0599.0NaNNaN
2020-01-0699.0NaNNaN
2020-01-0799.0NaNNaN
2020-01-080.0NaNNaN
2020-01-090.0NaNNaN
2020-01-100.0NaNNaN
\n", + "
" + ], + "text/plain": [ + " correction_code series_values comparison_values\n", + "2020-01-01 0.0 NaN NaN\n", + "2020-01-02 0.0 NaN NaN\n", + "2020-01-03 0.0 NaN NaN\n", + "2020-01-04 99.0 NaN NaN\n", + "2020-01-05 99.0 NaN NaN\n", + "2020-01-06 99.0 NaN NaN\n", + "2020-01-07 99.0 NaN NaN\n", + "2020-01-08 0.0 NaN NaN\n", + "2020-01-09 0.0 NaN NaN\n", + "2020-01-10 0.0 NaN NaN" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# rule_offset_detection\n", + "date_range = pd.date_range(\"2020-01-01\", freq=\"D\", periods=10)\n", + "s1 = pd.Series(index=date_range, data=np.arange(10))\n", + "s1.iloc[3:7] += 10\n", + "c7 = rlib.rule_offset_detection(s1, threshold=5, updown_diff=2.0)\n", + "assert (c7[\"correction_code\"] == 99).sum() == 4\n", + "c7" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `rule_outside_n_sigma`\n", + "\n", + "Rule that checks if measurements are outside $N$ standard deviations of the time series." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
correction_codeseries_valuescomparison_values
2020-01-01-20.01.47235
2020-01-02-21.01.47235
2020-01-030NaNNaN
2020-01-040NaNNaN
2020-01-050NaNNaN
2020-01-060NaNNaN
2020-01-070NaNNaN
2020-01-080NaNNaN
2020-01-0928.07.52765
2020-01-1029.07.52765
\n", + "
" + ], + "text/plain": [ + " correction_code series_values comparison_values\n", + "2020-01-01 -2 0.0 1.47235\n", + "2020-01-02 -2 1.0 1.47235\n", + "2020-01-03 0 NaN NaN\n", + "2020-01-04 0 NaN NaN\n", + "2020-01-05 0 NaN NaN\n", + "2020-01-06 0 NaN NaN\n", + "2020-01-07 0 NaN NaN\n", + "2020-01-08 0 NaN NaN\n", + "2020-01-09 2 8.0 7.52765\n", + "2020-01-10 2 9.0 7.52765" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# rule_outside_n_sigma\n", + "date_range = pd.date_range(\"2020-01-01\", freq=\"D\", periods=10)\n", + "s1 = pd.Series(index=date_range, data=np.arange(10))\n", + "c8 = rlib.rule_outside_n_sigma(s1, n=1.0)\n", + "assert (c8[\"correction_code\"] == -2).sum() == 2\n", + "assert (c8[\"correction_code\"] == 2).sum() == 2\n", + "c8" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `rule_diff_outside_of_n_sigma`\n", + "\n", + "Rule that checks if the diff of a series lies outside of $N$ standard deviations of the\n", + "differences." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
correction_codeseries_valuescomparison_values
2020-01-010NaNNaN
2020-01-020NaNNaN
2020-01-030NaNNaN
2020-01-040NaNNaN
2020-01-050NaNNaN
2020-01-060NaNNaN
2020-01-0722.01.054093
2020-01-0822.01.054093
2020-01-0922.01.054093
2020-01-1022.01.054093
\n", + "
" + ], + "text/plain": [ + " correction_code series_values comparison_values\n", + "2020-01-01 0 NaN NaN\n", + "2020-01-02 0 NaN NaN\n", + "2020-01-03 0 NaN NaN\n", + "2020-01-04 0 NaN NaN\n", + "2020-01-05 0 NaN NaN\n", + "2020-01-06 0 NaN NaN\n", + "2020-01-07 2 2.0 1.054093\n", + "2020-01-08 2 2.0 1.054093\n", + "2020-01-09 2 2.0 1.054093\n", + "2020-01-10 2 2.0 1.054093" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# rule_diff_outside_of_n_sigma\n", + "date_range = pd.date_range(\"2020-01-01\", freq=\"D\", periods=10)\n", + "s1 = pd.Series(index=date_range, data=np.arange(10))\n", + "s1.iloc[5:] += np.arange(5)\n", + "c9 = rlib.rule_diff_outside_of_n_sigma(s1, 2.0)\n", + "assert (c9[\"correction_code\"] == 2).sum() == 4\n", + "c9" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `rule_outside_bandwidth`\n", + "\n", + "Rule checking values lie outside some given upper and lower thresholds." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
correction_codeseries_valuescomparison_values
2020-01-01-20.01.000000
2020-01-02-21.01.111111
2020-01-030NaNNaN
2020-01-040NaNNaN
2020-01-050NaNNaN
2020-01-060NaNNaN
2020-01-070NaNNaN
2020-01-080NaNNaN
2020-01-0928.07.888889
2020-01-1029.08.000000
\n", + "
" + ], + "text/plain": [ + " correction_code series_values comparison_values\n", + "2020-01-01 -2 0.0 1.000000\n", + "2020-01-02 -2 1.0 1.111111\n", + "2020-01-03 0 NaN NaN\n", + "2020-01-04 0 NaN NaN\n", + "2020-01-05 0 NaN NaN\n", + "2020-01-06 0 NaN NaN\n", + "2020-01-07 0 NaN NaN\n", + "2020-01-08 0 NaN NaN\n", + "2020-01-09 2 8.0 7.888889\n", + "2020-01-10 2 9.0 8.000000" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# rule_outside_bandwidth\n", + "date_range = pd.date_range(\"2020-01-01\", freq=\"D\", periods=10)\n", + "s1 = pd.Series(index=date_range, data=np.arange(10))\n", + "lb = pd.Series(index=date_range[[0, -1]], data=[1, 2])\n", + "ub = pd.Series(index=date_range[[0, -1]], data=[7, 8])\n", + "c10 = rlib.rule_outside_bandwidth(s1, lb, ub)\n", + "assert (c10[\"correction_code\"] == -2).sum() == 2\n", + "assert (c10[\"correction_code\"] == 2).sum() == 2\n", + "c10" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `rule_shift_to_manual_obs`\n", + "\n", + "Rule that corrects observations and shifts them to manual observations using linear\n", + "interpolation of the differences between the time series and the manual observations." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# rule_shift_to_manual_obs\n", + "date_range = pd.date_range(\"2020-01-01\", freq=\"D\", periods=10)\n", + "s1 = pd.Series(index=date_range, data=np.arange(10))\n", + "h = pd.Series(index=date_range[[1, -1]], data=[2, 10])\n", + "a = rlib.rule_shift_to_manual_obs(s1, h, max_dt=\"2D\", method=\"linear\")\n", + "assert (a.iloc[1:] == s1.iloc[1:] + 1).all()\n", + "assert a.iloc[0] == s1.iloc[0]\n", + "ax = s1.plot()\n", + "h.plot(ax=ax, marker=\"x\", ls=\"none\")\n", + "a.plot(ax=ax, ls=\"dashed\");" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `rule_compare_to_manual_obs`\n", + "\n", + "Rule that compares a time series to manual observations. Values are marked as suspect\n", + "when the linear interpolated difference between the time series and the manual\n", + "observations exceeds some threshold." + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
correction_codeseries_valuescomparison_values
2020-01-010NaNNaN
2020-01-020NaNNaN
2020-01-030NaNNaN
2020-01-040NaNNaN
2020-01-050NaNNaN
2020-01-060NaNNaN
2020-01-070NaNNaN
2020-01-08-2-1.250-1.0
2020-01-09-2-1.625-1.0
2020-01-10-2-2.000-1.0
\n", + "
" + ], + "text/plain": [ + " correction_code series_values comparison_values\n", + "2020-01-01 0 NaN NaN\n", + "2020-01-02 0 NaN NaN\n", + "2020-01-03 0 NaN NaN\n", + "2020-01-04 0 NaN NaN\n", + "2020-01-05 0 NaN NaN\n", + "2020-01-06 0 NaN NaN\n", + "2020-01-07 0 NaN NaN\n", + "2020-01-08 -2 -1.250 -1.0\n", + "2020-01-09 -2 -1.625 -1.0\n", + "2020-01-10 -2 -2.000 -1.0" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# rule compare_to_manual_obs\n", + "date_range = pd.date_range(\"2020-01-01\", freq=\"D\", periods=10)\n", + "s1 = pd.Series(index=date_range, data=np.arange(10))\n", + "h = pd.Series(index=date_range[[1, -1]], data=[2, 7])\n", + "c11 = rlib.rule_compare_to_manual_obs(\n", + " s1, h, threshold=1.0, max_dt=\"2D\", method=\"linear\"\n", + ")\n", + "ax = s1.plot(label=\"series\")\n", + "h.plot(ax=ax, marker=\"o\", ls=\"none\", label=\"manual observations\")\n", + "s1.loc[c11[\"correction_code\"] != 0].plot(\n", + " ax=ax, marker=\"x\", ls=\"none\", label=\"suspect observations\", c=\"C3\"\n", + ")\n", + "ax.legend(loc=(0, 1), frameon=False, ncol=3, fontsize=\"small\")\n", + "c11" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `rule_combine_corrections_or`\n", + "\n", + "Rule for combining results of any number of other rules. Observations are suspect if\n", + "ANY rule flags an observation as suspect." + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
correction_codeseries_valuescomparison_values
2020-01-0199NaNNaN
2020-01-020NaNNaN
2020-01-030NaNNaN
2020-01-040NaNNaN
2020-01-050NaNNaN
2020-01-060NaNNaN
2020-01-070NaNNaN
2020-01-080NaNNaN
2020-01-090NaNNaN
2020-01-1099NaNNaN
\n", + "
" + ], + "text/plain": [ + " correction_code series_values comparison_values\n", + "2020-01-01 99 NaN NaN\n", + "2020-01-02 0 NaN NaN\n", + "2020-01-03 0 NaN NaN\n", + "2020-01-04 0 NaN NaN\n", + "2020-01-05 0 NaN NaN\n", + "2020-01-06 0 NaN NaN\n", + "2020-01-07 0 NaN NaN\n", + "2020-01-08 0 NaN NaN\n", + "2020-01-09 0 NaN NaN\n", + "2020-01-10 99 NaN NaN" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# rule_combine_corrections_or\n", + "date_range = pd.date_range(\"2020-01-01\", freq=\"D\", periods=10)\n", + "s1 = pd.DataFrame(index=date_range, columns=[\"correction_code\"], data=0)\n", + "s2 = s1.copy()\n", + "s1.iloc[0] = 99\n", + "s2.iloc[-1] = -2\n", + "c11 = rlib.rule_combine_corrections_or(s1, s2)\n", + "assert (c11[\"correction_code\"] == 99).sum() == 2\n", + "c11" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `rule_combine_corrections_and`\n", + "\n", + "Rule for combining results of any number of other rules. Observations are suspect if\n", + "ALL rules flag an observation as suspect." + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
correction_codeseries_valuescomparison_values
2020-01-010NaNNaN
2020-01-0299NaNNaN
2020-01-030NaNNaN
2020-01-040NaNNaN
2020-01-050NaNNaN
2020-01-060NaNNaN
2020-01-070NaNNaN
2020-01-080NaNNaN
2020-01-090NaNNaN
2020-01-100NaNNaN
\n", + "
" + ], + "text/plain": [ + " correction_code series_values comparison_values\n", + "2020-01-01 0 NaN NaN\n", + "2020-01-02 99 NaN NaN\n", + "2020-01-03 0 NaN NaN\n", + "2020-01-04 0 NaN NaN\n", + "2020-01-05 0 NaN NaN\n", + "2020-01-06 0 NaN NaN\n", + "2020-01-07 0 NaN NaN\n", + "2020-01-08 0 NaN NaN\n", + "2020-01-09 0 NaN NaN\n", + "2020-01-10 0 NaN NaN" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# rule_combine_corrections_and\n", + "date_range = pd.date_range(\"2020-01-01\", freq=\"D\", periods=10)\n", + "s1 = pd.DataFrame(index=date_range, columns=[\"correction_code\"], data=0)\n", + "s2 = s1.copy()\n", + "s1.iloc[0:2] = 99\n", + "s2.iloc[1:3] = -2\n", + "c12 = rlib.rule_combine_corrections_and(s1, s2)\n", + "assert (c12[\"correction_code\"] == 99).sum() == 1\n", + "c12" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `rule_funcdict`\n", + "\n", + "Rule that takes a dictionary of functions and applies those iteratively to the original\n", + "time series. Observations are suspect if any rule flags an observation as suspect." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
correction_codeseries_valuescomparison_values
2020-01-01990.0NaN
2020-01-02991.0NaN
2020-01-03992.0NaN
2020-01-040NaNNaN
2020-01-050NaNNaN
2020-01-060NaNNaN
2020-01-070NaNNaN
2020-01-080NaNNaN
2020-01-09998.0NaN
2020-01-10999.0NaN
\n", + "
" + ], + "text/plain": [ + " correction_code series_values comparison_values\n", + "2020-01-01 99 0.0 NaN\n", + "2020-01-02 99 1.0 NaN\n", + "2020-01-03 99 2.0 NaN\n", + "2020-01-04 0 NaN NaN\n", + "2020-01-05 0 NaN NaN\n", + "2020-01-06 0 NaN NaN\n", + "2020-01-07 0 NaN NaN\n", + "2020-01-08 0 NaN NaN\n", + "2020-01-09 99 8.0 NaN\n", + "2020-01-10 99 9.0 NaN" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# rule_funcdict_to_nan\n", + "date_range = pd.date_range(\"2020-01-01\", freq=\"D\", periods=10)\n", + "s1 = pd.Series(index=date_range, data=np.arange(10))\n", + "fdict = {\"lt_3\": lambda s: s < 3.0, \"gt_7\": lambda s: s > 7.0}\n", + "c13 = rlib.rule_funcdict(s1, fdict)\n", + "assert (c13[\"correction_code\"] == 99).sum() == 5\n", + "c13" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `rule_keep_comments`\n", + "\n", + "Rule that keeps observations that have some comment associated with it. Can be used to\n", + "filter validated time series comments to obtain specific observations." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
correction_codeseries_valuescomparison_values
2020-01-01990.0keep
2020-01-02991.0keep
2020-01-03992.0keep
2020-01-04993.0keep
2020-01-050NaN
2020-01-060NaN
2020-01-070NaN
2020-01-080NaN
2020-01-090NaN
2020-01-100NaN
\n", + "
" + ], + "text/plain": [ + " correction_code series_values comparison_values\n", + "2020-01-01 99 0.0 keep\n", + "2020-01-02 99 1.0 keep\n", + "2020-01-03 99 2.0 keep\n", + "2020-01-04 99 3.0 keep\n", + "2020-01-05 0 NaN \n", + "2020-01-06 0 NaN \n", + "2020-01-07 0 NaN \n", + "2020-01-08 0 NaN \n", + "2020-01-09 0 NaN \n", + "2020-01-10 0 NaN " + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# rule_keep_comments\n", + "date_range = pd.date_range(\"2020-01-01\", freq=\"D\", periods=10)\n", + "raw = pd.Series(index=date_range, data=np.arange(10), dtype=float)\n", + "comments = [\"keep\"] * 4 + [\"\"] * 3 + [\"discard\"] * 3\n", + "comment_series = pd.Series(index=raw.index, data=comments)\n", + "c14 = rlib.rule_keep_comments(raw, [\"keep\"], comment_series)\n", + "assert (c14[\"correction_code\"] == 99).sum() == 4\n", + "assert (c14[\"comparison_values\"] == \"keep\").sum() == 4\n", + "c14" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "artesia", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pyproject.toml b/pyproject.toml index a0db3bb..0477869 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" [project] name = "traval" dynamic = ["version"] -description = "Python package for applying automatic error detection algorithms to timeseries. Create custom error detection algorithms to support data validation workflows." +description = "Python package for applying automatic error detection algorithms to time series. 
Create custom error detection algorithms to support data validation workflows." license = { file = "LICENSE" } readme = "readme.md" authors = [{ name = "D.A. Brakenhoff" }] @@ -66,10 +66,23 @@ packages = ["traval"] [tool.setuptools.dynamic] version = { attr = "traval.version.__version__" } -[tool.black] +[tool.ruff] line-length = 88 +extend-include = ["*.ipynb"] -[tool.isort] -profile = "black" -src_paths = ["traval"] -line_length = 88 +[tool.ruff.lint] +# See: https://docs.astral.sh/ruff/rules/ +select = [ + "C4", # flake8-comprehensions + "E", # pycodestyle + "F", # pyflakes + "I", # isort + "PT", # pytest-style + "D", # pydocstyle + "B", # flake8-bugbear + "NPY", # numpy +] +ignore = ["D100", "D102", "D103", "D401"] + +[tool.ruff.lint.pydocstyle] +convention = "numpy" diff --git a/readme.md b/readme.md index f3e57de..6e34fa1 100644 --- a/readme.md +++ b/readme.md @@ -6,63 +6,71 @@ # traval -Tools for applying automatic error detection algorithms to timeseries. +Tools for applying automatic error detection algorithms to time series. ## Introduction -This module is set up to provide tools for applying any error detection -algorithm to any timeseries. The module consists of three main components: +This module is set up to provide tools for applying any error detection +algorithm to any time series. The module consists of three main components: -- `RuleSet`: the RuleSet object is a highly flexible object for defining error detection algorithms based on (user-defined) functions. -- `Detector`: a data management object for storing timeseries and error detection results. -- `SeriesComparison*`: objects for comparing timeseries. These objects include plots for visualizing the comparisons. +- `RuleSet`: the RuleSet object is a highly flexible object for defining error + detection algorithms based on (user-defined) functions. +- `Detector`: a data management object for storing time series and error detection + results. 
+- `SeriesComparison*`: objects for comparing time series. These objects include plots + for visualizing the comparisons. The general workflow consists of the following steps: -1. Define error detection algorithm(s). -2. Load data, i.e. raw timeseries data and optionally timeseries representing the "truth" to see how well the algorithms perform. -3. Initialize Detector objects and apply algorithms to timeseries. -4. Store and analyze the results. +1. Define error detection algorithm(s). +2. Load data, i.e. raw time series data and optionally time series representing the + "truth" to see how well the algorithms perform. +3. Initialize Detector objects and apply algorithms to time series. +4. Store and analyze the results. -For more detailed information and examples, please refer to the notebooks in +For more detailed information and examples, please refer to the notebooks in the examples directory. ## Installation To install the traval module, follow these steps: -1. Clone the repository from GitHub. -2. Open a terminal and navigate to the module root directory: `/traval` -3. Type `pip install -e .` +1. Clone the repository from GitHub. +2. Open a terminal and navigate to the module root directory: `/traval` +3. Type `pip install -e .` ## Usage -The basic usage of the module is described below. To start using the module, +The basic usage of the module is described below. To start using the module, import the package: ```python ->>> import traval +import traval ``` -The first step is generally to define an error detection algorithm. This is +The first step is generally to define an error detection algorithm. This is done with the `RuleSet` object: ```python ->>> ruleset = traval.RuleSet("my_first_algorithm") +ruleset = traval.RuleSet("my_first_algorithm") ``` -Add a detection rule (using a general rule from the library contained within +Add a detection rule (using a general rule from the library contained within the module). 
In this case the rule states any value above 10.0 is suspect: ```python ->>> ruleset.add_rule("rule1", traval.rulelib.rule_ufunc_threshold , apply_to=0, - kwargs={"ufunc": (np.greater,), "threshold": 10.0}) +ruleset.add_rule( + "rule1", + traval.rulelib.rule_ufunc_threshold, + apply_to=0, + kwargs={"ufunc": (np.greater,), "threshold": 10.0} +) ``` Take a look at the ruleset by just typing `ruleset`: ```python ->>> ruleset +ruleset ``` ```text @@ -71,25 +79,25 @@ RuleSet: 'my_first_algorithm' 1: rule1 0 ``` -Next define a Detector object. This object is designed to store a timeseries -and the intermediate and final results after applying an error detection -algorithm. Initialize the Detector object with some timeseries. In this example -we assume there is a timeseries called `raw_series`: +Next define a Detector object. This object is designed to store a time series +and the intermediate and final results after applying an error detection +algorithm. Initialize the Detector object with some time series. In this example +we assume there is a time series called `raw_series`: ```python >>> detect = traval.Detector(raw_series) ``` -Apply our first algorithm to the timeseries. +Apply our first algorithm to the time series. ```python >>> detect.apply_ruleset(ruleset) ``` -By default, the result of each step in the algorithm is compared to the -original series and stored in the `detect.comparisons` attribute. Take a -look at the comparison between the raw data and the result of the error -detection algorithm. +By default, the result of each step in the algorithm is compared to the +original series and stored in the `detect.comparisons` attribute. Take a +look at the comparison between the raw data and the result of the error +detection algorithm. Since we only defined one step, step 1 represents the final result. @@ -97,7 +105,7 @@ Since we only defined one step, step 1 represents the final result. 
>>> cp = detect.comparisons[1] # result of step 1 = final result ``` -The `SeriesComparison*` objects contain methods to visualize the comparison, +The `SeriesComparison*` objects contain methods to visualize the comparison, or summarize the number of observations in each category: ```python @@ -105,9 +113,9 @@ or summarize the number of observations in each category: >>> cp.summary # series containing number of observations in each category ``` -For more detailed explanation and more complex examples, see the notebook(s) +For more detailed explanation and more complex examples, see the notebook(s) in the examples directory. ## Author -- D.A. Brakenhoff, Artesia, 2020 +- D.A. Brakenhoff, Artesia, 2020 diff --git a/tests/test_001.py b/tests/test_001.py index b83ffc9..ba2dea7 100644 --- a/tests/test_001.py +++ b/tests/test_001.py @@ -1,2 +1,3 @@ +# ruff: noqa: D100 D103 def test_import(): - import traval + pass diff --git a/tests/test_002_ruleset.py b/tests/test_002_ruleset.py index f95790d..0306bcc 100644 --- a/tests/test_002_ruleset.py +++ b/tests/test_002_ruleset.py @@ -1,19 +1,21 @@ +# ruff: noqa: D100 D103 import numpy as np import pandas as pd + import traval from traval.ruleset import RuleSet def func1(s): mask = s > 10 - s = pd.Series(index=s.index, data=0.0) + s = pd.DataFrame(index=s.index, data=0.0, columns=["correction_code"]) s.loc[mask] = np.nan return s def func2(s, val): mask = s < val - s = pd.Series(index=s.index, data=0.0) + s = pd.DataFrame(index=s.index, data=0.0, columns=["correction_code"]) s.loc[mask] = np.nan return s @@ -32,76 +34,80 @@ def func4(*args): return result -def test_init(): +def get_empty_ruleset(): + return RuleSet(name="test") + + +def get_filled_ruleset(): rset = traval.RuleSet(name="test") + rset.add_rule("gt10", func1, apply_to=0) + rset.add_rule("less_than_value", func2, apply_to=1, kwargs={"val": 0}) return rset +def test_init(): + _ = traval.RuleSet(name="test") + + def test_add_rules(): - rset = test_init() + rset = 
traval.RuleSet(name="test") rset.add_rule("gt10", func1, apply_to=0) rset.add_rule("less_than_value", func2, apply_to=1, kwargs={"val": 0}) - return rset def test_update_rules(): - rset = test_add_rules() + rset = traval.RuleSet(name="test") + rset.add_rule("gt10", func1, apply_to=0) + rset.add_rule("less_than_value", func2, apply_to=1, kwargs={"val": 0}) rset.update_rule("less_than_value", func2, apply_to=1, kwargs={"val": func3}) - return rset def test_to_dataframe(): - rset = test_add_rules() - rdf = rset.to_dataframe() - return rdf + rset = get_filled_ruleset() + _ = rset.to_dataframe() def test_applyself_static_kwargs(): series = pd.Series(index=range(10), data=range(-5, 23, 3), name="test_series") - rset = test_add_rules() + rset = get_filled_ruleset() _, _ = rset(series) - return def test_applyself_callable_kwargs(): series = pd.Series(index=range(10), data=range(-5, 23, 3), name="test_series") - rset = test_update_rules() + rset = get_filled_ruleset() + rset.update_rule("less_than_value", func2, apply_to=1, kwargs={"val": func3}) _, _ = rset(series) - return def test_applyself_combine(): - rset = test_init() + rset = traval.RuleSet(name="test") rset.add_rule("+1", lambda s: s + 1, apply_to=0) rset.add_rule("add 0+1", func4, apply_to=(0, 1)) series = pd.Series(index=range(10), data=0.0, name="test_series") d, _ = rset(series) assert (d[len(d) - 1] == 1.0).all() - return d def test_del_rules(): - rset = test_add_rules() + rset = get_filled_ruleset() rset.del_rule("gt10") assert len(rset.rules) == 1 - return def test_to_from_pickle(): - rset = test_add_rules() + rset = get_filled_ruleset() rset.to_pickle("test.pkl") rset = RuleSet.from_pickle("test.pkl") import os os.remove("test.pkl") - return def test_to_from_json(): - rset = test_add_rules() + rset = get_filled_ruleset() rset.to_json("test.json") rset = RuleSet.from_json("test.json") import os os.remove("test.json") - return diff --git a/tests/test_003_detector.py b/tests/test_003_detector.py index 
1b91318..e8588d1 100644 --- a/tests/test_003_detector.py +++ b/tests/test_003_detector.py @@ -1,22 +1,42 @@ +# ruff: noqa: D100 D103 import numpy as np import pandas as pd +from test_002_ruleset import get_filled_ruleset + import traval -from test_002_ruleset import test_add_rules + +def get_detector(): + s = pd.Series( + index=range(10), data=np.arange(-5, 23, 3, dtype=float), name="test_series" + ) + return traval.Detector(s) + + +def get_detector_with_result(): + d = get_detector() + rset = get_filled_ruleset() + t = pd.Series( + index=range(10), data=np.arange(-5, 23, 3, dtype=float), name="test_series" + ) + t[t < 0] = np.nan + t[t > 10] = np.nan + d = get_detector() + d.set_truth(t) + d.apply_ruleset(rset) + return d def test_init_detector(): s = pd.Series( index=range(10), data=np.arange(-5, 23, 3, dtype=float), name="test_series" ) - d = traval.Detector(s) - return d + traval.Detector(s) def test_repr(): - d = test_init_detector() + d = get_detector() d.__repr__() - return d def test_add_truth(): @@ -25,56 +45,53 @@ def test_add_truth(): ) t[t < 0] = np.nan t[t > 10] = np.nan - d = test_init_detector() + d = get_detector() d.set_truth(t) - return d def test_apply_ruleset(): - rset = test_add_rules() - d = test_add_truth() + rset = get_filled_ruleset() + t = pd.Series( + index=range(10), data=np.arange(-5, 23, 3, dtype=float), name="test_series" + ) + t[t < 0] = np.nan + t[t > 10] = np.nan + d = get_detector() + d.set_truth(t) d.apply_ruleset(rset) - return d def test_reset(): - d = test_apply_ruleset() + d = get_detector_with_result() d.reset() assert not hasattr(d, "ts_result") - return def test_confusion_matrix(): - d = test_apply_ruleset() + d = get_detector_with_result() _ = d.confusion_matrix() - return def test_uniqueness(): - d = test_apply_ruleset() + d = get_detector_with_result() _ = d.uniqueness() - return def test_plot_overview(): - d = test_apply_ruleset() + d = get_detector_with_result() _ = d.plot_overview() - return def test_get_series(): - 
d = test_apply_ruleset() + d = get_detector_with_result() _ = d.get_series(2, category="tp") - return def test_get_corrections(): - d = test_apply_ruleset() + d = get_detector_with_result() _ = d.get_corrections_dataframe() - return def test_get_final_result(): - d = test_apply_ruleset() + d = get_detector_with_result() _ = d.get_final_result() - return diff --git a/tests/test_004_comparison.py b/tests/test_004_comparison.py index b7199cc..082fbae 100644 --- a/tests/test_004_comparison.py +++ b/tests/test_004_comparison.py @@ -1,5 +1,7 @@ +# ruff: noqa: D100 D103 import numpy as np import pandas as pd + import traval @@ -12,8 +14,7 @@ def test_series_comparison(): s2 = pd.Series(index=idx2, data=2.0) s2.loc["2020-04-01":"2020-04-30"] = np.nan - sc = traval.SeriesComparison(s1, s2) - return sc + _ = traval.SeriesComparison(s1, s2) def test_series_relative_comparison(): @@ -72,8 +73,6 @@ def test_series_relative_comparison(): for k, v in checkresult.items(): assert summary.loc[k] == v - return scr - def test_relative_comparison_stats(): base_idx = pd.date_range("2020-01-01", periods=110, freq="D") @@ -94,9 +93,6 @@ def test_relative_comparison_stats(): assert scr.bc.false_positive_rate + scr.bc.specificity == 1 assert scr.bc.false_negative_rate + scr.bc.sensitivity == 1 - return scr - -def test_confusion_matrix(): - cp = test_relative_comparison_stats() - return cp.bc.confusion_matrix() + # test confusion matrix + scr.bc.confusion_matrix() diff --git a/tests/test_005_plots.py b/tests/test_005_plots.py index 1181485..b8996dc 100644 --- a/tests/test_005_plots.py +++ b/tests/test_005_plots.py @@ -1,5 +1,7 @@ +# ruff: noqa: D100 D103 import numpy as np import pandas as pd + from traval import SeriesComparison, SeriesComparisonRelative @@ -20,10 +22,9 @@ def test_series_comparison_plot(): sc = SeriesComparison(s1, s2) - ax = sc.plots.plot_series_comparison( + sc.plots.plot_series_comparison( mark_different=True, mark_identical=True, mark_unique=True ) - return ax def 
test_relative_series_comparison_plot(): @@ -43,8 +44,6 @@ def test_relative_series_comparison_plot(): scr = SeriesComparisonRelative(s1, s2, b) - ax = scr.plots.plot_relative_comparison( + scr.plots.plot_relative_comparison( mark_unique=True, mark_different=True, mark_identical=True, mark_introduced=True ) - - return ax diff --git a/tests/test_006_rulelib.py b/tests/test_006_rulelib.py index efd945b..d37cbeb 100644 --- a/tests/test_006_rulelib.py +++ b/tests/test_006_rulelib.py @@ -1,6 +1,7 @@ +# ruff: noqa: D100 D103 import numpy as np import pandas as pd -import pytest + from traval import rulelib as rlib @@ -9,8 +10,7 @@ def test_rule_ufunc_threshold_float(): date_range = pd.date_range("2020-01-01", freq="D", periods=10) s1 = pd.Series(index=date_range, data=np.arange(10)) c1 = rlib.rule_ufunc_threshold(s1, (np.greater_equal,), 5) - assert c1.iloc[5:].isna().sum() == 5 - return c1 + assert (c1["correction_code"] == 2).sum() == 5 def test_rule_ufunc_threshold_series(): @@ -20,8 +20,7 @@ def test_rule_ufunc_threshold_series(): idx = date_range[:3].to_list() + date_range[-4:-1].to_list() thresh_series = pd.Series(index=idx, data=5.0) c2 = rlib.rule_ufunc_threshold(s1, (np.greater_equal,), thresh_series) - assert c2.iloc[5:].isna().sum() == 5 - return c2 + assert (c2["correction_code"] == 2).sum() == 5 def test_rule_diff_ufunc_threshold(): @@ -30,8 +29,7 @@ def test_rule_diff_ufunc_threshold(): s1 = pd.Series(index=date_range, data=np.arange(10)) s1.loc[date_range[4]] += 1 c3 = rlib.rule_diff_ufunc_threshold(s1, (np.greater_equal,), 1.1) - assert c3.iloc[4:5].isna().all() - return c3 + assert (c3["correction_code"] == 2).sum() == 1 def test_rule_other_ufunc_threshold(): @@ -40,8 +38,7 @@ def test_rule_other_ufunc_threshold(): s1 = pd.Series(index=date_range, data=np.arange(10)) val = s1.copy() c4 = rlib.rule_other_ufunc_threshold(s1, val, (np.less,), 5) - assert c4.iloc[:5].isna().sum() == 5 - return c4 + assert (c4["correction_code"] == -2).sum() == 5 def 
test_rule_max_gradient(): @@ -50,8 +47,7 @@ def test_rule_max_gradient(): s1 = pd.Series(index=date_range, data=np.arange(10)) s1.loc[date_range[4]] += 1 c5 = rlib.rule_max_gradient(s1, max_step=1.0, max_timestep="1D") - assert c5.iloc[4:5].isna().all() - return c5 + assert (c5["correction_code"] == 2).sum() == 1 def test_rule_spike_detection(): @@ -60,8 +56,7 @@ def test_rule_spike_detection(): s1 = pd.Series(index=date_range, data=np.arange(10)) s1.iloc[4] += 3 c6 = rlib.rule_spike_detection(s1, threshold=2, spike_tol=2) - assert c6.iloc[4:5].isna().all() - return c6 + assert (c6["correction_code"] == 99).sum() == 1 def test_offset_detection(): @@ -70,8 +65,7 @@ def test_offset_detection(): s1 = pd.Series(index=date_range, data=np.arange(10)) s1.iloc[3:7] += 10 c7 = rlib.rule_offset_detection(s1, threshold=5, updown_diff=2.0) - assert c7.iloc[3:7].isna().sum() == 4 - return c7 + assert (c7["correction_code"] == 99).sum() == 4 def test_rule_outside_n_sigma(): @@ -79,8 +73,8 @@ def test_rule_outside_n_sigma(): date_range = pd.date_range("2020-01-01", freq="D", periods=10) s1 = pd.Series(index=date_range, data=np.arange(10)) c8 = rlib.rule_outside_n_sigma(s1, n=1.0) - assert c8.iloc[[0, 1, 8, 9]].isna().sum() == 4 - return c8 + assert (c8["correction_code"] == -2).sum() == 2 + assert (c8["correction_code"] == 2).sum() == 2 def test_rule_diff_outside_of_n_sigma(): @@ -88,9 +82,8 @@ def test_rule_diff_outside_of_n_sigma(): date_range = pd.date_range("2020-01-01", freq="D", periods=10) s1 = pd.Series(index=date_range, data=np.arange(10)) s1.iloc[5:] += np.arange(5) - c9 = rlib.rule_diff_outside_of_n_sigma(s1, 1.0) - assert c9.iloc[6:].isna().sum() == 4 - return c9 + c9 = rlib.rule_diff_outside_of_n_sigma(s1, 2.0) + assert (c9["correction_code"] == 2).sum() == 4 def test_rule_outside_bandwidth(): @@ -100,8 +93,19 @@ def test_rule_outside_bandwidth(): lb = pd.Series(index=date_range[[0, -1]], data=[1, 2]) ub = pd.Series(index=date_range[[0, -1]], data=[7, 8]) c10 = 
rlib.rule_outside_bandwidth(s1, lb, ub) - assert c10.iloc[[0, 1, 8, 9]].isna().sum() == 4 - return c10 + assert (c10["correction_code"] == -2).sum() == 2 + assert (c10["correction_code"] == 2).sum() == 2 + + +def test_rule_compare_to_manual_obs(): + # rule_shift_to_manual_obs + date_range = pd.date_range("2020-01-01", freq="D", periods=10) + s1 = pd.Series(index=date_range, data=np.arange(10)) + h = pd.Series(index=date_range[[1, -1]], data=[2, 7]) + c11 = rlib.rule_compare_to_manual_obs( + s1, h, threshold=1.0, max_dt="2D", method="linear" + ) + assert (c11["correction_code"] == -2).sum() == 3 def test_rule_shift_to_manual_obs(): @@ -112,7 +116,6 @@ def test_rule_shift_to_manual_obs(): a = rlib.rule_shift_to_manual_obs(s1, h, max_dt="2D") assert (a.iloc[1:] == s1.iloc[1:] + 1).all() assert a.iloc[0] == s1.iloc[0] - return a def test_rule_combine_nan_or(): @@ -122,9 +125,18 @@ def test_rule_combine_nan_or(): s2 = s1.copy() s1.iloc[0] = np.nan s2.iloc[-1] = np.nan - c11 = rlib.rule_combine_nan_or(s1, s2) - assert c11.iloc[[0, -1]].isna().sum() == 2 - return c11 + c11a = rlib.rule_combine_nan_or(s1, s2) + assert c11a.iloc[[0, -1]].isna().sum() == 2 + + +def test_rule_combine_corrections_or(): + date_range = pd.date_range("2020-01-01", freq="D", periods=10) + s1 = pd.DataFrame(index=date_range, columns=["correction_code"], data=0) + s2 = s1.copy() + s1.iloc[0] = 99 + s2.iloc[-1] = -2 + c11b = rlib.rule_combine_corrections_or(s1, s2) + assert (c11b["correction_code"] == 99).sum() == 2 def test_rule_combine_nan_and(): @@ -134,9 +146,19 @@ def test_rule_combine_nan_and(): s2 = s1.copy() s1.iloc[0:2] = np.nan s2.iloc[1:3] = np.nan - c11 = rlib.rule_combine_nan_and(s1, s2) - assert c11.isna().sum() == 2 - return c11 + c12a = rlib.rule_combine_nan_and(s1, s2) + assert c12a.isna().sum() == 2 + + +def test_rule_combine_corrections_and(): + # rule_combine_nan + date_range = pd.date_range("2020-01-01", freq="D", periods=10) + s1 = pd.DataFrame(index=date_range, 
columns=["correction_code"], data=0) + s2 = s1.copy() + s1.iloc[0:2] = 99 + s2.iloc[1:3] = -2 + c12b = rlib.rule_combine_corrections_and(s1, s2) + assert (c12b["correction_code"] == 99).sum() == 1 def test_rule_funcdict_to_nan(): @@ -144,9 +166,8 @@ def test_rule_funcdict_to_nan(): date_range = pd.date_range("2020-01-01", freq="D", periods=10) s1 = pd.Series(index=date_range, data=np.arange(10)) fdict = {"lt_3": lambda s: s < 3.0, "gt_7": lambda s: s > 7.0} - c12 = rlib.rule_funcdict_to_nan(s1, fdict) - assert c12.iloc[[0, 1, 2, -2, -1]].isna().sum() == 5 - return c12 + c13 = rlib.rule_funcdict(s1, fdict) + assert (c13["correction_code"] == 99).sum() == 5 def test_rule_keep_comments(): @@ -155,16 +176,6 @@ def test_rule_keep_comments(): raw = pd.Series(index=date_range, data=np.arange(10), dtype=float) comments = ["keep"] * 4 + [""] * 3 + ["discard"] * 3 comment_series = pd.Series(index=raw.index, data=comments) - val = raw.copy() - val += 1.0 - val.loc[comment_series == "keep"] = np.nan - f = rlib.rule_keep_comments(raw, ["keep"], comment_series, val) - assert (f.loc[comment_series == "keep"] == 0).all() - assert (f.loc[comment_series != "keep"] == 1).all() - - -@pytest.mark.skip -def test_rule_pastas_outside_pi(): - # rule_pastas_outside_pi - # skip for now - pass + c14 = rlib.rule_keep_comments(raw, ["keep"], comment_series) + assert (c14["correction_code"] == 99).sum() == 4 + assert (c14["comparison_values"] == "keep").sum() == 4 diff --git a/tests/test_007_binaryclassifier.py b/tests/test_007_binaryclassifier.py index b8467c0..b548b3f 100644 --- a/tests/test_007_binaryclassifier.py +++ b/tests/test_007_binaryclassifier.py @@ -1,14 +1,15 @@ +# ruff: noqa: D100 D103 from pandas import Series + from traval import BinaryClassifier -def test_bc(): - bc = BinaryClassifier(9, 1, 9, 1) - return bc +def get_bc(): + return BinaryClassifier(9, 1, 9, 1) def test_all_stats(): - bc = test_bc() + bc = get_bc() stats = bc.get_all_statistics() answer = { "tp": 9.0, @@ -31,14 
+32,12 @@ def test_all_stats(): "mcc": 0.8, } assert (stats == Series(answer)).all() - return def test_add(): - bc = test_bc() + bc = get_bc() bcsum = bc + bc assert bcsum.tp == 18 assert bcsum.fp == 2 assert bcsum.tn == 18 assert bcsum.fn == 2 - return diff --git a/tests/test_008_travalparameters.py b/tests/test_008_travalparameters.py index 85a2e8f..4910959 100644 --- a/tests/test_008_travalparameters.py +++ b/tests/test_008_travalparameters.py @@ -1,6 +1,8 @@ +# ruff: noqa: D100 D103 import os import numpy as np + from traval import RuleSet, TravalParameters, rulelib @@ -51,18 +53,17 @@ def get_ruleset2(): def test_tp_from_ruleset(): rset = get_ruleset1() - tp = TravalParameters.from_ruleset(rset) - return tp + TravalParameters.from_ruleset(rset) def test_tp_from_ruleset_w_locations(): rset = get_ruleset1() - tp = TravalParameters.from_ruleset(rset, locations=["loc1"]) - return tp + TravalParameters.from_ruleset(rset, locations=["loc1"]) def test_tp_get_parameters_defaults(): - tp = test_tp_from_ruleset() + rset = get_ruleset1() + tp = TravalParameters.from_ruleset(rset) _ = tp.get_parameters() # return all defaults _ = tp.get_parameters(rulename="gt10") # return all params for rule p3 = tp.get_parameters(rulename="gt10", parameter="threshold") # value @@ -76,11 +77,11 @@ def test_tp_get_parameters_defaults(): tp.get_parameters(rulename="gt10", parameter="non-existent-param") except KeyError: pass - return def test_tp_get_parameters_location_specific(): - tp = test_tp_from_ruleset_w_locations() + rset = get_ruleset1() + tp = TravalParameters.from_ruleset(rset, locations=["loc1"]) _ = tp.get_parameters() # return all defaults _ = tp.get_parameters(location="loc1") # return all for location # return loc params for rule @@ -100,7 +101,6 @@ def test_tp_get_parameters_location_specific(): ) except KeyError: pass - return def test_tp_to_from_csv(): @@ -112,7 +112,6 @@ def test_tp_to_from_csv(): mask = tp.defaults["value"].apply(lambda s: tp._test_callable(s)) assert 
(tp.defaults.loc[~mask].index == tp2.defaults.index).all() assert (tp.defaults.loc[~mask, "value"] == tp2.defaults.loc[~mask, "value"]).all() - return def test_tp_to_from_json(): @@ -124,7 +123,6 @@ def test_tp_to_from_json(): mask = tp.defaults["value"].apply(lambda s: tp._test_callable(s)) assert (tp.defaults.loc[~mask].index == tp2.defaults.index).all() assert (tp.defaults.loc[~mask, "value"] == tp2.defaults.loc[~mask, "value"]).all() - return def test_tp_to_from_pickle(): @@ -135,4 +133,3 @@ def test_tp_to_from_pickle(): os.remove("test.pkl") assert (tp.defaults.index == tp2.defaults.index).all() assert (tp.defaults["value"] == tp2.defaults["value"]).all() - return diff --git a/traval/binary_classifier.py b/traval/binary_classifier.py index bb653e7..563b3d8 100644 --- a/traval/binary_classifier.py +++ b/traval/binary_classifier.py @@ -53,7 +53,7 @@ def from_series_comparison_relative(cls, comparison): Parameters ---------- comparison : traval.SeriesComparisonRelative - object comparing two timeseries with base timeseries + object comparing two time series with base time series Returns ------- @@ -89,7 +89,7 @@ def from_confusion_matrix(cls, cmat): BinaryClassifier BinaryClassifier object based on values in confusion matrix. 
- See also + See Also -------- BinaryClassifier.confusion_matrix : for explanation (of abbreviations) """ @@ -154,7 +154,6 @@ def confusion_matrix(self, as_array=False): data : pd.DataFrame or np.array confusion matrix """ - # create array with data data = np.zeros((2, 2), dtype=int) # true positives = errors correctly identified @@ -192,7 +191,7 @@ def matthews_correlation_coefficient(self): phi : float the Matthews correlation coefficient - See also + See Also -------- mcc : convenience method for calculating MCC """ @@ -218,7 +217,7 @@ def mcc(self): phi : float the Matthews correlation coefficient - See also + See Also -------- matthews_correlation_coefficient : more information about the statistic """ @@ -268,7 +267,7 @@ def specificity(self): def true_positive_rate(self): """True Positive Rate. Synonym for sensitivity. - See sensitiviy for description. + See sensitivity for description. """ return self.sensitivity @@ -349,7 +348,7 @@ def accuracy(self): @property def prevalence(self): - """Prevalance of true errors in total population. + """Prevalence of true errors in total population. Prevalence = (TP + FN) / (TP + FP + FN + TN) @@ -434,7 +433,6 @@ def get_all_statistics(self, use_abbreviations=True): s : pandas.Series series containing all statistics """ - sdict = {} for k, v in self.stats_abbreviations.items(): if use_abbreviations: diff --git a/traval/detector.py b/traval/detector.py index 8762bf6..005b9e7 100755 --- a/traval/detector.py +++ b/traval/detector.py @@ -5,23 +5,29 @@ import pandas as pd from .ts_comparison import SeriesComparison, SeriesComparisonRelative -from .ts_utils import unique_nans_in_series +from .ts_utils import ( + corrections_as_float, + corrections_as_nan, + mask_corrections_modified_value, + mask_corrections_no_comparison_value, + unique_nans_in_series, +) class Detector: - """Detector object for applying error detection algorithms to timeseries. + """Detector object for applying error detection algorithms to time series. 
- The Detector is used to apply error detection algorithms to a timeseries + The Detector is used to apply error detection algorithms to a time series and optionally contains a 'truth' series, to which the error detection result can be compared. An example of a 'truth' series is a manually - validated timeseries. Custom error detection algorithms can be defined + validated time series. Custom error detection algorithms can be defined using the RuleSet object. Parameters ---------- series : pd.Series or pd.DataFrame - timeseries to check + time series to check truth : pd.Series or pd.DataFrame, optional series that represents the 'truth', i.e. a benchmark to which the error detection result can be compared, by default None @@ -29,15 +35,14 @@ class Detector: Examples -------- - - Given a timeseries 'series' and some ruleset 'rset': + Given a time series 'series' and some ruleset 'rset': >>> d = Detector(series) >>> d.apply_ruleset(rset) >>> d.plot_overview() - See also + See Also -------- traval.RuleSet : object for defining detection algorithms """ @@ -48,7 +53,7 @@ def __init__(self, series, truth=None): Parameters ---------- series : pd.Series or pd.DataFrame - timeseries to check + time series to check truth : pd.Series or pd.DataFrame, optional series that represents the 'truth', i.e. a benchmark to which the error detection result can be compared, by default None @@ -77,7 +82,7 @@ def _validate_input_series(series): Parameters ---------- series : object - timeseries to check, must be pd.Series or pd.DataFrame. Datatype + time series to check, must be pd.Series or pd.DataFrame. Datatype of series or first column of DataFrame must be float. Raises @@ -85,7 +90,6 @@ def _validate_input_series(series): TypeError if series or dtype of series does not comply """ - # check pd.Series or pd.DataFrame if isinstance(series, pd.Series): dtype = series.dtypes @@ -123,7 +127,7 @@ def apply_ruleset(self, ruleset, compare=True): for convenience. 
- See also + See Also -------- traval.RuleSet : object for defining detection algorithms """ @@ -238,8 +242,8 @@ def confusion_matrix(self, steps=None, truth=None): def uniqueness(self, truth=None): """Calculate unique contribution per rule to stats. - Note: the calculated statistics per rule contain an undercount, - i.e. when multiple rules mark the same observatin as suspect it is + Note: the calculated statistics per rule are under counted, + i.e. when multiple rules mark the same observation as suspect it is not contained in this result. Parameters @@ -388,7 +392,7 @@ def get_comment_series(self, steps=None): rulenames = [self.ruleset.get_step_name(i) for i in steps] # get corrections - corr = self.get_corrections_dataframe() + corr = self.get_corrections_dataframe(as_correction_codes=True) if corr.empty: corr = pd.DataFrame(index=self.series.index, columns=rulenames, data=0.0) @@ -397,8 +401,8 @@ def get_comment_series(self, steps=None): comments = [] for col in corr.columns: - s = corr[col].copy() - s = s.replace(0.0, "").replace(np.nan, col) + s = pd.Series(index=corr.index, data=col) + s.loc[corr[col] == 0] = "" comments.append(s) comments = pd.concat(comments, axis=1).apply( @@ -422,12 +426,12 @@ def get_results_dataframe(self): return df def get_final_result(self): - """Get final timeseries with flagged values set to NaN. + """Get final time series with flagged values set to NaN. Returns ------- series : pandas.Series - Timeseries produced by final step in RuleSet with flagged + time series produced by final step in RuleSet with flagged values set to NaN. """ key = len(self.results.keys()) - 1 @@ -435,29 +439,54 @@ def get_final_result(self): s.name = self.name return s - def get_corrections_dataframe(self): + def get_corrections_dataframe(self, as_correction_codes=False, as_addable_df=False): """Get DataFrame containing corrections. 
+ Parameters + ---------- + as_correction_codes : bool, optional + return DataFrame with correction codes, by default False + as_addable_df : bool, optional + return DataFrame with corrections dataframe that you can add to the original + time series to obtain the final result. Corrections are NaN when errors are + detected, and nonzero where observations are shifted, and zero everywhere + else. + Returns ------- df : pandas.DataFrame - DataFrame containing corrections. NaN means value is flagged - as suspicious, 0.0 means no correction. + DataFrame containing corrections. """ + if as_correction_codes and as_addable_df: + raise ValueError( + "Only one of 'as_correction_codes' and 'as_addable_df' can be True!" + ) clist = [] for s in self.corrections.values(): if isinstance(s, np.ndarray): - s = pd.Series(dtype=float) - clist.append(s.fillna(-9999)) - - # corrections are nan, 0.0 means nothing is changed - df = ( - pd.concat(clist, axis=1) - .isna() - .astype(float) - .replace(0.0, np.nan) - .replace(1.0, 0.0) - ) + if as_addable_df: + s = pd.Series() + else: + s = pd.Series(name="correction_code") + elif isinstance(s, pd.DataFrame) and "correction_code" in s.columns: + if as_addable_df: + s = corrections_as_nan(s) + corrections_as_float(s) + else: + s = s["correction_code"] + elif isinstance(s, pd.Series): + if as_correction_codes: + s = mask_corrections_no_comparison_value(s, s.isna()).add( + mask_corrections_modified_value(s, s, (s.notnull() & s != 0.0)), + fill_value=0, + ) + s = s["correction_code"] + + clist.append(s) + + # corrections, 0 means nothing is changed, nan means value is missing + df = pd.concat(clist, axis=1) + if as_correction_codes: + df = df.infer_objects(copy=False).fillna(0).astype(int) df.columns = list(self.ruleset.rules.keys()) return df @@ -506,7 +535,7 @@ def get_corrections_comparison(self, truth=None): return df def plot_overview(self, mark_suspects=True, **kwargs): - """Plot timeseries with flagged values per applied rule. 
+ """Plot time series with flagged values per applied rule. Parameters ---------- @@ -518,8 +547,6 @@ def plot_overview(self, mark_suspects=True, **kwargs): ax : list of matplotlib.pyplot.Axes axes objects """ - resultsdf = self.get_results_dataframe() - if "figsize" in kwargs: figsize = kwargs.pop("figsize") else: @@ -534,16 +561,17 @@ def plot_overview(self, mark_suspects=True, **kwargs): **kwargs, ) - for iax, icol in zip(axes, resultsdf): - iax.plot(resultsdf.index, resultsdf[icol], label=icol) + for icol, iax in enumerate(axes): + iresult = self.results[icol] + iax.plot(iresult.index, iresult, label=self.ruleset.get_step_name(icol)) if mark_suspects: - if icol != resultsdf.columns[0]: - corr = self.corrections[resultsdf.columns.get_loc(icol)] - if isinstance(corr, pd.Series): + if icol != 0: + icorr = self.corrections[icol] + if isinstance(icorr, pd.DataFrame): iax.plot( - corr.index, - resultsdf.loc[corr.index].iloc[:, 0], + icorr.index, + self.results[0].loc[icorr.index], marker="x", c="C3", ls="none", diff --git a/traval/params.py b/traval/params.py index d61e2a3..90a2749 100644 --- a/traval/params.py +++ b/traval/params.py @@ -409,7 +409,7 @@ def _combine_parameter_dfs(self): @staticmethod def _test_callable(f): - """Method to test whether parameter value is a callable. + """Test whether parameter value is a callable. Also returns True if callable is stored in a tuple. diff --git a/traval/plots.py b/traval/plots.py index 4d761fd..7edf91f 100755 --- a/traval/plots.py +++ b/traval/plots.py @@ -6,7 +6,7 @@ class ComparisonPlots: - """Mix-in class for plots for comparing timeseries.""" + """Mix-in class for plots for comparing time series.""" color_dict = { "only_in_s1": {"color": "orange"}, @@ -60,16 +60,16 @@ def reset_color_dict(self): def plot_series_comparison( self, mark_unique=True, mark_different=True, mark_identical=True, ax=None ): - """Plot comparison between two timeseries. + """Plot comparison between two time series. 
Parameters ---------- mark_unique : bool, optional mark unique values with colored X's, by default True mark_different : bool, optional - highlight where timeseries differ with red, by default True + highlight where time series differ with red, by default True mark_identical : bool, optional - highlight where timeseries are identical with green, + highlight where time series are identical with green, by default True ax : axis, optional axis object to plot on, by default None @@ -79,7 +79,6 @@ def plot_series_comparison( ax : axis axis object """ - if ax is None: fig, ax = plt.subplots(1, 1, figsize=(12, 5)) else: @@ -174,7 +173,7 @@ def plot_relative_comparison( mark_introduced=False, ax=None, ): - """Plot comparison between two timeseries relative to base timeseries. + """Plot comparison between two time series relative to base time series. Parameters ---------- @@ -185,7 +184,7 @@ def plot_relative_comparison( mark_identical : bool, optional highlight where series are identical with green, by default True mark_introduced : bool, optional - mark observations that are not in the base timeseries with X's, + mark observations that are not in the base time series with X's, by default False ax : axis, optional axis to plot on, by default None @@ -195,7 +194,6 @@ def plot_relative_comparison( ax : axis axis handle """ - ax = self.plot_series_comparison( mark_unique=mark_unique, mark_different=mark_different, @@ -479,7 +477,6 @@ def det_plot(fpr, fnr, labels, ax=None, **kwargs): ax : matplotlib.pyplot.Axes axes handle """ - if not isinstance(fpr, list): fpr = [fpr] if not isinstance(fnr, list): diff --git a/traval/rulelib.py b/traval/rulelib.py index 04024cb..6e12c9d 100755 --- a/traval/rulelib.py +++ b/traval/rulelib.py @@ -5,9 +5,16 @@ import pandas as pd from .ts_utils import ( + CorrectionCode, diff_with_gap_awareness, + get_empty_corrections_df, interpolate_series_to_new_index, - mask_corrections_as_nan, + mask_corrections_above_below, + 
mask_corrections_above_threshold, + mask_corrections_below_threshold, + mask_corrections_equal_value, + mask_corrections_no_comparison_value, + mask_corrections_not_equal_value, resample_short_series_to_long_series, smooth_lower_bound, smooth_upper_bound, @@ -15,7 +22,18 @@ ) -def rule_funcdict_to_nan(series, funcdict): +def _ufunc_corrections(series, ufunc, threshold, mask): + if "greater" in ufunc.__name__: + return mask_corrections_above_threshold(series, threshold, mask) + elif "less" in ufunc.__name__: + return mask_corrections_below_threshold(series, threshold, mask) + elif ufunc.__name__ == "equal": + return mask_corrections_equal_value(series, threshold, mask) + else: + return mask_corrections_not_equal_value(series, threshold, mask) + + +def rule_funcdict(series, funcdict): """Detection rule, flag values with dictionary of functions. Use dictionary of functions to identify suspect values and set @@ -24,17 +42,17 @@ def rule_funcdict_to_nan(series, funcdict): Parameters ---------- series : pd.Series - timeseries in which suspect values are identified + time series in which suspect values are identified funcdict : dict dictionary with function names as keys and functions/methods as - values. Each function is applied to each value in the timeseries + values. Each function is applied to each value in the time series using `series.apply(func)`. Suspect values are those where the function evaluates to True. Returns ------- corrections: pd.Series - a series with same index as the input timeseries containing + a series with same index as the input time series containing corrections. Suspect values (according to the provided functions) are set to np.nan. 
""" @@ -43,19 +61,19 @@ def rule_funcdict_to_nan(series, funcdict): mask = series.apply(func) else: mask = or_(mask, series.apply(func)) - return mask_corrections_as_nan(series, mask) + return mask_corrections_no_comparison_value(series, mask) def rule_max_gradient(series, max_step=0.5, max_timestep="1D"): """Detection rule, flag values when maximum gradient exceeded. - Set values tot NaN when maximum gradient between two - observations is exceeded. + Flag values when maximum gradient between two observations is exceeded. + Use negative max_step to flag values with negative gradient. Parameters ---------- series : pd.Series - timeseries in which suspect values are identified + time series in which suspect values are identified max_step : float, optional max jump between two observations within given timestep, by default 0.5 @@ -66,15 +84,19 @@ def rule_max_gradient(series, max_step=0.5, max_timestep="1D"): Returns ------- corrections: pd.Series - a series with same index as the input timeseries containing + a series with same index as the input time series containing corrections. Suspect values are set to np.nan. """ - conversion = pd.Timedelta(max_timestep) / pd.Timedelta("1S") + conversion = pd.Timedelta(max_timestep) / pd.Timedelta("1s") grad = ( series.diff() / series.index.to_series().diff().dt.total_seconds() * conversion ) - mask = grad.abs() > max_step - return mask_corrections_as_nan(series, mask) + if max_step > 0.0: + mask = grad > max_step + return mask_corrections_above_threshold(series, max_step, mask) + else: + mask = grad < -max_step + return mask_corrections_below_threshold(series, max_step, mask) def rule_hardmax(series, threshold, offset=0.0): @@ -100,13 +122,13 @@ def rule_ufunc_threshold(series, ufunc, threshold, offset=0.0): Parameters ---------- series : pd.Series - timeseries in which suspect values are identified + time series in which suspect values are identified ufunc : tuple tuple containing ufunc (i.e. (numpy.greater_equal,) ). 
The function must be callable according to `ufunc(series, threshold)`. The function is passed as a tuple to bypass RuleSet logic. threshold : float or pd.Series - value or timeseries to compare series with + value or time series to compare series with offset : float, optional value that is added to the threshold, e.g. if some extra tolerance is allowable. Default value is 0.0. @@ -114,16 +136,18 @@ def rule_ufunc_threshold(series, ufunc, threshold, offset=0.0): Returns ------- corrections: pd.Series - a series with same index as the input timeseries containing + a series with same index as the input time series containing corrections. Suspect values are set to np.nan. """ ufunc = ufunc[0] if isinstance(threshold, pd.Series): full_threshold_series = resample_short_series_to_long_series(threshold, series) + threshold = full_threshold_series.add(offset) mask = ufunc(series, full_threshold_series.add(offset)) else: - mask = ufunc(series, threshold + offset) - return mask_corrections_as_nan(series, mask) + threshold = threshold + offset + mask = ufunc(series, threshold) + return _ufunc_corrections(series, ufunc, threshold, mask) def rule_diff_ufunc_threshold(series, ufunc, threshold, max_gap="7D"): @@ -142,13 +166,13 @@ def rule_diff_ufunc_threshold(series, ufunc, threshold, max_gap="7D"): Parameters ---------- series : pd.Series - timeseries in which suspect values are identified + time series in which suspect values are identified ufunc : tuple tuple containing ufunc (i.e. (numpy.greater_equal,) ). The function must be callable according to `ufunc(series, threshold)`. The function is passed as a tuple to bypass RuleSet logic. threshold : float - value to compare diff of timeseries to + value to compare diff of time series to max_gap : str, optional only considers observations within this maximum gap between measurements to calculate diff, by default "7D". 
@@ -156,21 +180,20 @@ def rule_diff_ufunc_threshold(series, ufunc, threshold, max_gap="7D"): Returns ------- corrections: pd.Series - a series with same index as the input timeseries containing + a series with same index as the input time series containing corrections. Suspect values are set to np.nan. """ ufunc = ufunc[0] # identify gaps and set diff value after gap to nan diff = diff_with_gap_awareness(series, max_gap=max_gap) - mask = ufunc(diff.abs(), threshold) - return mask_corrections_as_nan(series, mask) + mask = ufunc(diff, threshold) + return _ufunc_corrections(series, ufunc, threshold, mask) def rule_other_ufunc_threshold(series, other, ufunc, threshold): """Detection rule, flag values based on other series and threshold. - Set values to Nan based on comparison of another timeseries with a - threshold value. + Correct values based on comparison of another time series with a threshold value. The argument ufunc is a tuple containing an operator function (i.e. '>', '<', '>=', '<='). These are passed using their named equivalents, e.g. in @@ -181,44 +204,45 @@ def rule_other_ufunc_threshold(series, other, ufunc, threshold): Parameters ---------- series : pd.Series - timeseries in which suspect values are identified, only used + time series in which suspect values are identified, only used to test if index of other overlaps other : pd.Series - other timeseries based on which suspect values are identified + other time series based on which suspect values are identified ufunc : tuple tuple containing ufunc (i.e. (numpy.greater_equal,) ). The function must be callable according to `ufunc(series, threshold)`. The function is passed as a tuple to bypass RuleSet logic. threshold : float - value to compare timeseries to + value to compare time series to Returns ------- corrections: pd.Series - a series with same index as the input timeseries containing + a series with same index as the input time series containing corrections. Suspect values are set to np.nan. 
""" ufunc = ufunc[0] mask = ufunc(other, threshold) shared_idx = series.index.intersection(other.loc[mask].index) - return mask_corrections_as_nan(series, shared_idx) + other_values = other.reindex(series.index).loc[series.index] + return _ufunc_corrections(other_values, ufunc, threshold, shared_idx) def rule_spike_detection(series, threshold=0.15, spike_tol=0.15, max_gap="7D"): - """Detection rule, identify spikes in timeseries and set to NaN. + """Detection rule, identify spikes in time series and set to NaN. - Spikes are sudden jumps in the value of a timeseries that last 1 timestep. + Spikes are sudden jumps in the value of a time series that last 1 timestep. They can be both negative or positive. Parameters ---------- series : pd.Series - timeseries in which suspect values are identified + time series in which suspect values are identified threshold : float, optional the minimum size of the jump to qualify as a spike, by default 0.15 spike_tol : float, optional - offset between value of timeseries before spike and after spike, - by default 0.15. After a spike, the value of the timeseries is usually + offset between value of time series before spike and after spike, + by default 0.15. After a spike, the value of the time series is usually close to but not identical to the value that preceded the spike. Use this parameter to control how close the value has to be. max_gap : str, optional @@ -228,14 +252,14 @@ def rule_spike_detection(series, threshold=0.15, spike_tol=0.15, max_gap="7D"): Returns ------- corrections: pd.Series - a series with same index as the input timeseries containing + a series with same index as the input time series containing corrections. Suspect values are set to np.nan. 
""" upspikes, downspikes = spike_finder( series, threshold=threshold, spike_tol=spike_tol, max_gap=max_gap ) mask = upspikes.index.union(downspikes.index) - return mask_corrections_as_nan(series, mask) + return mask_corrections_no_comparison_value(series, mask) def rule_offset_detection( @@ -257,7 +281,7 @@ def rule_offset_detection( Parameters ---------- series : pd.Series - timeseries in which to look for offset errors + time series in which to look for offset errors threshold : float, optional minimum jump to consider as offset error, by default 0.35 updown_diff : float, optional @@ -278,7 +302,7 @@ def rule_offset_detection( Returns ------- corrections: pd.Series - a series with same index as the input timeseries containing + a series with same index as the input time series containing corrections. Suspect values are set to np.nan. """ verbose = False @@ -356,11 +380,19 @@ def rule_offset_detection( ] periods = [jump_df.index[0], series.index[-1]] - corrections = pd.Series( - index=series.index, data=np.zeros(series.index.size), fastpath=True + # manually compute corrections dataframe + corrections = pd.DataFrame( + index=series.index, + data={ + "correction_code": np.zeros(series.size, dtype=float), + "series_values": np.full(series.size, np.nan), + "comparison_values": np.full(series.size, np.nan), + }, ) for j in range(0, len(periods), 2): - corrections.loc[periods[j] : periods[j + 1] - pd.Timedelta(seconds=30)] = np.nan + corrections.loc[ + periods[j] : periods[j + 1] - pd.Timedelta(seconds=30), "correction_code" + ] = 99 if return_df: return corrections, df, jump_df else: @@ -368,27 +400,34 @@ def rule_offset_detection( def rule_outside_n_sigma(series, n=2.0): - """Detection rule, set values outside of n * standard deviation to NaN + """Detection rule, set values outside of n * standard deviation to NaN. 
Parameters ---------- series : pd.Series - timeseries in which suspect values are identified + time series in which suspect values are identified n : float, optional number of standard deviations to use, by default 2 Returns ------- corrections: pd.Series - a series with same index as the input timeseries containing + a series with same index as the input time series containing corrections. Suspect values are set to np.nan. """ - - mask = (series > series.mean() + n * series.std()) | ( - series < series.mean() - n * series.std() + threshold_above = series.mean() + n * series.std() + mask_above = series > threshold_above + threshold_below = series.mean() - n * series.std() + mask_below = series < threshold_below + + return mask_corrections_above_below( + series, + mask_above, + threshold_above, + mask_below, + threshold_below, ) - return mask_corrections_as_nan(series, mask) def rule_diff_outside_of_n_sigma(series, n=2.0, max_gap="7D"): @@ -400,7 +439,7 @@ def rule_diff_outside_of_n_sigma(series, n=2.0, max_gap="7D"): Parameters ---------- series : pd.Series - timeseries in which suspect values are identified + time series in which suspect values are identified n : float, optional number of standard deviations to use, by default 2 max_gap : str, optional @@ -410,15 +449,14 @@ def rule_diff_outside_of_n_sigma(series, n=2.0, max_gap="7D"): Returns ------- corrections: pd.Series - a series with same index as the input timeseries containing + a series with same index as the input time series containing corrections. Suspect values are set to np.nan. 
""" - # identify gaps and set diff value after gap to nan diff = diff_with_gap_awareness(series, max_gap=max_gap) nsigma = n * diff.std() - mask = (diff.abs() - diff.mean()) > nsigma - return mask_corrections_as_nan(series, mask) + mask = diff.abs() > nsigma + return mask_corrections_above_threshold(diff, nsigma, mask) def rule_outside_bandwidth(series, lowerbound, upperbound): @@ -427,18 +465,18 @@ def rule_outside_bandwidth(series, lowerbound, upperbound): Parameters ---------- series : pd.Series - timeseries in which suspect values are identified + time series in which suspect values are identified lowerbound : pd.Series - timeseries containing the lower bound, if bound values are less + time series containing the lower bound, if bound values are less frequent than series, bound is interpolated to series.index upperbound : pd.Series - timeseries containing the upper bound, if bound values are less + time series containing the upper bound, if bound values are less frequent than series, bound is interpolated to series.index Returns ------- corrections : pd.Series - a series with same index as the input timeseries containing + a series with same index as the input time series containing corrections. Suspect values are set to np.nan. """ if series.index.symmetric_difference(lowerbound.index).size > 0: @@ -446,8 +484,11 @@ def rule_outside_bandwidth(series, lowerbound, upperbound): if series.index.symmetric_difference(upperbound.index).size > 0: upperbound = interpolate_series_to_new_index(upperbound, series.index) - mask = (series > upperbound) | (series < lowerbound) - return mask_corrections_as_nan(series, mask) + mask_above = series > upperbound + mask_below = series < lowerbound + return mask_corrections_above_below( + series, mask_above, upperbound, mask_below, lowerbound + ) def rule_pastas_outside_pi( @@ -463,15 +504,15 @@ def rule_pastas_outside_pi( ): """Detection rule, flag values based on pastas model prediction interval. 
- Flag suspect outside prediction interval calculated by pastas timeseries + Flag suspect outside prediction interval calculated by pastas time series model. Uses a pastas.Model and a confidence interval as input. Parameters ---------- series : pd.Series - timeseries to identify suspect observations in + time series to identify suspect observations in ml : pastas.Model - timeseries model for series + time series model for series ci : float, optional confidence interval for calculating bandwidth, by default 0.95. Higher confidence interval means that bandwidth is wider and more @@ -495,34 +536,30 @@ def rule_pastas_outside_pi( Returns ------- corrections : pd.Series - a series with same index as the input timeseries containing + a series with same index as the input time series containing corrections. Suspect values are set to np.nan. """ # no model if ml is None: if verbose: print("Warning: No Pastas model found!") - corrections = mask_corrections_as_nan( - series, pd.Series(index=series.index, data=False) - ) - corrections.name = "sim" - # no fit - elif ml.fit is None: + corrections = get_empty_corrections_df(series) + corrections.columns = ["sim", "series_values", "comparison_values"] + # no solver + elif ml.solver is None: if verbose: - print("Warning: Pastas model fit attribute is None!") - corrections = mask_corrections_as_nan( - series, pd.Series(index=series.index, data=False) - ) - corrections.name = "sim" + print("Warning: Model has no attribute solver!") + corrections = get_empty_corrections_df(series) + corrections.columns = ["sim", "series_values", "comparison_values"] # calculate pi else: - if tmin is not None: - ml.settings["tmin"] = tmin - if tmax is not None: - ml.settings["tmax"] = tmax + if tmin is None: + tmin = series.first_valid_index() + if tmax is None: + tmax = series.last_valid_index() # calculate prediction interval - pi = ml.fit.prediction_interval(alpha=(1 - ci)) + pi = ml.solver.prediction_interval(alpha=(1 - ci), tmin=tmin, tmax=tmax) 
# prediction interval empty if pi.empty: @@ -531,10 +568,8 @@ def rule_pastas_outside_pi( "Warning: calculated prediction interval with " "Pastas model is empty!" ) - corrections = mask_corrections_as_nan( - series, pd.Series(index=series.index, data=False) - ) - corrections.name = "sim" + corrections = get_empty_corrections_df(series) + corrections.columns = ["sim", "series_values", "comparison_values"] else: lower = pi.iloc[:, 0] upper = pi.iloc[:, 1] @@ -553,9 +588,15 @@ def rule_pastas_outside_pi( lower = lower - min_ci / 2.0 corrections = rule_outside_bandwidth(series, lower, upper) - corrections.name = "sim (r^2={0:.3f})".format(ml.stats.rsq()) + corrections.columns = [ + "correction_code", + "series_values", + "comparison_values", + ] + corrections.index.name = f"sim (r^2={ml.stats.rsq():.3f})" if savedir: + savedir.mkdir(exist_ok=True) pi.to_pickle(os.path.join(savedir, f"pi_{ml.name}.pkl")) return corrections @@ -567,80 +608,117 @@ def rule_pastas_percentile_pi( if ml is None: if verbose: print("Warning: No Pastas model found!") - corrections = mask_corrections_as_nan( - series, pd.Series(index=series.index, data=False) - ) - corrections.name = "sim" - # no fit - elif ml.fit is None: + corrections = get_empty_corrections_df(series) + corrections.columns = ["sim", "series_values", "comparison_values"] + # no solver + elif ml.solver is None: if verbose: - print("Warning: Pastas model fit attribute is None!") - corrections = mask_corrections_as_nan( - series, pd.Series(index=series.index, data=False) - ) - corrections.name = "sim" + print("Warning: Model has no solver attribute!") + corrections = get_empty_corrections_df(series) + corrections.columns = ["sim", "series_values", "comparison_values"] # calculate realizations # TODO: work in progress -def rule_keep_comments(series, keep_comments, comment_series, other_series): - """Filter rule, modify timeseries to keep data with certain comments. 
+def rule_keep_comments(series, keep_comments, comment_series): + """Filter rule, modify time series to keep data with certain comments. - This rule was invented to extract timeseries only containing certain + This rule was invented to extract time series only containing certain types of errors, based on labeled data. For example, to get only erroneous observations caused by sensors above the groundwater level: - - series: the raw timeseries + - series: the raw time series - keep_comments: list of comments to keep, e.g. ['dry sensor'] - - comment_series: timeseries containing the comments for erroneous obs - - other_series: the validated timeseries where the commented observations - were removed (set to NaN). + - comment_series: time series containing the comments for erroneous obs Parameters ---------- series : pd.Series - timeseries to filter + time series to filter keep_comments : list of str list of comments to keep comment_series : pd.Series - timeseries containing comments, should have same index as series - other_series : pd.Series - timeseries containing corrected/adjusted values corresponding - to the commmented entries. + time series containing comments, should have same index as series Returns ------- - corrections : pd.Series - timeseries containing NaN values where comment is in keep_comments + corrections : pd.DataFrame + dataframe containing correction code 99 where comment is in keep_comments and 0 otherwise. 
""" - new_series = series.copy() - for c in keep_comments: - mask = comment_series.str.startswith(c) - new_series.where(mask, other=other_series, inplace=True) + c = get_empty_corrections_df(series) + c["comparison_values"] = "" + for comment in keep_comments: + mask = comment_series.str.contains(comment) + c.loc[mask, "correction_code"] = CorrectionCode.UNKNOWN_COMPARISON_VALUE + c.loc[mask, "series_values"] = series.loc[mask] + c.loc[mask, "comparison_values"] = comment - corrections = new_series - series - corrections.name = "_".join(keep_comments) + return c - return corrections + +def rule_compare_to_manual_obs( + series, manual_obs, threshold=0.05, method="linear", max_dt="1D" +): + # check if time between manual obs and sensor obs + # are further apart than max_dt: + nearest = series.index.get_indexer(manual_obs.index, method="nearest") + mask = np.abs((series.index[nearest] - manual_obs.index).total_seconds()) <= ( + pd.Timedelta(max_dt) / pd.Timedelta("1s") + ) + + # interpolate raw obs to manual obs times + s_obs = ( + series.reindex(series.index.join(manual_obs.index, how="outer")) + .interpolate(method="time") + .loc[manual_obs.index] + ) + + # calculate diff (manual - sensor, i.e. 
positive value means + # manual observation is higher) + diff = -(s_obs - manual_obs) + + # use only diff where mask is True (= time between obs < max_dt) + diff = diff.loc[mask] + + # interpolate w/ method + if method == "linear": + diff_full_index = ( + diff.reindex(series.index.join(diff.index, how="outer"), method=None) + .interpolate(method="linear") + .fillna(0.0) + ) + else: + diff_full_index = diff.reindex(series.index, method=method).fillna(0.0) + + mask_above = diff_full_index.loc[series.index] > threshold + mask_below = diff_full_index.loc[series.index] < -threshold + + return mask_corrections_above_below( + diff_full_index.loc[series.index], + mask_above, + threshold, + mask_below, + -threshold, + ) def rule_shift_to_manual_obs( series, hseries, method="linear", max_dt="1D", reset_dates=None ): - """Adjustment rule, for shifting timeseries onto manual observations. + """Adjustment rule, for shifting time series onto manual observations. - Used for shifting timeseries based on sensor observations onto manual + Used for shifting time series based on sensor observations onto manual verification measurements. By default uses linear interpolation between two manual verification observations. Parameters ---------- series : pd.Series - timeseries to adjust + time series to adjust hseries : pd.Series - timeseries containing manual observations + time series containing manual observations method : str, optional method to use for interpolating between two manual observations, by default "linear". Other options are those that are accepted by @@ -656,15 +734,14 @@ def rule_shift_to_manual_obs( Returns ------- adjusted_series : pd.Series - timeseries containing adjustments to shift series onto manual + time series containing adjustments to shift series onto manual observations. 
""" # check if time between manual obs and sensor obs # are further apart than max_dt: - # nearest = hseries.index.map(lambda t: series.index[series.index.get_indexer([t], method="nearest")]) nearest = series.index.get_indexer(hseries.index, method="nearest") mask = np.abs((series.index[nearest] - hseries.index).total_seconds()) <= ( - pd.Timedelta(max_dt) / pd.Timedelta("1S") + pd.Timedelta(max_dt) / pd.Timedelta("1s") ) # interpolate raw obs to manual obs times @@ -674,8 +751,9 @@ def rule_shift_to_manual_obs( .loc[hseries.index] ) - # calculate diff - diff = s_obs - hseries + # calculate diff (manual - sensor, i.e. positive value means + # manual observation is higher) + diff = -(s_obs - hseries) # use only diff where mask is True (= time between obs < max_dt) diff = diff.loc[mask] @@ -694,13 +772,13 @@ def rule_shift_to_manual_obs( else: diff_full_index = diff.reindex(series.index, method=method).fillna(0.0) - adjusted_series = series - diff_full_index + adjusted_series = series + diff_full_index return adjusted_series def rule_combine_nan_or(*args): - """Combination rule, combine NaN values for any number of timeseries. + """Combination rule, combine NaN values for any number of time series. Used for combining intermediate results in branching algorithm trees to create one final result, i.e. (s1.isna() OR s2.isna()) @@ -708,7 +786,7 @@ def rule_combine_nan_or(*args): Returns ------- corrections : pd.Series - a series with same index as the input timeseries containing + a series with same index as the input time series containing corrections. Contains NaNs where any of the input series values is NaN. """ @@ -720,8 +798,28 @@ def rule_combine_nan_or(*args): return result +def rule_combine_corrections_or(*args): + """Combination rule, combine corrections for any number of time series. + + Used for combining intermediate results in branching algorithm trees to + create one final result, i.e. 
(corr_s1 OR corr_s2) + + Returns + ------- + corrections : pd.Series + a series with same index as the input time series containing + corrections. Contains corrections where all of the input series + values contain corrections. + """ + for i, series in enumerate(args): + if i == 0: + c = get_empty_corrections_df(series) + c.loc[series["correction_code"] != 0, "correction_code"] = 99 + return c + + def rule_combine_nan_and(*args): - """Combination rule, combine NaN values for any number of timeseries. + """Combination rule, combine NaN values for any number of time series. Used for combining intermediate results in branching algorithm trees to create one final result, i.e. (s1.isna() AND s2.isna()) @@ -729,7 +827,7 @@ def rule_combine_nan_and(*args): Returns ------- corrections : pd.Series - a series with same index as the input timeseries containing + a series with same index as the input time series containing corrections. Contains NaNs where any of the input series values is NaN. """ @@ -743,6 +841,29 @@ def rule_combine_nan_and(*args): return result +def rule_combine_corrections_and(*args): + """Combination rule, combine corrections for any number of time series. + + Used for combining intermediate results in branching algorithm trees to + create one final result, i.e. (corr_s1 AND corr_s2) + + Returns + ------- + corrections : pd.Series + a series with same index as the input time series containing + corrections. Contains corrections where all of the input series + values contain corrections. 
+ """ + for i, series in enumerate(args): + if i == 0: + mask = series["correction_code"] != 0 + else: + mask = mask & (series["correction_code"] != 0) + c = get_empty_corrections_df(args[0]) + c.loc[mask, "correction_code"] = 99 + return c + + def rule_flat_signal( series, window, @@ -762,7 +883,7 @@ def rule_flat_signal( Parameters ---------- series : pd.Series - timeseries to analyse + time series to analyse window : int number of days in window min_obs : int @@ -779,16 +900,16 @@ def rule_flat_signal( limit. Only search for flat signals above this limit. By default None. hbelow : float, optional - absolute value in units of timeseries signifying an upper limit. + absolute value in units of time series signifying an upper limit. Only search for flat signals below this limit. By default None. habove : float, optional - absolute value in units of timeseries signifying a lower limit. + absolute value in units of time series signifying a lower limit. Only search for flat signals above this limit. By default None. Returns ------- corrections : pd.Series - a series with same index as the input timeseries containing + a series with same index as the input time series containing corrections. Contains NaNs where the signal is considered flat or dead. """ @@ -817,4 +938,4 @@ def rule_flat_signal( mask = stdmask & quantilemask & levelmask mask = mask.reindex(series.index, fill_value=False) - return mask_corrections_as_nan(series, mask) + return mask_corrections_no_comparison_value(series, mask) diff --git a/traval/ruleset.py b/traval/ruleset.py index 6839c90..d4e766b 100755 --- a/traval/ruleset.py +++ b/traval/ruleset.py @@ -12,6 +12,8 @@ class RuleSetEncoder(json.JSONEncoder): + """Encode values in RuleSet to JSON.""" + def default(self, o): if callable(o): return "func:" + o.__name__ @@ -39,7 +41,8 @@ def ruleset_hook(obj): val = getattr(rulelib, funcname) except AttributeError: warnings.warn( - f"Could not load function {funcname} " "from `traval.rulelib`!" 
+ f"Could not load function {funcname} " "from `traval.rulelib`!", + stacklevel=1, ) val = funcname obj[key] = val @@ -49,7 +52,9 @@ def ruleset_hook(obj): try: val = getattr(np, funcname) except AttributeError: - warnings.warn(f"Could not load function {funcname} " "from `numpy`!") + warnings.warn( + f"Could not load function {funcname} " "from `numpy`!", stacklevel=1 + ) val = (funcname,) obj[key] = (val,) elif str(value).startswith("series:"): @@ -83,7 +88,7 @@ class RuleSet: The RuleSet object stores detection rules and other relevant information in a dictionary. The order in which rules are carried out, the functions - that parse the timeseries, the extra arguments required by those functions + that parse the time series, the extra arguments required by those functions are all stored together. The detection functions must take a series as the first argument, and @@ -103,7 +108,6 @@ class RuleSet: Examples -------- - Given two detection functions 'foo' and 'bar': >>> rset = RuleSet(name="foobar") @@ -144,21 +148,20 @@ def __call__(self, series): Parameters ---------- series : pandas.Series or pandas.DataFrame - timeseries to apply rules to + time series to apply rules to Returns ------- d : OrderedDict - Dictionary containing resulting timeseries after applying rules. + Dictionary containing resulting time series after applying rules. Keys represent step numbers (0 is the original series, 1 the outcome of rule #1, etc.) c : OrderedDict - Dictionary containing corrections to timeseries based on rules + Dictionary containing corrections to time series based on rules Keys represent step numbers (1 contains the corrections based on rule #1, etc.). When no correction is available, step contains the value 0. """ - return self._applyself(series) def add_rule(self, name, func, apply_to=None, kwargs=None): @@ -272,7 +275,7 @@ def get_parameters(self, name=None): @staticmethod def _parse_kwargs(kwargs, name=None): - """Internal method, parse keyword arguments dictionary. 
+ """Internal method to parse keyword arguments dictionary. Iterates over keys, values in kwargs dictionary. If value is callable, calls value with 'name' as function argument. The result is stored @@ -290,7 +293,7 @@ def _parse_kwargs(kwargs, name=None): dict dictionary of parsed arguments """ - new_args = dict() + new_args = {} if kwargs is not None: for k, v in kwargs.items(): if callable(v): @@ -300,21 +303,21 @@ def _parse_kwargs(kwargs, name=None): return new_args def _applyself(self, series): - """Internal method, apply ruleset to series. + """Internal method to apply ruleset to series. Parameters ---------- series: pandas.Series or pandas.DataFrame - timeseries to apply rules to + time series to apply rules to Returns ------- d: OrderedDict - Dictionary containing resulting timeseries after applying rules. + Dictionary containing resulting time series after applying rules. Keys represent step numbers (0 is the original series, 1 the outcome of rule # 1, etc.) c: OrderedDict - Dictionary containing corrections to timeseries based on rules + Dictionary containing corrections to time series based on rules Keys represent step numbers(1 contains the corrections based on rule # 1, etc.). When no correction is available, step contains the value 0. 
@@ -329,8 +332,21 @@ def _applyself(self, series): arg_dict = self._parse_kwargs(irule["kwargs"], name) corr = irule["func"](d[int(irule["apply_to"])], **arg_dict) # store both correction and result - d[i] = d[int(irule["apply_to"])] + corr - c[i] = corr.loc[corr != 0.0].copy() + # support correction code based corrections + if isinstance(corr, pd.DataFrame) and "correction_code" in corr.columns: + d[i] = d[int(irule["apply_to"])].where( + corr["correction_code"] == 0, np.nan + ) + c[i] = corr.loc[corr["correction_code"] != 0.0].copy() + elif isinstance(corr, pd.Series): + # support nan-based corrections + d[i] = d[int(irule["apply_to"])] + corr + c[i] = corr.loc[corr != 0.0] + else: + raise TypeError( + "Corrections computed by rules must be pd.Series containing " + "NaNs or DataFrame containing a column named 'correction_code'." + ) # if apply_to is tuple, collect series as kwargs to func elif isinstance(irule["apply_to"], tuple): # collect results @@ -383,7 +399,7 @@ def to_pickle(self, fname, verbose=True): verbose : bool, optional prints message when operation complete, default is True - See also + See Also -------- from_pickle : load RuleSet from pickle file to_json : store RuleSet as json file (does not support custom functions) @@ -412,7 +428,7 @@ def from_pickle(cls, fname): RuleSet RuleSet object, including custom functions and parameters - See also + See Also -------- to_pickle : store RuleSet as pickle (supports custom functions) to_json : store RuleSet as json file (does not support custom functions) @@ -443,7 +459,7 @@ def to_json(self, fname=None, verbose=True): prints message when operation complete, default is True - See also + See Also -------- from_json : load RuleSet from json file to_pickle : store RuleSet as pickle (supports custom functions) @@ -453,7 +469,7 @@ def to_json(self, fname=None, verbose=True): "Custom functions will not be preserved when storing " "RuleSet as JSON file!" 
) - warnings.warn(msg) + warnings.warn(msg, stacklevel=1) rules = deepcopy(self.rules) rules["name"] = self.name if fname is not None: @@ -486,7 +502,7 @@ def from_json(cls, fname): RuleSet: RuleSet object - See also + See Also -------- to_json : store RuleSet as JSON file (does not support custom functions) to_pickle : store RuleSet as pickle (supports custom functions) @@ -520,9 +536,9 @@ def get_resolved_ruleset(self, name): new_ruleset = deepcopy(self.rules) for rule in new_ruleset.values(): rule["kwargs"] = self._parse_kwargs(rule["kwargs"], name=name) - + # create new object with resolved parameters rset = RuleSet(name) rset.rules = new_ruleset - + return rset diff --git a/traval/ts_comparison.py b/traval/ts_comparison.py index 9a4689f..6c5a833 100755 --- a/traval/ts_comparison.py +++ b/traval/ts_comparison.py @@ -53,7 +53,7 @@ def idx_in_idx2(self): class SeriesComparison: - """Object for comparing two timeseries. + """Object for comparing two time series. Comparison yields the following categories: @@ -77,7 +77,7 @@ class SeriesComparison: """ def __init__(self, s1, s2, names=None, diff_threshold=0.0): - """Compare two timeseries. + """Compare two time series. Parameters ---------- @@ -86,7 +86,7 @@ def __init__(self, s1, s2, names=None, diff_threshold=0.0): s2 : pd.Series or pd.DataFrame second series to compare names : list of str, optional - list of names of timeseries, by default None, which + list of names of time series, by default None, which uses series name, or dataframe column name diff_threshold : float, optional value beyond which a difference is considered significant, by @@ -127,7 +127,7 @@ def __init__(self, s1, s2, names=None, diff_threshold=0.0): @staticmethod def _parse_series(series): - """Internal method to parse timeseries input. + """Internal method to parse time series input. 
Parameters ---------- @@ -138,7 +138,7 @@ def _parse_series(series): Returns ------- series, comments : pd.Series, pd.Series - returns timeseries and comment series. Comment series is empty + returns time series and comment series. Comment series is empty series if no comments are included in input Raises @@ -248,7 +248,6 @@ def compare_by_comment(self): ValueError if no comment series is found """ - if self.c2n.empty: raise ValueError("No comment series!") @@ -339,64 +338,63 @@ def _check_idx_comparison(self, return_missing=False): class SeriesComparisonRelative(SeriesComparison): - """Object for comparing two timeseries relative to a third timeseries. + """Object for comparing two time series relative to a third time series. Extends the SeriesComparison object to include a comparison between - two timeseries and a third base timeseries. This is used for example, when + two time series and a third base time series. This is used for example, when comparing the results of two error detection outcomes to the original - raw timeseries. + raw time series. 
Comparison yields both the results from SeriesComparison as well as the - following categories for the relative comparison to the base timeseries: + following categories for the relative comparison to the base time series: - - kept_in_both: both timeseries and the base timeseries contain values + - kept_in_both: both time series and the base time series contain values - flagged_in_s1: value is NaN/missing in series #1 - flagged_in_s2: value is NaN/missing in series #2 - flagged_in_both: value is NaN/missing in both series #1 and series #2 - - in_all_nan: value is NaN in all timeseries (series #1, #2 and base) + - in_all_nan: value is NaN in all time series (series #1, #2 and base) - introduced_in_s1: value is NaN/missing in base but has value in series #1 - introduced_in_s2: value is NaN/missing in base but has value in series #2 - introduced_in_both: value is NaN/missing in base but has value in both - timeseries + time series Parameters ---------- s1 : pd.Series or pd.DataFrame first series to compare truth : pd.Series or pd.DataFrame - second series to compare, if a "truth" timeseries is available - pass it as the second timeseries. Stored in object as 's2'. + second series to compare, if a "truth" time series is available + pass it as the second time series. Stored in object as 's2'. base : pd.Series or pd.DataFrame - timeseries to compare other two series with + time series to compare other two series with diff_threshold : float, optional value beyond which a difference is considered significant, by default 0.0. Two values whose difference is smaller than threshold are considered identical. - See also + See Also -------- - SeriesComparison : Comparison of two timeseries relative to each other + SeriesComparison : Comparison of two time series relative to each other """ def __init__(self, s1, truth, base, diff_threshold=0.0): - """Compare two timeseries relative to a base timeseries. + """Compare two time series relative to a base time series. 
Parameters ---------- s1 : pd.Series or pd.DataFrame first series to compare truth : pd.Series or pd.DataFrame - second series to compare, if a "truth" timeseries is available - pass it as the second timeseries. Stored in object as 's2'. + second series to compare, if a "truth" time series is available + pass it as the second time series. Stored in object as 's2'. base : pd.Series or pd.DataFrame - timeseries to compare other two series with + time series to compare other two series with diff_threshold : float, optional value beyond which a difference is considered significant, by default 0.0. Two values whose difference is smaller than threshold are considered identical. """ - # Do the original comparison between s1 and s2 super().__init__(s1, truth, diff_threshold=diff_threshold) @@ -417,9 +415,8 @@ def __init__(self, s1, truth, base, diff_threshold=0.0): self.bc = BinaryClassifier.from_series_comparison_relative(self) def _compare_series_to_base(self): - """Internal method for comparing two timseries to base timeseries.""" - - # where Nans in base timeseries + """Internal method for comparing two time series to base time series.""" + # where Nans in base time series nanmask = self.basen.isna() # prepare some indices @@ -439,7 +436,7 @@ def _compare_series_to_base(self): self.idx_r_in_all_nan = self.basen.loc[nanmask].index.difference(s1s2_union) # self.idx_r_in_all_nan = self.basen.loc[nanmask].index.intersection( # self.idx_in_both_nan) # only where all are NaN - # counts for both NaNs and missing in base timeseries + # counts for both NaNs and missing in base time series self.idx_r_introduced_in_s1 = ( self.basen.loc[nanmask] .index.intersection(only_in_s1) @@ -457,13 +454,13 @@ def _compare_series_to_base(self): ) def _summarize_comparison_to_base(self): - """Internal method for summarizing comparison with base timeseries. + """Internal method for summarizing comparison with base time series.
Returns ------- summary : pandas.Series Series summarizing the series comparison relative to base - timeseries, containing counts per category + time series, containing counts per category """ categories = [ "kept_in_both", @@ -496,7 +493,6 @@ def compare_to_base_by_comment(self): ValueError if no comment series is available. """ - if self.c2n.empty: raise ValueError("No comment series!") diff --git a/traval/ts_utils.py b/traval/ts_utils.py index abf26fb..9243182 100644 --- a/traval/ts_utils.py +++ b/traval/ts_utils.py @@ -1,45 +1,247 @@ +from enum import IntEnum + import numpy as np import pandas as pd -def mask_corrections_as_nan(series, mask): - """Get corrections series with NaNs where mask == True. +class CorrectionCode(IntEnum): + """Codes and labels for labeling error detection results.""" + + NO_CORRECTION = 0 + BELOW_THRESHOLD = -2 + NOT_EQUAL_VALUE = -1 + EQUAL_VALUE = 1 + ABOVE_THRESHOLD = 2 + MODIFIED_VALUE = 4 + UNKNOWN_COMPARISON_VALUE = 99 + + +def get_empty_corrections_df(series): + """Get an empty corrections dataframe. + + Parameters + ---------- + series : pd.Series + time series to apply corrections to + """ + c = pd.DataFrame( + index=series.index, + data={ + "correction_code": CorrectionCode.NO_CORRECTION, + "series_values": np.full(series.size, np.nan), + "comparison_values": np.full(series.size, np.nan), + }, + ) + return c + + +def _mask_corrections(series, values, mask, correction_code): + c = get_empty_corrections_df(series) + c.loc[mask, "series_values"] = series + if values is not None: + if isinstance(values, pd.Series): + c.loc[mask, "comparison_values"] = values.loc[mask] + else: + c.loc[mask, "comparison_values"] = values + c.loc[mask, "correction_code"] = correction_code + return c + + +def mask_corrections_above_below( + series, + mask_above, + threshold_above, + mask_below, + threshold_below, +): + """Get corrections where above or below threshold.
+ + Parameters + ---------- + series : pd.Series + time series to apply corrections to + threshold_above : pd.Series + time series with values to compare with + mask_above : DateTimeIndex or boolean np.array + DateTimeIndex containing timestamps where value should be set to NaN, + or boolean array with same length as series set to True where + value should be set to NaN. (Uses pandas .loc[mask] to set values.) + threshold_below : pd.Series + time series with values to compare with + mask_below : DateTimeIndex or boolean np.array + DateTimeIndex containing timestamps where value should be set to NaN, + or boolean array with same length as series set to True where + value should be set to NaN. (Uses pandas .loc[mask] to set values.) + """ + c_above = mask_corrections_above_threshold(series, threshold_above, mask_above) + c_below = mask_corrections_below_threshold(series, threshold_below, mask_below) + return c_above.add(c_below, fill_value=0) + + +def mask_corrections_above_threshold(series, threshold, mask): + """Get corrections where above threshold. Parameters ---------- series : pd.Series - timeseries to provide corrections for + time series to apply corrections to + threshold : pd.Series + time series with values to compare with mask : DateTimeIndex or boolean np.array DateTimeIndex containing timestamps where value should be set to NaN, or boolean array with same length as series set to True where value should be set to NaN. (Uses pandas .loc[mask] to set values.) + """ + return _mask_corrections(series, threshold, mask, CorrectionCode.ABOVE_THRESHOLD) + + +def mask_corrections_below_threshold(series, threshold, mask): + """Get corrections where below threshold.
+ + Parameters + ---------- + series : pd.Series + time series to apply corrections to + threshold : pd.Series + time series with values to compare with + mask : DateTimeIndex or boolean np.array + DateTimeIndex containing timestamps where value should be set to NaN, + or boolean array with same length as series set to True where + value should be set to NaN. (Uses pandas .loc[mask] to set values.) + """ + return _mask_corrections(series, threshold, mask, CorrectionCode.BELOW_THRESHOLD) + + +def mask_corrections_equal_value(series, values, mask): + """Get corrections where equal to value. + + Parameters + ---------- + series : pd.Series + time series to apply corrections to + values : pd.Series + time series with values to compare with + mask : DateTimeIndex or boolean np.array + DateTimeIndex containing timestamps where value should be set to NaN, + or boolean array with same length as series set to True where + value should be set to NaN. (Uses pandas .loc[mask] to set values.) + """ + return _mask_corrections(series, values, mask, CorrectionCode.EQUAL_VALUE) + + +def mask_corrections_modified_value(series, values, mask): + """Get corrections where value was modified. + + Parameters + ---------- + series : pd.Series + time series to apply corrections to + values : pd.Series + time series with values to compare with + mask : DateTimeIndex or boolean np.array + DateTimeIndex containing timestamps where value should be set to NaN, + or boolean array with same length as series set to True where + value should be set to NaN. (Uses pandas .loc[mask] to set values.) + """ + return _mask_corrections(series, values, mask, CorrectionCode.MODIFIED_VALUE) + + +def mask_corrections_not_equal_value(series, values, mask): + """Get corrections where not equal to value. 
+ + Parameters + ---------- + series : pd.Series + time series to apply corrections to + values : pd.Series + time series with values to compare with + mask : DateTimeIndex or boolean np.array + DateTimeIndex containing timestamps where value should be set to NaN, + or boolean array with same length as series set to True where + value should be set to NaN. (Uses pandas .loc[mask] to set values.) + """ + return _mask_corrections(series, values, mask, CorrectionCode.NOT_EQUAL_VALUE) + + +def mask_corrections_no_comparison_value(series, mask): + """Get corrections where no comparison value is available. + + Parameters + ---------- + series : pd.Series + time series to apply corrections to + mask : DateTimeIndex or boolean np.array + DateTimeIndex containing timestamps where value should be set to NaN, + or boolean array with same length as series set to True where + value should be set to NaN. (Uses pandas .loc[mask] to set values.) + """ + return _mask_corrections( + series, None, mask, CorrectionCode.UNKNOWN_COMPARISON_VALUE + ) + + +def corrections_as_nan(corrections): + """Convert correction code series to NaNs. + + Excludes codes 0 and 4, which are used to indicate no correction and a modification + of the value, respectively. + + Parameters + ---------- + corrections : pd.Series or pd.DataFrame + series or dataframe with correction code Returns ------- c : pd.Series - return corrections series + return corrections series with nans where value is corrected """ - c = pd.Series( - index=series.index, - data=np.zeros(series.index.size), - fastpath=True, - dtype=float, + if isinstance(corrections, pd.DataFrame): + corrections = corrections["correction_code"] + c = pd.Series(index=corrections.index, data=0.0) + # set values where correction code is *not* 0 or 4 to NaN + # (meaning a correction was applied) + c.loc[(corrections != 0) & (corrections != 4)] = np.nan + return c + + +def corrections_as_float(corrections): + """Convert correction code series to floats.
+ + Excludes codes 0 and 4, which are used to indicate no correction and a modification + of the value, respectively. + + Parameters + ---------- + corrections : pd.DataFrame + dataframe with correction code and original + modified values + + Returns + ------- + c : pd.Series + return corrections series with floats where value is modified + """ + c = pd.Series(index=corrections.index, data=0.0) + # set values where correction code is 4 to difference between original and modified + mask = corrections["correction_code"] == 4 + c.loc[mask] = ( + corrections.loc[mask, "comparison_values"] + - corrections.loc[mask, "series_values"] ) - c.loc[mask] = np.nan return c def resample_short_series_to_long_series(short_series, long_series): - """Resample a short timeseries to index from a longer timeseries. + """Resample a short time series to index from a longer time series. First uses 'ffill' then 'bfill' to fill new series. Parameters ---------- short_series : pd.Series - short timeseries + short time series long_series : pd.Series - long timeseries + long time series Returns ------- @@ -55,17 +257,17 @@ def resample_short_series_to_long_series(short_series, long_series): first_date_after = long_series.loc[mask].index[0] new_series.loc[first_date_after] = short_series.iloc[i] - new_series = new_series.fillna(method="ffill").fillna(method="bfill") + new_series = new_series.ffill().bfill() return new_series def diff_with_gap_awareness(series, max_gap="7D"): - """Get diff of timeseries with a limit on gap between two values. + """Get diff of time series with a limit on gap between two values. 
Parameters ---------- series : pd.Series - timeseries to calculate diff for + time series to calculate diff for max_gap : str, optional maximum period between two observations for calculating diff, otherwise set value to NaN, by default "7D" @@ -73,7 +275,7 @@ def diff_with_gap_awareness(series, max_gap="7D"): Returns ------- diff : pd.Series - timeseries with diff, with NaNs whenever two values are farther apart + time series with diff, with NaNs whenever two values are farther apart than max_gap. """ diff = series.diff() @@ -86,20 +288,20 @@ def diff_with_gap_awareness(series, max_gap="7D"): def spike_finder(series, threshold=0.15, spike_tol=0.15, max_gap="7D"): - """Find spikes in timeseries. + """Find spikes in time series. - Spikes are sudden jumps in the value of a timeseries that last 1 timestep. + Spikes are sudden jumps in the value of a time series that last 1 timestep. They can be both negative or positive. Parameters ---------- series : pd.Series - timeseries to find spikes in + time series to find spikes in threshold : float, optional the minimum size of the jump to qualify as a spike, by default 0.15 spike_tol : float, optional - offset between value of timeseries before spike and after spike, - by default 0.15. After a spike, the value of the timeseries is usually + offset between value of time series before spike and after spike, + by default 0.15. After a spike, the value of the time series is usually close to but not identical to the value that preceded the spike. Use this parameter to control how close the value has to be. max_gap : str, optional @@ -112,7 +314,6 @@ def spike_finder(series, threshold=0.15, spike_tol=0.15, max_gap="7D"): pandas DateTimeIndex objects containing timestamps of upward and downward spikes. 
""" - # identify gaps and set diff value after gap to nan diff = diff_with_gap_awareness(series, max_gap=max_gap) @@ -140,7 +341,7 @@ def spike_finder(series, threshold=0.15, spike_tol=0.15, max_gap="7D"): def bandwidth_moving_avg_n_sigma(series, window, n): - """Calculate bandwidth around timeseries based moving average + n * std. + """Calculate bandwidth around time series based moving average + n * std. Parameters ---------- @@ -165,7 +366,7 @@ def bandwidth_moving_avg_n_sigma(series, window, n): def interpolate_series_to_new_index(series, new_index): - """Interpolate timeseries to new DateTimeIndex. + """Interpolate time series to new DateTimeIndex. Parameters ---------- @@ -183,7 +384,7 @@ def interpolate_series_to_new_index(series, new_index): s_interp = np.interp( new_index, series.index.asi8, series.values, left=np.nan, right=np.nan ) - si = pd.Series(index=new_index, data=s_interp, dtype=float, fastpath=True) + si = pd.Series(index=new_index, data=s_interp, dtype=float) return si @@ -202,7 +403,6 @@ def unique_nans_in_series(series, *args): mask : pd.Series mask with value True where NaN is unique to series """ - mask = series.isna() for s in args: @@ -214,30 +414,29 @@ def unique_nans_in_series(series, *args): return mask -def create_synthetic_raw_timeseries(raw_series, truth_series, comments): - """Create synthetic raw timeseries. +def create_synthetic_raw_time_series(raw_series, truth_series, comments): + """Create synthetic raw time series. Updates 'truth_series' (where values are labelled with a comment) with values from raw_series. Used for removing unlabeled changes between - a raw and validated timeseries. + a raw and validated time series. Parameters ---------- raw_series : pd.Series - timeseries with raw data + time series with raw data truth_series : pd.Series - timeseries with validated data + time series with validated data comments : pd.Series - timeseries with comments. Index must be same as 'truth_series'. + time series with comments. 
Index must be same as 'truth_series'. When value does not have a comment it must be an empty string: ''. Returns ------- s : pd.Series - synthetic raw timeseries, same as truth_series but updated with + synthetic raw time series, same as truth_series but updated with raw_series where value has been commented. """ - if truth_series.index.symmetric_difference(comments.index).size > 0: raise ValueError("'truth_series' and 'comments' must have same index!") @@ -265,7 +464,7 @@ def shift_series_forward_backward(s, freqstr="1D"): def smooth_upper_bound(b, smoothfreq="1D"): smoother = shift_series_forward_backward(b, freqstr=smoothfreq) smoother.iloc[:, 0] = smoother.iloc[:, 0].interpolate(method="linear") - smoother.iloc[:, 2] = smoother.iloc[:, 1].interpolate(method="linear") + smoother.iloc[:, 2] = smoother.iloc[:, 2].interpolate(method="linear") return smoother.max(axis=1).loc[smoother.iloc[:, 1].dropna().index] @@ -274,3 +473,19 @@ def smooth_lower_bound(b, smoothfreq="1D"): smoother.iloc[:, 0] = smoother.iloc[:, 0].interpolate(method="linear") smoother.iloc[:, 2] = smoother.iloc[:, 2].interpolate(method="linear") return smoother.min(axis=1).loc[smoother.iloc[:, 1].dropna().index] + + +def get_correction_status_name(corrections): + """Get correction status name from correction codes. + + Parameters + ---------- + corrections : pd.DataFrame or pd.Series + dataframe or series containing correction codes + + Returns + ------- + pd.DataFrame or pd.Series + dataframe or series filled with correction status name + """ + return corrections.fillna(0).map(lambda c: CorrectionCode(c).name) diff --git a/traval/version.py b/traval/version.py index 3d26edf..3d18726 100644 --- a/traval/version.py +++ b/traval/version.py @@ -1 +1 @@ -__version__ = "0.4.1" +__version__ = "0.5.0"