From 3de6dd374efb20e83e2640e80e7ff3983e21e6cf Mon Sep 17 00:00:00 2001 From: Riley Clement Date: Tue, 2 Nov 2021 23:17:41 +1100 Subject: [PATCH 1/3] DOC UPDATE --- README.md | 35 +++++++++++++++++++++++++++++++--- docs/getting_started/index.rst | 2 +- docs/index.rst | 2 +- pyproject.toml | 2 +- 4 files changed, 35 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index b5649e2..924c23d 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ # piso - pandas interval set operations -**piso** exists to bring set operations (union, intersection, difference + more) to [pandas'](https://pandas.pydata.org/) interval classes, specifically +**piso** exists to bring set operations (union, intersection, difference + more), analytical methods, and lookup and join functionality to [pandas'](https://pandas.pydata.org/) interval classes, specifically - pandas.Interval - pandas.arrays.IntervalArray @@ -36,11 +36,40 @@ Currently, there is a lack of such functionality in pandas, although it has been [(3, 4]] Length: 1, closed: right, dtype: interval[int64] + +>>> arr.piso.contains([2, 3, 5]) + 2 3 5 +(1, 5] True True True +(3, 6] False False True +(2, 4] False True False + +>>> df = pd.DataFrame( +... {"A":[4,3], "B":["x","y"]}, +... index=pd.IntervalIndex.from_tuples([(1,3), (5,7)]), +... ) + +>>> s = pd.Series( +... [True, False], +... index=pd.IntervalIndex.from_tuples([(2,4), (5,6)]), +... name="C", +... ) + +>>> piso.join(df, s) + A B C +(1, 2] 4 x NaN +(2, 3] 4 x True +(5, 6] 3 y False +(6, 7] 3 y NaN + +>>> piso.join(df, s, how="inner") + A B C +(2, 3] 4 x True +(5, 6] 3 y False ``` The domain of the intervals can be either numerical, `pandas.Timestamp` or `pandas.Timedelta`. -A small [case study](https://piso.readthedocs.io/en/latest/user_guide/calendar.html) using piso can be found in the [user guide](https://piso.readthedocs.io/en/latest/user_guide/index.html). 
Further examples, and a detailed explanation of functionality, are provided in the [API reference](https://piso.readthedocs.io/en/latest/reference/index.html). +Several [case studies](https://piso.readthedocs.io/en/latest/user_guide/case_studies/index.html) using piso can be found in the [user guide](https://piso.readthedocs.io/en/latest/user_guide/index.html). Further examples, and a detailed explanation of functionality, are provided in the [API reference](https://piso.readthedocs.io/en/latest/reference/index.html). Visit [https://piso.readthedocs.io](https://piso.readthedocs.io/) for the documentation. @@ -70,7 +99,7 @@ This project is licensed under the [MIT License](https://github.com/staircase-de ## Acknowledgments -Currently, piso is a pure-python implentation which relies heavily on [staircase](https://www.staircase.dev) and [pandas](https://pandas.pydata.org/). It is clearly designed to operate as part of the *pandas ecosystem*. The colours for the piso logo have been assimilated from pandas as a homage, and is not to intended to imply and affiliation with, or endorsement by, pandas. +Currently, piso is a pure-python implementation which relies heavily on [staircase](https://www.staircase.dev) and [pandas](https://pandas.pydata.org/). It is designed to operate as part of the *pandas ecosystem*. The colours for the piso logo have been assimilated from pandas as a homage, and are not intended to imply any affiliation with, or endorsement by, pandas. 
Additionally, two classes have been borrowed, almost verbatim, from the pandas source code: diff --git a/docs/getting_started/index.rst b/docs/getting_started/index.rst index 2066910..5434476 100644 --- a/docs/getting_started/index.rst +++ b/docs/getting_started/index.rst @@ -20,7 +20,7 @@ To install the latest version through conda-forge:: Package overview ---------------- -`piso` exists to bring set operations to :mod:`pandas` interval classes, specifically +`piso` exists to bring set operations (union, intersection, difference + more), analytical methods, and lookup and join functionality to :mod:`pandas` interval classes, specifically - :class:`pandas.Interval` - :class:`pandas.arrays.IntervalArray` diff --git a/docs/index.rst b/docs/index.rst index fd9f913..239418e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -16,7 +16,7 @@ .. rst-class:: center -Pandas Interval Set Operations: methods for set operations for pandas' Interval, IntervalArray and IntervalIndex +Pandas Interval Set Operations: methods for set operations, analytics, lookups and joins on pandas' Interval, IntervalArray and IntervalIndex .. 
image:: img/powered_by_staircase.svg :target: https://www.staircase.dev diff --git a/pyproject.toml b/pyproject.toml index a0079b0..eed5dd0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ build-backend = "poetry.masonry.api" [tool.poetry] name = "piso" version = "0.5.0" -description = "Pandas Interval Set Operations: methods for set operations for pandas' Interval, IntervalArray and IntervalIndex" +description = "Pandas Interval Set Operations: methods for set operations, analytics, lookups and joins on pandas' Interval, IntervalArray and IntervalIndex" readme = "README.md" authors = ["Riley Clement "] maintainers = ["Riley Clement "] From daac8badd3f036cfb7de0beabaeafcfca660eefd Mon Sep 17 00:00:00 2001 From: Venaturum Date: Thu, 4 Nov 2021 22:55:21 +1100 Subject: [PATCH 2/3] =?UTF-8?q?closed=20values=20extended=20to=20include?= =?UTF-8?q?=20"both"=20and=20"neither"=20for=20.isdisjoin=E2=80=A6=20(#34)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * closed values extended to include "both" and "neither" for .isdisjoint .contains .lookup .get_indexer * degenerate intervals now allowed for .isdisjoint --- docs/getting_started/index.rst | 4 +- docs/release_notes/index.rst | 7 +++ piso/docstrings/accessor.py | 6 +++ piso/docstrings/intervalarray.py | 10 ++++- piso/docstrings/ndframe.py | 2 +- piso/intervalarray.py | 29 ++++++++----- piso/util.py | 2 +- tests/test_single_interval_array.py | 66 ++++++++++++++++++++++++++--- 8 files changed, 106 insertions(+), 20 deletions(-) diff --git a/docs/getting_started/index.rst b/docs/getting_started/index.rst index 5434476..378b1c4 100644 --- a/docs/getting_started/index.rst +++ b/docs/getting_started/index.rst @@ -30,12 +30,14 @@ Currently, there is a lack of such functionality in `pandas`, although it has be An array of intervals can be interpreted in two different ways. 
It can be seen as a container for intervals, which are sets, or if the intervals are disjoint it may be seen as a set itself. Both interpretations are supported by the methods introduced by :mod:`piso`. -The domain of the intervals can be either numerical, :class:`pandas.Timestamp` or :class:`pandas.Timedelta`. Currently, :mod:`piso` is limited to intervals which: +The domain of the intervals can be either numerical, :class:`pandas.Timestamp` or :class:`pandas.Timedelta`. Currently, most of the set operations in :mod:`piso` are limited to intervals which: - have a non-zero length - have a finite, length - are left-closed right-open, or right-closed left-open +To check if these restrictions apply to a particular method, please consult the :ref:`api`. + Several :ref:`case studies ` using :mod:`piso` can be found in the :ref:`user guide `. Further examples, and a detailed explanation of functionality, are provided in the :ref:`api`. diff --git a/docs/release_notes/index.rst b/docs/release_notes/index.rst index 7b6f60c..d488c53 100644 --- a/docs/release_notes/index.rst +++ b/docs/release_notes/index.rst @@ -7,6 +7,13 @@ Release notes ADD UNRELEASED CHANGES ABOVE THIS LINE +The following methods were extended to accommodate intervals with *closed = "both"* or *"neither"* + +- :func:`piso.contains` (and :meth:`ArrayAccessor.contains() `) +- :func:`piso.get_indexer` (and :meth:`ArrayAccessor.get_indexer() `) +- :func:`piso.lookup` +- :func:`piso.isdisjoint` (and :meth:`ArrayAccessor.isdisjoint() `) + **v0.5.0 2021-11-02** Added the following methods diff --git a/piso/docstrings/accessor.py b/piso/docstrings/accessor.py index dc9963d..d349427 100644 --- a/piso/docstrings/accessor.py +++ b/piso/docstrings/accessor.py @@ -543,6 +543,9 @@ def join_params(list_of_param_strings): isdisjoint_doc = ( """ Indicates whether one, or more, sets are disjoint or not. + +*interval_array* must be left-closed or right-closed if *interval_arrays* is non-empty. 
+If no arguments are provided then this restriction does not apply. """ + template_doc ) @@ -691,6 +694,8 @@ def join_params(list_of_param_strings): Given a set of disjoint intervals (contained in the interval array that the accessor belongs to) and a value, or vector, *x*, returns the index positions of the interval which contains each value in x. +*interval_array* can be left-closed, right-closed, both or neither. + Parameters ---------- x : scalar, or array-like of scalars @@ -739,6 +744,7 @@ def join_params(list_of_param_strings): ---------- x : scalar, or array-like of scalars Values in *x* should belong to the same domain as the intervals in *interval_array*. + The intervals in *interval_array* may be left-closed, right-closed, both, or neither. include_index : boolean, default True Indicates whether to return a :class:`numpy.ndarray` or :class:`pandas.DataFrame` indexed by *interval_array* and column names equal to *x* diff --git a/piso/docstrings/intervalarray.py b/piso/docstrings/intervalarray.py index 7650548..c37beba 100644 --- a/piso/docstrings/intervalarray.py +++ b/piso/docstrings/intervalarray.py @@ -544,6 +544,9 @@ def join_params(list_of_param_strings): isdisjoint_doc = ( """ Indicates whether one, or more, sets are disjoint or not. + +*interval_array* must be left-closed or right-closed if *interval_arrays* is non-empty. +If *interval_array* is the only argument then this restriction does not apply. """ + template_doc ) @@ -599,6 +602,7 @@ def join_params(list_of_param_strings): ---------- interval_array : :class:`pandas.IntervalIndex` or :class:`pandas.arrays.IntervalArray` Contains the (possibly overlapping) intervals which partially, or wholly cover the domain. + May be left-closed, right-closed, both, or neither. domain : :py:class:`tuple`, :class:`pandas.Interval`, :class:`pandas.IntervalIndex` or :class:`pandas.arrays.IntervalArray`, optional Specifies the domain over which to calculate the "coverage". 
If *domain* is `None`, then the domain is considered to be the extremities of the intervals contained in *interval_array* @@ -646,7 +650,7 @@ def join_params(list_of_param_strings): Parameters ---------- interval_array : :class:`pandas.IntervalIndex` or :class:`pandas.arrays.IntervalArray` - Contains the (possibly overlapping) intervals. + Contains the (possibly overlapping) intervals. Must be left-closed or right-closed. domain : :py:class:`tuple`, :class:`pandas.Interval`, :class:`pandas.IntervalIndex` or :class:`pandas.arrays.IntervalArray`, optional Specifies the domain over which to calculate the "complement". If *domain* is `None`, then the domain is considered to be the extremities of the intervals contained in *interval_array* @@ -699,6 +703,8 @@ def join_params(list_of_param_strings): Given a set of disjoint intervals and a value, or vector, *x* returns the index positions of the interval which contains each value in x. +*interval_array* can be left-closed, right-closed, both or neither. + Parameters ---------- interval_array : :class:`pandas.IntervalIndex` or :class:`pandas.arrays.IntervalArray` @@ -746,7 +752,7 @@ def join_params(list_of_param_strings): Parameters ---------- interval_array : :class:`pandas.IntervalIndex` or :class:`pandas.arrays.IntervalArray` - Contains the intervals. Must be left-closed or right-closed. + Contains the intervals. May be left-closed, right-closed, both, or neither. x : scalar, or array-like of scalars Values in *x* should belong to the same domain as the intervals in *interval_array*. 
include_index : boolean, default True diff --git a/piso/docstrings/ndframe.py b/piso/docstrings/ndframe.py index 12a903c..0753a24 100644 --- a/piso/docstrings/ndframe.py +++ b/piso/docstrings/ndframe.py @@ -67,7 +67,7 @@ ---------- *frames_or_series : argument list of :class:`pandas.DataFrame` or :class:`pandas.Series` May contain two or more arguments, all of which must be indexed by a - :class:`pandas.IntervalIndex` containing disjoint intervals. + :class:`pandas.IntervalIndex` containing disjoint intervals. The index can have any *closed* value. Every :class:`pandas.Series` must have a name. how : {"left", "right", "inner", "outer"}, default "left" What sort of join to perform. diff --git a/piso/intervalarray.py b/piso/intervalarray.py index fc3bd2c..f5b873d 100644 --- a/piso/intervalarray.py +++ b/piso/intervalarray.py @@ -16,11 +16,12 @@ def _check_matched_closed(interval_arrays): assert closed_values.count(closed_values[0]) == len(closed_values) -def _validate_array_of_intervals_arrays(*interval_arrays): +def _validate_array_of_intervals_arrays(*interval_arrays, validate_intervals=True): assert len(interval_arrays) > 0 _check_matched_closed(interval_arrays) - for arr in interval_arrays: - _validate_intervals(arr) + if validate_intervals: + for arr in interval_arrays: + _validate_intervals(arr) def _get_return_type(interval_array, return_type): @@ -108,7 +109,9 @@ def symmetric_difference( @Appender(docstrings.isdisjoint_docstring, join="\n", indents=1) def isdisjoint(interval_array, *interval_arrays): - _validate_array_of_intervals_arrays(interval_array, *interval_arrays) + _validate_array_of_intervals_arrays( + interval_array, *interval_arrays, validate_intervals=bool(interval_arrays) + ) if interval_arrays: stairs = _make_stairs(interval_array, *interval_arrays) result = stairs.max() <= 1 @@ -117,7 +120,10 @@ def isdisjoint(interval_array, *interval_arrays): else: arr = np.stack([interval_array.left.values, interval_array.right.values]) arr = arr[arr[:, 
0].argsort()] - result = np.all(arr[0, 1:] >= arr[1, :-1]) + if interval_array.closed == "both": + result = np.all(arr[0, 1:] > arr[1, :-1]) + else: + result = np.all(arr[0, 1:] >= arr[1, :-1]) return result @@ -185,6 +191,7 @@ def coverage(interval_array, domain=None): @Appender(docstrings.complement_docstring, join="\n", indents=1) def complement(interval_array, domain=None): + _validate_intervals(interval_array) stepfunction = _interval_x_to_stairs(interval_array).invert() if isinstance(domain, (pd.IntervalIndex, pd.arrays.IntervalArray)): domain = _interval_x_to_stairs(domain) @@ -200,11 +207,13 @@ def contains(interval_array, x, include_index=True): starts = interval_array.left.values ends = interval_array.right.values x = pd.Series(x).values - if interval_array.closed == "right": - result = np.less_equal.outer(x, ends) & np.greater.outer(x, starts) - else: - result = np.less.outer(x, ends) & np.greater_equal.outer(x, starts) - result = result.transpose() + right_compare = ( + np.less_equal if interval_array.closed in ("right", "both") else np.less + ) + left_compare = ( + np.greater_equal if interval_array.closed in ("left", "both") else np.greater + ) + result = (right_compare.outer(x, ends) & left_compare.outer(x, starts)).transpose() if include_index: return pd.DataFrame(result, index=interval_array, columns=x) return result diff --git a/piso/util.py b/piso/util.py index 5f05796..2c12efa 100644 --- a/piso/util.py +++ b/piso/util.py @@ -7,7 +7,7 @@ def _validate_intervals(interval_array): if not all(interval_array.length): # test for degenerate intervals raise DegenerateIntervalError(interval_array) if interval_array.closed not in ("left", "right"): - raise ClosedValueError + raise ClosedValueError(interval_array.closed) def _interval_x_to_stairs(interval_array): diff --git a/tests/test_single_interval_array.py b/tests/test_single_interval_array.py index d086fbe..ccbbf3a 100644 --- a/tests/test_single_interval_array.py +++ 
b/tests/test_single_interval_array.py @@ -454,6 +454,7 @@ def make_date(x): return interval_array.from_arrays( interval_array.left.map(make_date), interval_array.right.map(make_date), + interval_array.closed, ) @@ -477,7 +478,7 @@ def make_date(x): ) @pytest.mark.parametrize( "closed", - ["left", "right"], + ["left", "right", "neither"], ) @pytest.mark.parametrize( "date_type", @@ -487,7 +488,9 @@ def make_date(x): "how", ["supplied", "accessor", "package"], ) -def test_isdisjoint(interval_index, tuples, expected, closed, date_type, how): +def test_isdisjoint_left_right_neither( + interval_index, tuples, expected, closed, date_type, how +): interval_array = make_ia_from_tuples(interval_index, tuples, closed) interval_array = map_to_dates(interval_array, date_type) @@ -495,6 +498,42 @@ def test_isdisjoint(interval_index, tuples, expected, closed, date_type, how): assert result == expected +@pytest.mark.parametrize( + "interval_index", + [True, False], +) +@pytest.mark.parametrize( + "tuples, expected", + [ + ([], True), + ([(1, 2), (2, 3)], False), + ([(1, 2), (3, 3)], True), + ([(1, 2), (3, 4)], True), + ([(1, 3), (2, 4)], False), + ([(1, 4), (2, 3)], False), + ([(1, 2), (2, 3), (3, 4)], False), + ([(1, 2), (3, 4), (5, 6)], True), + ([(1, 3), (2, 4), (5, 6)], False), + ([(1, 4), (2, 3), (5, 6)], False), + ], +) +@pytest.mark.parametrize( + "date_type", + ["timestamp", "numpy", "datetime", "timedelta", None], +) +@pytest.mark.parametrize( + "how", + ["supplied", "accessor", "package"], +) +def test_isdisjoint_both(interval_index, tuples, expected, date_type, how): + + interval_array = make_ia_from_tuples(interval_index, tuples, "both") + interval_array = map_to_dates(interval_array, date_type) + print(interval_array) + result = perform_op(interval_array, how=how, function=piso_intervalarray.isdisjoint) + assert result == expected + + @pytest.mark.parametrize( "interval_index", [True, False], @@ -632,8 +671,14 @@ def test_complement(interval_index, domain, 
expected_tuples, closed, how): (4, "left", -1), (3, "right", -1), (4, "right", 0), + (3, "both", 0), + (4, "both", 0), + (3, "neither", -1), + (4, "neither", -1), ([3, 9, 12], "left", np.array([0, 1, -1])), ([3, 9, 12], "right", np.array([-1, 1, -1])), + ([3, 9, 12], "both", np.array([0, 1, -1])), + ([3, 9, 12], "neither", np.array([-1, 1, -1])), ], ) @pytest.mark.parametrize( @@ -678,8 +723,12 @@ def test_get_indexer_exception(how): [ (0, "left", [[True], [False], [False]]), (0, "right", [[False], [False], [False]]), + (0, "both", [[True], [False], [False]]), + (0, "neither", [[False], [False], [False]]), (6, "left", [[False], [False], [False]]), (6, "right", [[False], [False], [True]]), + (6, "neither", [[False], [False], [False]]), + (6, "both", [[False], [False], [True]]), ( [2, 4, 5], "left", @@ -690,6 +739,16 @@ def test_get_indexer_exception(how): "right", [[True, True, False], [False, True, True], [False, True, True]], ), + ( + [2, 4, 5], + "both", + [[True, True, False], [True, True, True], [False, True, True]], + ), + ( + [2, 4, 5], + "neither", + [[True, False, False], [False, True, False], [False, True, True]], + ), ], ) @pytest.mark.parametrize( @@ -709,9 +768,6 @@ def test_contains(interval_index, x, closed, expected, how, include_index): how=how, function=piso_intervalarray.contains, ) - print(result) - print(ia) - print(x) if include_index: expected_result = pd.DataFrame(expected, index=ia, columns=np.array(x, ndmin=1)) pd.testing.assert_frame_equal(result, expected_result, check_dtype=False) From 9b6658cf13388c49de34e902f443a3729d2dd3cc Mon Sep 17 00:00:00 2001 From: Riley Clement Date: Thu, 4 Nov 2021 23:02:55 +1100 Subject: [PATCH 3/3] v0.6.0 --- docs/release_notes/index.rst | 2 ++ pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/release_notes/index.rst b/docs/release_notes/index.rst index d488c53..7419e52 100644 --- a/docs/release_notes/index.rst +++ b/docs/release_notes/index.rst @@ -7,6 +7,8 @@ Release 
notes ADD UNRELEASED CHANGES ABOVE THIS LINE +**v0.6.0 2021-11-05** + The following methods were extended to accommodate intervals with *closed = "both"* or *"neither"* - :func:`piso.contains` (and :meth:`ArrayAccessor.contains() `) diff --git a/pyproject.toml b/pyproject.toml index eed5dd0..00f1e53 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "poetry.masonry.api" [tool.poetry] name = "piso" -version = "0.5.0" +version = "0.6.0" description = "Pandas Interval Set Operations: methods for set operations, analytics, lookups and joins on pandas' Interval, IntervalArray and IntervalIndex" readme = "README.md" authors = ["Riley Clement "]