From dffb3d29dafac4d96c63d7bf69decffbf454581f Mon Sep 17 00:00:00 2001 From: venaturum Date: Mon, 1 Nov 2021 18:20:54 +1100 Subject: [PATCH 01/10] added piso.join (+ tests + docs) (#GH23) --- docs/getting_started/index.rst | 2 +- docs/reference/package.rst | 3 +- docs/release_notes/index.rst | 3 + docs/requirements.txt | 2 +- docs/user_guide/case_studies/football.rst | 108 ++++++ docs/user_guide/case_studies/index.rst | 52 ++- piso/__init__.py | 2 +- piso/docstrings/ndframe.py | 106 ++++++ piso/ndframe.py | 99 +++++ tests/test_ndframe.py | 437 ++++++++++++++++++++++ 10 files changed, 808 insertions(+), 6 deletions(-) create mode 100644 docs/user_guide/case_studies/football.rst diff --git a/docs/getting_started/index.rst b/docs/getting_started/index.rst index 28d3e68..2066910 100644 --- a/docs/getting_started/index.rst +++ b/docs/getting_started/index.rst @@ -36,7 +36,7 @@ The domain of the intervals can be either numerical, :class:`pandas.Timestamp` o - have a finite, length - are left-closed right-open, or right-closed left-open -A small :ref:`case study ` using :mod:`piso` can be found in the :ref:`user guide `. Further examples, and a detailed explanation of functionality, are provided in the :ref:`api`. +Several :ref:`case studies ` using :mod:`piso` can be found in the :ref:`user guide `. Further examples, and a detailed explanation of functionality, are provided in the :ref:`api`. 
Versioning diff --git a/docs/reference/package.rst b/docs/reference/package.rst index 4638e3b..4cdfcdc 100644 --- a/docs/reference/package.rst +++ b/docs/reference/package.rst @@ -21,4 +21,5 @@ Top level functions coverage complement get_indexer - lookup \ No newline at end of file + lookup + join \ No newline at end of file diff --git a/docs/release_notes/index.rst b/docs/release_notes/index.rst index 79efb5d..d952728 100644 --- a/docs/release_notes/index.rst +++ b/docs/release_notes/index.rst @@ -4,6 +4,9 @@ Release notes ======================== +Added the following methods + +- :func:`piso.join` for *join operations* on interval indexes ADD UNRELEASED CHANGES ABOVE THIS LINE diff --git a/docs/requirements.txt b/docs/requirements.txt index 43f0b78..8438de2 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -2,7 +2,7 @@ ipykernel sphinx == 4.0.2 nbsphinx == 0.8.6 sphinx-panels -staircase +staircase >= 2.1 pandas numpy Pygments diff --git a/docs/user_guide/case_studies/football.rst b/docs/user_guide/case_studies/football.rst new file mode 100644 index 0000000..56a2ecd --- /dev/null +++ b/docs/user_guide/case_studies/football.rst @@ -0,0 +1,108 @@ +.. _user_guide.football_example: + + +Analysis of scores in a football match +======================================= + +In this example we will look at a football match from 2009: + + The Champions League quarter-final between Chelsea and Liverpool + in 2009 is recognised as among the best games of all time. + Liverpool scored twice in the first half at 19' and 28'. Chelsea then + opened their account in the second half with three unanswered goals + at 51', 57' and 76'. Liverpool responded with two goals at 81' and 83' + to put themselves ahead, however Chelsea drew with a last minute goal + at 89' and advanced to the next stage on aggregate. + + +We start by importing :mod:`pandas` and :mod:`piso` + +.. 
ipython:: python + + import pandas as pd + import piso + + +For the analysis we will create a :class:`pandas.Series`, indexed by a :class:`pandas.IntervalIndex` for each team. The values of each series will be the team's score and the interval index, defined by :class:`pandas.Timedelta`, will describe the durations corresponding to each score. We define the following function which creates such a Series, given the minute marks for each score. + +.. ipython:: python + + def make_series(goal_time_mins): + breaks = pd.to_timedelta([0] + goal_time_mins + [90], unit="min") + ii = pd.IntervalIndex.from_breaks(breaks) + return pd.Series(range(len(ii)), index = ii, name="score") + +We can now create each Series. + +.. ipython:: python + + chelsea = make_series([51,57,76,89]) + liverpool = make_series([19,28,81,83]) + +For reference, the Series corresponding to `chelsea` is + +.. ipython:: python + + chelsea + +To enable analysis for separate halves of the game we'll define a similar Series which defines the time intervals for each half + +.. ipython:: python + + halves = pd.Series( + ["1st", "2nd"], + pd.IntervalIndex.from_breaks(pd.to_timedelta([0, 45, 90], unit="min")), + name="half", + ) + halves + +We can now perform a join on these three Series. Since `chelsea` and `liverpool` Series have the same name it will be necessary to provide suffixes to differentiate the columns in the result. The `halves` Series does not have the same name, but a suffix must be defined for each of the join operands if there are any overlaps. + +.. ipython:: python + + CvsL = piso.join(chelsea, liverpool, halves, suffixes=["_chelsea", "_liverpool", ""]) + CvsL + +By default, the :func:`piso.join` function performs a left-join. Since every interval index represents the same domain, that is `(0', 90']`, all join types - *left*, *right*, *inner*, *outer* - will give the same result. + +Using this dataframe we will now provide answers for miscellaneous questions. 
In particular we will filter the dataframe based on values in the columns, then sum the lengths of the intervals in the filtered index. + + +**How much game time did Chelsea lead for?** + +.. ipython:: python + + CvsL.query("score_chelsea > score_liverpool").index.length.sum() + + +**How much game time did Liverpool lead for?** + +.. ipython:: python + + CvsL.query("score_liverpool > score_chelsea").index.length.sum() + +**How much game time were the teams tied for?** + +.. ipython:: python + + CvsL.query("score_liverpool == score_chelsea").index.length.sum() + +**How much game time in the first half were the teams tied for?** + +.. ipython:: python + + CvsL.query("score_chelsea == score_liverpool and half == '1st'").index.length.sum() + +**For how long did Liverpool lead Chelsea by exactly one goal (split by half)?** + +.. ipython:: python + + CvsL.groupby("half").apply( + lambda df: df.query("score_liverpool - score_chelsea == 1").index.length.sum() + ) + +**What was the score at the 80 minute mark?** + +.. ipython:: python + + piso.lookup(CvsL, pd.Timedelta(80, unit="min")) \ No newline at end of file diff --git a/docs/user_guide/case_studies/index.rst b/docs/user_guide/case_studies/index.rst index b3b060a..2b16d79 100644 --- a/docs/user_guide/case_studies/index.rst +++ b/docs/user_guide/case_studies/index.rst @@ -5,9 +5,57 @@ Case studies *************** -.. toctree:: +.. panels:: + + **Finding common gaps in daily calendars** + + This case study introduces the use of :mod:`piso` for set operations such as :func:`piso.intersection` and :func:`piso.union` and applies it to an example where personal calendars are represented by interval arrays. + + .. link-button:: calendar + :type: ref + :text: + :classes: stretched-link + + --- + + **Verifying a maintenance schedule** + + This case study introduces the use of :mod:`piso` for analysis with functions that return scalars, such as :func:`piso.issuperset` and :func:`piso.coverage`. 
In this example maintenance schedules and windows of opportunity are represented by interval arrays. + + .. link-button:: maintenance + :type: ref + :text: + :classes: stretched-link + + --- + + **Estimating tax payable** + + This case study demonstrates the use of :func:`piso.lookup` where tax brackets are represented by a :class:`pandas.DataFrame`, indexed by a :class:`pandas.IntervalIndex`. The tax payable for an array of income values is calculated by efficiently finding the corresponding tax brackets. + + .. link-button:: tax + :type: ref + :text: + :classes: stretched-link + + + --- + + **Analysis of scores in a football match** + + This case study introduces the idea of *joins* using :class:`pandas.IntervalIndex`. Using :func:`piso.join` a dataframe is constructed, indexed by intervals for unique score combinations in the 2009 Chelsea vs Liverpool Champions League quarter-final. + + .. link-button:: football + :type: ref + :text: + :classes: stretched-link + + + .. toctree:: + :hidden: :maxdepth: 1 calendar maintenance - tax \ No newline at end of file + tax + football \ No newline at end of file diff --git a/piso/__init__.py b/piso/__init__.py index 92bed1f..fbe954a 100644 --- a/piso/__init__.py +++ b/piso/__init__.py @@ -10,7 +10,7 @@ symmetric_difference, union, ) -from piso.ndframe import lookup +from piso.ndframe import join, lookup def register_accessors(): diff --git a/piso/docstrings/ndframe.py b/piso/docstrings/ndframe.py index af4bf18..12a903c 100644 --- a/piso/docstrings/ndframe.py +++ b/piso/docstrings/ndframe.py @@ -47,3 +47,109 @@ 6 NaN Name: A, dtype: float64 """ + + +join_docstring = """ +Joins multiple dataframes or series by their :class:`pandas.IntervalIndex`. + +Each interval in a :class:`pandas.IntervalIndex` is considered a set, and the interval index containing them a set defined by their union. 
+Join types are as follows: + +- left: the set defined by the interval index of the result is the same as the set defined by the index of the first argument in *frames_or_series* + +- right: the set defined by the interval index of the result is the same as the set defined by the index of the last argument in *frames_or_series* + +- inner: the set defined by the interval index of the result is the intersection of sets defined by interval indexes from all join arguments + +- outer: the set defined by the interval index of the result is the union of sets defined by interval indexes from all join arguments + +Parameters +---------- +*frames_or_series : argument list of :class:`pandas.DataFrame` or :class:`pandas.Series` + May contain two or more arguments, all of which must be indexed by a + :class:`pandas.IntervalIndex` containing disjoint intervals. + Every :class:`pandas.Series` must have a name. +how : {"left", "right", "inner", "outer"}, default "left" + What sort of join to perform. +suffixes : list of str or None, default None + Suffixes to use for overlapping columns. If used then should be same length as *frames_or_series*. +sort : bool, default False + Order result DataFrame lexicographically by the join key. If False, the order of the join key depends on the join type. + +Returns +---------- +:class:`pandas.DataFrame` + A dataframe containing columns from elements of *frames_or_series* + +Examples +---------- + +>>> import pandas as pd +>>> import piso + +>>> df = pd.DataFrame( +... {"A":[4,3], "B":["x","y"]}, +... index=pd.IntervalIndex.from_tuples([(1,3), (5,7)]), +... ) +>>> s = pd.Series( +... [True, False], +... index=pd.IntervalIndex.from_tuples([(2,4), (5,6)]), +... name="C", +... 
) + +>>> piso.join(df, s) + A B C +(1, 2] 4 x NaN +(2, 3] 4 x True +(5, 6] 3 y False +(6, 7] 3 y NaN + +>>> piso.join(df, s, how="right") + A B C +(2, 3] 4.0 x True +(3, 4] NaN NaN True +(5, 6] 3.0 y False + +>>> piso.join(df, s, how="inner") + A B C +(2, 3] 4 x True +(5, 6] 3 y False + +>>> piso.join(df, s, how="outer") + A B C +(1, 2] 4.0 x NaN +(2, 3] 4.0 x True +(5, 6] 3.0 y False +(6, 7] 3.0 y NaN +(3, 4] NaN NaN True + +>>> piso.join(df, s, how="outer", sort=True) + A B C +(1, 2] 4.0 x NaN +(2, 3] 4.0 x True +(3, 4] NaN NaN True +(5, 6] 3.0 y False +(6, 7] 3.0 y NaN + +>>> piso.join(df, df, suffixes=["", "2"]) + A B A2 B2 +(1, 3] 4 x 4 x +(5, 7] 3 y 3 y + +>>> df2 = pd.DataFrame( +... {"D":[1,2]}, +... index=pd.IntervalIndex.from_tuples([(1,2), (6,7)]), +... ) + +>>> piso.join(df, s, df2) + A B C D +(1, 2] 4 x NaN 1.0 +(2, 3] 4 x True NaN +(5, 6] 3 y False NaN +(6, 7] 3 y NaN 2.0 + +>>> piso.join(df, s, df2, how="right") + D A B C +(1, 2] 1 4 x NaN +(6, 7] 2 3 y NaN +""" diff --git a/piso/ndframe.py b/piso/ndframe.py index be771ad..c10bd7a 100644 --- a/piso/ndframe.py +++ b/piso/ndframe.py @@ -1,3 +1,5 @@ +import itertools + import numpy as np import pandas as pd @@ -23,3 +25,100 @@ def lookup(frame_or_series, x): .iloc[indexer >= 0] .reindex(x) ) + + +def _assert_has_disjoint_interval_index(frame_or_series): + if not isinstance(frame_or_series.index, pd.IntervalIndex): + raise ValueError( + f"Dataframe, or Series, should have IntervalIndex only. Found {type(frame_or_series.index)}." 
+ ) + + +def _get_valid_closed(indexes): + if not all([indexes[0].closed == index.closed for index in indexes[1:]]): + raise ValueError("All IntervalIndex must have the same closed attribute.") + closed = indexes[0].closed + if closed not in ("left", "right"): + raise ValueError( + f"Only IntervalIndex with closed attribute of 'left' or 'right' supported. Found '{closed}'." + ) + return closed + + +def _get_indexers(*dfs): + closed = _get_valid_closed([df.index for df in dfs]) + breaks = itertools.chain( + itertools.chain.from_iterable(df.index.left.values for df in dfs), + itertools.chain.from_iterable(df.index.right.values for df in dfs), + ) + tiling_index = pd.IntervalIndex.from_breaks(sorted(set(breaks))) + lookups = tiling_index.left if closed == "left" else tiling_index.right + indexers = [intervalarray.get_indexer(df.index, lookups) for df in dfs] + return tiling_index, indexers + + +def _handle_overlapping_columns(frames, suffixes): + col_counts = pd.Series.value_counts(list(itertools.chain.from_iterable(frames))) + common_columns = col_counts[col_counts > 1].index + if len(common_columns) > 0: + if len(suffixes) != len(frames): + raise ValueError( + "Overlapping column names found. A suffix must be supplied for every join argument." 
+ ) + frames = [ + df.rename(columns=dict(zip(common_columns, common_columns + suffix))) + for df, suffix in zip(frames, suffixes) + ] + return frames + + +@Appender(docstrings.join_docstring, join="\n", indents=1) +def join(*frames_or_series, how="left", suffixes=None, sort=False): + if len(frames_or_series) < 2: + raise ValueError("Join operation requires more than one operand.") + for obj in frames_or_series: + _assert_has_disjoint_interval_index(obj) + + def frameify(obj): + if isinstance(obj, pd.Series): + if obj.name is None: + raise ValueError("Series arguments to join must be named.") + obj = obj.to_frame() + return obj + + if suffixes is None: + suffixes = [] + new_frames = [frameify(obj) for obj in frames_or_series] + return _join(*new_frames, how=how, suffixes=suffixes, sort=sort) + + +def _join(*frames, how, suffixes, sort): + + tiling_index, indexers = _get_indexers(*frames) + + if how in ("left", "right"): + i = 0 if how == "left" else -1 + final_indexer = indexers[i] >= 0 + else: + stacked_indexers = np.stack(indexers) >= 0 + log_func = np.any if how == "outer" else np.all + final_indexer = log_func(stacked_indexers, axis=0) + + def _reindex(df, indexer): + adjusted_indexer = final_indexer & (indexer >= 0) + return df.iloc[indexer[adjusted_indexer]].set_index( + tiling_index[adjusted_indexer] + ) + + new_frames = [_reindex(df, indexer) for df, indexer in zip(frames, indexers)] + new_frames = _handle_overlapping_columns(new_frames, suffixes) + + if how == "right": + # hack for pandas not working with right joins on interval index (for unknown reasons) + columns = list(itertools.chain.from_iterable([df.columns for df in new_frames])) + return new_frames[-1].join(new_frames[:-1], how="left", sort=sort)[columns] + return pd.DataFrame.join(new_frames[0], new_frames[1:], how=how, sort=sort) diff --git a/tests/test_ndframe.py b/tests/test_ndframe.py index 75dd106..58dd10e 100644 --- a/tests/test_ndframe.py +++ b/tests/test_ndframe.py @@ -19,6 +19,20 @@ def 
make_ndframe(is_frame, closed, date_type): return df["A"] +def make_ndframe2(is_frame, closed, date_type): + + ia = pd.IntervalIndex.from_tuples([(2, 4), (5, 6), (8, 9)], closed=closed) + if date_type: + ia = map_to_dates(ia, date_type) + df = pd.DataFrame( + {"C": [8, 7, 6], "D": [True, False, True]}, + index=ia, + ) + if is_frame: + return df + return df["C"] + + def make_date(x, date_type): ts = pd.Timestamp(f"2021-10-{x}") if date_type == "numpy": @@ -93,3 +107,426 @@ def test_lookup_exception(): df = pd.DataFrame([1, 2, 3]) with pytest.raises(ValueError): piso.lookup(df, [1, 2]) + + +@pytest.mark.parametrize( + "date_type", + ["timestamp", "numpy", "datetime", "timedelta", None], +) +@pytest.mark.parametrize( + "closed", + ["left", "right"], +) +def test_left_join_frame(closed, date_type): + + ndframe = make_ndframe(True, closed, date_type) + ndframe2 = make_ndframe2(True, closed, date_type) + + index = pd.IntervalIndex.from_tuples([(1, 2), (2, 3), (5, 6), (6, 7)]) + if date_type: + index = map_to_dates(index, date_type) + + result = piso.join(ndframe, ndframe2, how="left") + expected = pd.DataFrame( + { + "A": [4, 4, 3, 3], + "B": ["x", "x", "y", "y"], + "C": [np.nan, 8, 7, np.nan], + "D": [np.nan, True, False, np.nan], + }, + index=index, + ) + pd.testing.assert_frame_equal(result, expected, check_dtype=False) + + +@pytest.mark.parametrize( + "date_type", + ["timestamp", "numpy", "datetime", "timedelta", None], +) +@pytest.mark.parametrize( + "closed", + ["left", "right"], +) +def test_right_join_frame(closed, date_type): + + ndframe = make_ndframe(True, closed, date_type) + ndframe2 = make_ndframe2(True, closed, date_type) + + index = pd.IntervalIndex.from_tuples([(2, 3), (3, 4), (5, 6), (8, 9)]) + if date_type: + index = map_to_dates(index, date_type) + + result = piso.join(ndframe, ndframe2, how="right") + expected = pd.DataFrame( + { + "A": [4, np.nan, 3, np.nan], + "B": ["x", np.nan, "y", np.nan], + "C": [8, 8, 7, 6], + "D": [True, True, False, True], + 
}, + index=index, + ) + pd.testing.assert_frame_equal(result, expected, check_dtype=False) + + +@pytest.mark.parametrize( + "date_type", + ["timestamp", "numpy", "datetime", "timedelta", None], +) +@pytest.mark.parametrize( + "closed", + ["left", "right"], +) +def test_inner_join_frame(closed, date_type): + + ndframe = make_ndframe(True, closed, date_type) + ndframe2 = make_ndframe2(True, closed, date_type) + + index = pd.IntervalIndex.from_tuples([(2, 3), (5, 6)]) + if date_type: + index = map_to_dates(index, date_type) + + result = piso.join(ndframe, ndframe2, how="inner") + expected = pd.DataFrame( + { + "A": [4, 3], + "B": ["x", "y"], + "C": [8, 7], + "D": [True, False], + }, + index=index, + ) + pd.testing.assert_frame_equal(result, expected, check_dtype=False) + + +@pytest.mark.parametrize( + "date_type", + ["timestamp", "numpy", "datetime", "timedelta", None], +) +@pytest.mark.parametrize( + "closed", + ["left", "right"], +) +def test_outer_join_frame(closed, date_type): + + ndframe = make_ndframe(True, closed, date_type) + ndframe2 = make_ndframe2(True, closed, date_type) + + index = pd.IntervalIndex.from_tuples( + [(1, 2), (2, 3), (3, 4), (5, 6), (6, 7), (8, 9)] + ) + if date_type: + index = map_to_dates(index, date_type) + + result = piso.join(ndframe, ndframe2, how="outer", sort=True) + expected = pd.DataFrame( + { + "A": [4, 4, np.nan, 3, 3, np.nan], + "B": ["x", "x", np.nan, "y", "y", np.nan], + "C": [np.nan, 8, 8, 7, np.nan, 6], + "D": [np.nan, True, True, False, np.nan, True], + }, + index=index, + ) + pd.testing.assert_frame_equal(result, expected, check_dtype=False) + + +@pytest.mark.parametrize( + "date_type", + ["timestamp", "numpy", "datetime", "timedelta", None], +) +@pytest.mark.parametrize( + "closed", + ["left", "right"], +) +@pytest.mark.parametrize( + "how", + ["left", "right", "inner", "outer"], +) +def test_join_frame_lsuffix(closed, date_type, how): + + ndframe = make_ndframe(True, closed, date_type) + + index = 
pd.IntervalIndex.from_tuples([(1, 3), (5, 7)]) + if date_type: + index = map_to_dates(index, date_type) + + result = piso.join(ndframe, ndframe, how=how, suffixes=["_1", ""]) + expected = pd.DataFrame( + { + "A_1": [4, 3], + "B_1": ["x", "y"], + "A": [4, 3], + "B": ["x", "y"], + }, + index=index, + ) + pd.testing.assert_frame_equal(result, expected, check_dtype=False, check_like=True) + + +@pytest.mark.parametrize( + "date_type", + ["timestamp", "numpy", "datetime", "timedelta", None], +) +@pytest.mark.parametrize( + "closed", + ["left", "right"], +) +@pytest.mark.parametrize( + "how", + ["left", "right", "inner", "outer"], +) +def test_join_frame_rsuffix(closed, date_type, how): + + ndframe = make_ndframe(True, closed, date_type) + + index = pd.IntervalIndex.from_tuples([(1, 3), (5, 7)]) + if date_type: + index = map_to_dates(index, date_type) + + result = piso.join(ndframe, ndframe, how=how, suffixes=["", "_1"]) + expected = pd.DataFrame( + { + "A_1": [4, 3], + "B_1": ["x", "y"], + "A": [4, 3], + "B": ["x", "y"], + }, + index=index, + ) + pd.testing.assert_frame_equal(result, expected, check_dtype=False, check_like=True) + + +# ------ joins with Series ---------------- + + +@pytest.mark.parametrize( + "date_type", + ["timestamp", "numpy", "datetime", "timedelta", None], +) +@pytest.mark.parametrize( + "closed", + ["left", "right"], +) +def test_left_join_series(closed, date_type): + + ndframe = make_ndframe(False, closed, date_type) + ndframe2 = make_ndframe2(False, closed, date_type) + + index = pd.IntervalIndex.from_tuples([(1, 2), (2, 3), (5, 6), (6, 7)]) + if date_type: + index = map_to_dates(index, date_type) + + result = piso.join(ndframe, ndframe2, how="left") + expected = pd.DataFrame( + { + "A": [4, 4, 3, 3], + "C": [np.nan, 8, 7, np.nan], + }, + index=index, + ) + pd.testing.assert_frame_equal(result, expected, check_dtype=False) + + +@pytest.mark.parametrize( + "date_type", + ["timestamp", "numpy", "datetime", "timedelta", None], +) 
+@pytest.mark.parametrize( + "closed", + ["left", "right"], +) +def test_right_join_series(closed, date_type): + + ndframe = make_ndframe(False, closed, date_type) + ndframe2 = make_ndframe2(False, closed, date_type) + + index = pd.IntervalIndex.from_tuples([(2, 3), (3, 4), (5, 6), (8, 9)]) + if date_type: + index = map_to_dates(index, date_type) + + result = piso.join(ndframe, ndframe2, how="right") + expected = pd.DataFrame( + { + "A": [4, np.nan, 3, np.nan], + "C": [8, 8, 7, 6], + }, + index=index, + ) + pd.testing.assert_frame_equal(result, expected, check_dtype=False) + + +@pytest.mark.parametrize( + "date_type", + ["timestamp", "numpy", "datetime", "timedelta", None], +) +@pytest.mark.parametrize( + "closed", + ["left", "right"], +) +def test_inner_join_series(closed, date_type): + + ndframe = make_ndframe(False, closed, date_type) + ndframe2 = make_ndframe2(False, closed, date_type) + + index = pd.IntervalIndex.from_tuples([(2, 3), (5, 6)]) + if date_type: + index = map_to_dates(index, date_type) + + result = piso.join(ndframe, ndframe2, how="inner") + expected = pd.DataFrame( + { + "A": [4, 3], + "C": [8, 7], + }, + index=index, + ) + pd.testing.assert_frame_equal(result, expected, check_dtype=False) + + +@pytest.mark.parametrize( + "date_type", + ["timestamp", "numpy", "datetime", "timedelta", None], +) +@pytest.mark.parametrize( + "closed", + ["left", "right"], +) +def test_outer_join_series(closed, date_type): + + ndframe = make_ndframe(False, closed, date_type) + ndframe2 = make_ndframe2(False, closed, date_type) + + index = pd.IntervalIndex.from_tuples( + [(1, 2), (2, 3), (3, 4), (5, 6), (6, 7), (8, 9)] + ) + if date_type: + index = map_to_dates(index, date_type) + + result = piso.join(ndframe, ndframe2, how="outer", sort=True) + expected = pd.DataFrame( + { + "A": [4, 4, np.nan, 3, 3, np.nan], + "C": [np.nan, 8, 8, 7, np.nan, 6], + }, + index=index, + ) + pd.testing.assert_frame_equal(result, expected, check_dtype=False) + + +@pytest.mark.parametrize( 
+ "date_type", + ["timestamp", "numpy", "datetime", "timedelta", None], +) +@pytest.mark.parametrize( + "closed", + ["left", "right"], +) +@pytest.mark.parametrize( + "how", + ["left", "right", "inner", "outer"], +) +def test_join_series_lsuffix(closed, date_type, how): + + ndframe = make_ndframe(False, closed, date_type) + + index = pd.IntervalIndex.from_tuples([(1, 3), (5, 7)]) + if date_type: + index = map_to_dates(index, date_type) + + result = piso.join(ndframe, ndframe, how=how, suffixes=["_1", ""]) + expected = pd.DataFrame( + { + "A_1": [4, 3], + "A": [4, 3], + }, + index=index, + ) + pd.testing.assert_frame_equal(result, expected, check_dtype=False, check_like=True) + + +@pytest.mark.parametrize( + "date_type", + ["timestamp", "numpy", "datetime", "timedelta", None], +) +@pytest.mark.parametrize( + "closed", + ["left", "right"], +) +@pytest.mark.parametrize( + "how", + ["left", "right", "inner", "outer"], +) +def test_join_series_rsuffix(closed, date_type, how): + + ndframe = make_ndframe(False, closed, date_type) + + index = pd.IntervalIndex.from_tuples([(1, 3), (5, 7)]) + if date_type: + index = map_to_dates(index, date_type) + + result = piso.join(ndframe, ndframe, how=how, suffixes=["", "_1"]) + expected = pd.DataFrame( + { + "A_1": [4, 3], + "A": [4, 3], + }, + index=index, + ) + pd.testing.assert_frame_equal(result, expected, check_dtype=False, check_like=True) + + +# ---------- join exceptions --------------------------------- + + +def test_lookup_exception_1(): + df = pd.DataFrame([1, 2, 3]) + with pytest.raises(ValueError): + piso.join(df, df) + + +def test_lookup_exception_2(): + df = pd.DataFrame([1, 2, 3]) + with pytest.raises(ValueError): + piso.join(df, df, suffixes=[""]) + + +def test_lookup_exception_3(): + df = pd.DataFrame([1, 2], pd.IntervalIndex.from_tuples([(1, 3), (2, 4)])) + df2 = pd.DataFrame({"col": [1, 2]}, pd.IntervalIndex.from_tuples([(1, 2), (3, 4)])) + with pytest.raises(ValueError): + piso.join(df, df2) + + +def 
test_lookup_exception_4(): + df = pd.DataFrame({"col1": [1, 2]}, pd.IntervalIndex.from_tuples([(1, 2), (3, 4)])) + with pytest.raises(ValueError): + piso.join(df, df) + + +def test_lookup_exception_5(): + s = pd.Series([1, 2], pd.IntervalIndex.from_tuples([(1, 2), (3, 4)]), name="col") + with pytest.raises(ValueError): + piso.join(s, s) + + +def test_lookup_exception_6(): + df = pd.DataFrame({"col1": [1, 2]}, pd.IntervalIndex.from_tuples([(1, 2), (3, 4)])) + s = pd.Series([1, 2], pd.IntervalIndex.from_tuples([(1, 2), (3, 4)])) + with pytest.raises(ValueError): + piso.join(df, s) + + +def test_lookup_exception_7(): + df = pd.DataFrame({"col1": [1, 2]}, pd.IntervalIndex.from_tuples([(1, 2), (3, 4)])) + with pytest.raises(ValueError): + piso.join(df) + + +def test_lookup_exception_8(): + df = pd.DataFrame({"col1": [1, 2]}, pd.IntervalIndex.from_tuples([(1, 2), (3, 4)])) + df2 = pd.DataFrame( + {"col2": [1, 2]}, pd.IntervalIndex.from_tuples([(1, 2), (3, 4)], closed="both") + ) + with pytest.raises(ValueError): + piso.join(df, df2) From 116bc0bce357d554f8446a27282174767df70ee4 Mon Sep 17 00:00:00 2001 From: venaturum Date: Mon, 1 Nov 2021 18:29:28 +1100 Subject: [PATCH 02/10] additional tests for closed values of indexes --- tests/test_ndframe.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/test_ndframe.py b/tests/test_ndframe.py index 58dd10e..75af161 100644 --- a/tests/test_ndframe.py +++ b/tests/test_ndframe.py @@ -524,9 +524,22 @@ def test_lookup_exception_7(): def test_lookup_exception_8(): - df = pd.DataFrame({"col1": [1, 2]}, pd.IntervalIndex.from_tuples([(1, 2), (3, 4)])) + df = pd.DataFrame( + {"col1": [1, 2]}, pd.IntervalIndex.from_tuples([(1, 2), (3, 4)], closed="both") + ) df2 = pd.DataFrame( {"col2": [1, 2]}, pd.IntervalIndex.from_tuples([(1, 2), (3, 4)], closed="both") ) with pytest.raises(ValueError): piso.join(df, df2) + + +def test_lookup_exception_9(): + df = pd.DataFrame( + {"col1": [1, 2]}, 
pd.IntervalIndex.from_tuples([(1, 2), (3, 4)], closed="left") + ) + df2 = pd.DataFrame( + {"col2": [1, 2]}, pd.IntervalIndex.from_tuples([(1, 2), (3, 4)], closed="right") + ) + with pytest.raises(ValueError): + piso.join(df, df2) From 0a03ee38f05f28e7ef6fa950a5ec83dbd6b6983b Mon Sep 17 00:00:00 2001 From: Riley Clement Date: Mon, 1 Nov 2021 23:27:00 +1100 Subject: [PATCH 03/10] adding football case study with staircase --- docs/user_guide/case_studies/football.rst | 5 +- .../case_studies/football_staircase.rst | 127 ++++++++++++++++++ docs/user_guide/faq.rst | 2 +- 3 files changed, 132 insertions(+), 2 deletions(-) create mode 100644 docs/user_guide/case_studies/football_staircase.rst diff --git a/docs/user_guide/case_studies/football.rst b/docs/user_guide/case_studies/football.rst index 56a2ecd..adc348d 100644 --- a/docs/user_guide/case_studies/football.rst +++ b/docs/user_guide/case_studies/football.rst @@ -105,4 +105,7 @@ Using this dataframe we will now provide answers for miscellaneous questions. I .. ipython:: python - piso.lookup(CvsL, pd.Timedelta(80, unit="min")) \ No newline at end of file + piso.lookup(CvsL, pd.Timedelta(80, unit="min")) + + +This analysis is also straightforward using :class:`staircase.Stairs`. For more information on this please see the :ref:`corresponding example with staircase <user_guide.football_staircase_example>`. \ No newline at end of file diff --git a/docs/user_guide/case_studies/football_staircase.rst b/docs/user_guide/case_studies/football_staircase.rst new file mode 100644 index 0000000..fbc2e1e --- /dev/null +++ b/docs/user_guide/case_studies/football_staircase.rst @@ -0,0 +1,127 @@ +.. _user_guide.football_staircase_example: + + +Analysis of scores in a football match (using staircase) +=========================================================== + +.. 
ipython:: python + :suppress: + + import matplotlib.pyplot as plt + import matplotlib.ticker as ticker + plt.style.use('seaborn') + +This example demonstrates how :mod:`staircase` can be used to mirror the functionality +and analysis presented in the :ref:`corresponding example with piso <user_guide.football_example>`. + + The Champions League quarter-final between Chelsea and Liverpool + in 2009 is recognised as among the best games of all time. + Liverpool scored twice in the first half at 19' and 28'. Chelsea then + opened their account in the second half with three unanswered goals + at 51', 57' and 76'. Liverpool responded with two goals at 81' and 83' + to put themselves ahead, however Chelsea drew with a last minute goal + at 89' and advanced to the next stage on aggregate. + + +We start by importing :mod:`pandas` and :mod:`staircase` + +.. ipython:: python + + import pandas as pd + import staircase as sc + + +For the analysis we will create a :class:`staircase.Stairs` for each team, and wrap them up in a :class:`pandas.Series` which is indexed by the club names. Using a Series in this way is by no means necessary but can be useful. We'll create a function `make_stairs` which takes the minute marks of the goals and returns a :class:`staircase.Stairs`. Each step function will be monotonically non-decreasing. + +.. ipython:: python + + def make_stairs(goal_time_mins): + breaks = pd.to_timedelta(goal_time_mins, unit="min") + return sc.Stairs(start=breaks).clip(pd.Timedelta(0), pd.Timedelta("90m")) + + scores = pd.Series( + { + "chelsea":make_stairs([51,57,76,89]), + "liverpool":make_stairs([19,28,81,83]), + } + ) + scores + + +To clarify we plot these step functions below. + +.. 
ipython:: python + :suppress: + + fig, axes = plt.subplots(ncols=2, figsize=(8,3), sharey=True) + vals = scores["chelsea"].step_values + vals.index = vals.index/pd.Timedelta("1min") + sc.Stairs.from_values(0, vals).plot(axes[0]) + axes[0].set_title("Chelsea") + axes[0].set_xlabel("time (mins)") + axes[0].set_ylabel("score") + axes[0].yaxis.set_major_locator(ticker.MultipleLocator()) + axes[0].set_xlim(0,90) + vals = scores["liverpool"].step_values + vals.index = vals.index/pd.Timedelta("1min") + sc.Stairs.from_values(0, vals).plot(axes[1]) + axes[1].set_title("Liverpool") + axes[1].set_xlabel("time (mins)") + axes[1].set_ylabel("score") + @savefig case_study_football_staircase.png + plt.tight_layout(); + + +To enable analysis for separate halves of the game we'll define a similar Series which defines the time intervals for each half with tuples of :class:`pandas.Timedelta` objects. + +.. ipython:: python + + halves = pd.Series( + { + "1st":(pd.Timedelta(0), pd.Timedelta("45m")), + "2nd":(pd.Timedelta("45m"), pd.Timedelta("90m")), + } + ) + halves + + +We can now use our *scores* and *halves* Series to provide answers for miscellaneous questions. Note that comparing :class:`staircase.Stairs` objects with relational operators produces boolean-valued step functions (Stairs objects). Finding the integral of these boolean step functions is equivalent to summing up lengths of intervals in the domain where the step function is equal to one. + +**How much game time did Chelsea lead for?** + +.. ipython:: python + + (scores["chelsea"] > scores["liverpool"]).integral() + + +**How much game time did Liverpool lead for?** + +.. ipython:: python + + (scores["chelsea"] < scores["liverpool"]).integral() + +**How much game time were the teams tied for?** + +.. ipython:: python + + (scores["chelsea"] == scores["liverpool"]).integral() + +**How much game time in the first half were the teams tied for?** + +.. 
ipython:: python + + (scores["chelsea"] == scores["liverpool"]).where(halves["1st"]).integral() + +**For how long did Liverpool lead Chelsea by exactly one goal (split by half)?** + +.. ipython:: python + + halves.apply(lambda x: + (scores["liverpool"]==scores["chelsea"]+1).where(x).integral() + ) + +**What was the score at the 80 minute mark?** + +.. ipython:: python + + sc.sample(scores, pd.Timedelta("80m")) \ No newline at end of file diff --git a/docs/user_guide/faq.rst b/docs/user_guide/faq.rst index 8a05afc..4418837 100644 --- a/docs/user_guide/faq.rst +++ b/docs/user_guide/faq.rst @@ -77,4 +77,4 @@ Frequently asked questions .. dropdown:: What if I want to map intervals with a scalar? This question may arise if, for example, a :class:`pandas.Series` with a numerical dtype, was indexed with a :class:`pandas.IntervalIndex`. - Given two intervals, and their associated scalar values, a user may wish to find the overlap of these intervals, and map it to the minimum of the two scalar values - or perhaps the addition of the scalar values. These sorts of manipulations can be achieved via :mod:`staircase`. There is a one-to-one mapping between sets of disjoint intervals (with associated scalars) and step functions, which is what motivates the internal implementations of `piso`. :mod:`staircase` provides a comprehensive range of arithmetic, logical, relational and statistical methods for working with step functions. + Given two intervals, and their associated scalar values, a user may wish to find the overlap of these intervals, and map it to the minimum of the two scalar values - or perhaps the addition of the scalar values. These sorts of manipulations can be achieved via :mod:`staircase`. There is a one-to-one mapping between sets of disjoint intervals (with associated scalars) and step functions, which is what motivates the internal implementations of `piso`. 
:mod:`staircase` provides a comprehensive range of arithmetic, logical, relational and statistical methods for working with step functions. For related case studies see the :ref:`football case study with piso ` and the :ref:`football case study with staircase ` From 8821899b0622892fde0ab603e9baaec2ca001dff Mon Sep 17 00:00:00 2001 From: venaturum Date: Tue, 2 Nov 2021 15:33:05 +1100 Subject: [PATCH 04/10] faster implementation for get_indexer --- piso/intervalarray.py | 16 +++++++++------- piso/ndframe.py | 6 ++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/piso/intervalarray.py b/piso/intervalarray.py index 7df05f3..811b0ed 100644 --- a/piso/intervalarray.py +++ b/piso/intervalarray.py @@ -199,10 +199,12 @@ def complement(interval_array, domain=None): def get_indexer(interval_array, x): if not isdisjoint(interval_array): raise ValueError("get_indexer method is only valid for disjoint intervals.") - return sc.Stairs( - start=interval_array.left, - end=interval_array.right, - value=range(1, len(interval_array) + 1), - initial_value=-1, - closed=interval_array.closed, - )(x) + starts = interval_array.left.values + ends = interval_array.right.values + x = pd.Series(x).values + num_ints = len(starts) + if interval_array.closed == "right": + m1 = np.less_equal.outer(x, ends) & np.greater.outer(x, starts) + else: + m1 = np.less.outer(x, ends) & np.greater_equal.outer(x, starts) + return (m1.dot(np.linspace(1, num_ints, num_ints)) - 1).astype(int) diff --git a/piso/ndframe.py b/piso/ndframe.py index c10bd7a..5c1a919 100644 --- a/piso/ndframe.py +++ b/piso/ndframe.py @@ -12,11 +12,9 @@ def lookup(frame_or_series, x): if not isinstance(frame_or_series.index, pd.IntervalIndex): raise ValueError("DataFrame or Series must be indexed by an IntervalIndex") - indexer = intervalarray.get_indexer(frame_or_series.index, x) - if not hasattr(indexer, "__len__"): - indexer = np.array([indexer]) if not hasattr(x, "__len__"): - x = [x] + x = np.array(x, ndmin=1) + 
indexer = intervalarray.get_indexer(frame_or_series.index, x) return ( frame_or_series.__class__( data=frame_or_series, From a473b70bdd8a862ff0a23de2974ffbe78a1f7aa8 Mon Sep 17 00:00:00 2001 From: venaturum Date: Tue, 2 Nov 2021 15:46:37 +1100 Subject: [PATCH 05/10] doc update --- docs/release_notes/index.rst | 4 ++-- docs/user_guide/case_studies/football.rst | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/release_notes/index.rst b/docs/release_notes/index.rst index d952728..e412d5c 100644 --- a/docs/release_notes/index.rst +++ b/docs/release_notes/index.rst @@ -4,9 +4,9 @@ Release notes ======================== -Added the following methods -- :meth:`piso.join` for *join operations* on interval indexes +- added :func:`piso.join` for *join operations* with interval indexes +- faster implementation for :func:`piso.get_indexer` ADD UNRELEASED CHANGES ABOVE THIS LINE diff --git a/docs/user_guide/case_studies/football.rst b/docs/user_guide/case_studies/football.rst index adc348d..bdb58be 100644 --- a/docs/user_guide/case_studies/football.rst +++ b/docs/user_guide/case_studies/football.rst @@ -108,4 +108,4 @@ Using this dataframe we will now provide answers for miscellaneous questions. I piso.lookup(CvsL, pd.Timedelta(80, unit="min")) -This analysis is also straightforward using :mod:`staircase.Stairs`. For more information on this please see the :ref:`corresponding example with staircase ` \ No newline at end of file +This analysis is also straightforward using :mod:`staircase`. 
For more information on this please see the :ref:`corresponding example with staircase ` \ No newline at end of file From 3a5afa9f0dfa85f4dc7f78b5236f9fbc14d85d6a Mon Sep 17 00:00:00 2001 From: Venaturum Date: Tue, 2 Nov 2021 20:19:15 +1100 Subject: [PATCH 06/10] added contains functions (+ docs + tests) (#GH27) (#28) --- docs/reference/accessors.rst | 1 + docs/reference/package.rst | 1 + docs/release_notes/index.rst | 4 ++- piso/__init__.py | 1 + piso/accessor.py | 8 +++++ piso/docstrings/accessor.py | 50 +++++++++++++++++++++++++++ piso/docstrings/intervalarray.py | 51 +++++++++++++++++++++++++++ piso/intervalarray.py | 25 +++++++++----- tests/test_single_interval_array.py | 53 +++++++++++++++++++++++++++++ 9 files changed, 185 insertions(+), 9 deletions(-) diff --git a/docs/reference/accessors.rst b/docs/reference/accessors.rst index b87774f..0ad0a59 100644 --- a/docs/reference/accessors.rst +++ b/docs/reference/accessors.rst @@ -18,4 +18,5 @@ Accessors ArrayAccessor.issubset ArrayAccessor.coverage ArrayAccessor.complement + ArrayAccessor.contains ArrayAccessor.get_indexer \ No newline at end of file diff --git a/docs/reference/package.rst b/docs/reference/package.rst index 4cdfcdc..59f084c 100644 --- a/docs/reference/package.rst +++ b/docs/reference/package.rst @@ -20,6 +20,7 @@ Top level functions issubset coverage complement + contains get_indexer lookup join \ No newline at end of file diff --git a/docs/release_notes/index.rst b/docs/release_notes/index.rst index e412d5c..0553e79 100644 --- a/docs/release_notes/index.rst +++ b/docs/release_notes/index.rst @@ -5,7 +5,9 @@ Release notes ======================== -- added :func:`piso.join` for *join operations* with interval indexes +- added :meth:`piso.join` for *join operations* with interval indexes +- added :meth:`piso.contains` +- added :meth:`ArrayAccessor.contains() ` - faster implementation for :func:`piso.get_indexer` ADD UNRELEASED CHANGES ABOVE THIS LINE diff --git a/piso/__init__.py 
b/piso/__init__.py index fbe954a..87dee33 100644 --- a/piso/__init__.py +++ b/piso/__init__.py @@ -1,5 +1,6 @@ from piso.intervalarray import ( complement, + contains, coverage, difference, get_indexer, diff --git a/piso/accessor.py b/piso/accessor.py index 470ac64..4fd5e03 100644 --- a/piso/accessor.py +++ b/piso/accessor.py @@ -162,6 +162,14 @@ def get_indexer(self, x): x, ) + @Appender(docstrings.contains_docstring, join="\n", indents=1) + def contains(self, x, include_index=True): + return intervalarray.contains( + self._interval_array, + x, + include_index, + ) + def _register_accessors(): _register_accessor("piso", pd.IntervalIndex)(ArrayAccessor) diff --git a/piso/docstrings/accessor.py b/piso/docstrings/accessor.py index 3174325..dc9963d 100644 --- a/piso/docstrings/accessor.py +++ b/piso/docstrings/accessor.py @@ -726,3 +726,53 @@ def join_params(list_of_param_strings): >>> arr.piso.get_indexer([1,2,7,13]) array([ 0, 0, 1, -1], dtype=int64) """ + + +contains_docstring = """ +Check pair-wise if a set of intervals, belonging to the object the accessor belongs to, contains a set of values. + +Returns a 2-dimensional boolean mask *M* of shape *(m,n)* where *m* is the number of intervals, and +*n* is the number of points. The element in the i-th row and j-th column is True if +the i-th interval contains the j-th point. + +Parameters +---------- +x : scalar, or array-like of scalars + Values in *x* should belong to the same domain as the intervals in *interval_array*. +include_index : boolean, default True + Indicates whether to return a :class:`numpy.ndarray` or :class:`pandas.DataFrame` indexed + by *interval_array* and column names equal to *x* + +Returns +---------- +:class:`numpy.ndarray` or :class:`pandas.DataFrame` + Two dimensional and boolean valued. Return type dependent on *include_index*. + +Examples +----------- + +>>> import pandas as pd +>>> import piso +>>> piso.register_accessors() + +>>> arr = pd.arrays.IntervalArray.from_tuples( +... 
[(0, 4), (2, 5)], +... ) + +>>> arr.piso.contains(1) + 1 +(0, 4] True +(2, 5] False + +>>> arr.piso.contains([0, 1, 3, 4]) + 0 1 3 4 +(0, 4] False True True True +(2, 5] False False True True + +>>> arr.piso.contains([0, 1, 3, 4], include_index=False) +array([[False, True, True, True], + [False, False, True, True]]) + +>>> pd.IntervalIndex.from_tuples([(0,2)]).piso.contains(1, include_index=False) +array([[ True]]) +""" diff --git a/piso/docstrings/intervalarray.py b/piso/docstrings/intervalarray.py index 04be9e6..7650548 100644 --- a/piso/docstrings/intervalarray.py +++ b/piso/docstrings/intervalarray.py @@ -734,3 +734,54 @@ def join_params(list_of_param_strings): >>> piso.get_indexer(arr, [1,2,7,13]) array([ 0, 0, 1, -1], dtype=int64) """ + + +contains_docstring = """ +Check pair-wise if a set of intervals contains a set of values + +Returns a 2-dimensional boolean mask *M* of shape *(m,n)* where *m* is the number of intervals, and +*n* is the number of points. The element in the i-th row and j-th column is True if +the i-th interval contains the j-th point. + +Parameters +---------- +interval_array : :class:`pandas.IntervalIndex` or :class:`pandas.arrays.IntervalArray` + Contains the intervals. Must be left-closed or right-closed. +x : scalar, or array-like of scalars + Values in *x* should belong to the same domain as the intervals in *interval_array*. +include_index : boolean, default True + Indicates whether to return a :class:`numpy.ndarray` or :class:`pandas.DataFrame` indexed + by *interval_array* and column names equal to *x* + +Returns +---------- +:class:`numpy.ndarray` or :class:`pandas.DataFrame` + Two dimensional and boolean valued. Return type dependent on *include_index*. + +Examples +----------- + +>>> import pandas as pd +>>> import piso + +>>> arr = pd.arrays.IntervalArray.from_tuples( +... [(0, 4), (2, 5)], +... 
) + +>>> piso.contains(arr, 1) + 1 +(0, 4] True +(2, 5] False + +>>> piso.contains(arr, [0, 1, 3, 4]) + 0 1 3 4 +(0, 4] False True True True +(2, 5] False False True True + +>>> piso.contains(arr, [0, 1, 3, 4], include_index=False) +array([[False, True, True, True], + [False, False, True, True]]) + +>>> piso.contains(pd.IntervalIndex.from_tuples([(0,2)]), 1, include_index=False) +array([[ True]]) +""" diff --git a/piso/intervalarray.py b/piso/intervalarray.py index 811b0ed..fc3bd2c 100644 --- a/piso/intervalarray.py +++ b/piso/intervalarray.py @@ -195,16 +195,25 @@ def complement(interval_array, domain=None): return _boolean_stairs_to_interval_array(result, interval_array.__class__) -@Appender(docstrings.get_indexer_docstring, join="\n", indents=1) -def get_indexer(interval_array, x): - if not isdisjoint(interval_array): - raise ValueError("get_indexer method is only valid for disjoint intervals.") +@Appender(docstrings.contains_docstring, join="\n", indents=1) +def contains(interval_array, x, include_index=True): starts = interval_array.left.values ends = interval_array.right.values x = pd.Series(x).values - num_ints = len(starts) if interval_array.closed == "right": - m1 = np.less_equal.outer(x, ends) & np.greater.outer(x, starts) + result = np.less_equal.outer(x, ends) & np.greater.outer(x, starts) else: - m1 = np.less.outer(x, ends) & np.greater_equal.outer(x, starts) - return (m1.dot(np.linspace(1, num_ints, num_ints)) - 1).astype(int) + result = np.less.outer(x, ends) & np.greater_equal.outer(x, starts) + result = result.transpose() + if include_index: + return pd.DataFrame(result, index=interval_array, columns=x) + return result + + +@Appender(docstrings.get_indexer_docstring, join="\n", indents=1) +def get_indexer(interval_array, x): + if not isdisjoint(interval_array): + raise ValueError("get_indexer method is only valid for disjoint intervals.") + ia_length = len(interval_array) + contain_matrix = contains(interval_array, x, include_index=False) + return 
(np.linspace(1, ia_length, ia_length).dot(contain_matrix) - 1).astype(int) diff --git a/tests/test_single_interval_array.py b/tests/test_single_interval_array.py index 0050913..d086fbe 100644 --- a/tests/test_single_interval_array.py +++ b/tests/test_single_interval_array.py @@ -20,6 +20,7 @@ def get_accessor_method(self, function): piso_intervalarray.coverage: self.piso.coverage, piso_intervalarray.complement: self.piso.complement, piso_intervalarray.get_indexer: self.piso.get_indexer, + piso_intervalarray.contains: self.piso.contains, }[function] @@ -34,6 +35,7 @@ def get_package_method(function): piso_intervalarray.coverage: piso.coverage, piso_intervalarray.complement: piso.complement, piso_intervalarray.get_indexer: piso.get_indexer, + piso_intervalarray.contains: piso.contains, }[function] @@ -665,3 +667,54 @@ def test_get_indexer_exception(how): how=how, function=piso_intervalarray.get_indexer, ) + + +@pytest.mark.parametrize( + "interval_index", + [True, False], +) +@pytest.mark.parametrize( + "x, closed, expected", + [ + (0, "left", [[True], [False], [False]]), + (0, "right", [[False], [False], [False]]), + (6, "left", [[False], [False], [False]]), + (6, "right", [[False], [False], [True]]), + ( + [2, 4, 5], + "left", + [[True, False, False], [True, True, False], [False, True, True]], + ), + ( + [2, 4, 5], + "right", + [[True, True, False], [False, True, True], [False, True, True]], + ), + ], +) +@pytest.mark.parametrize( + "how", + ["supplied", "accessor", "package"], +) +@pytest.mark.parametrize( + "include_index", + [True, False], +) +def test_contains(interval_index, x, closed, expected, how, include_index): + ia = make_ia2(interval_index, closed) + result = perform_op( + ia, + x, + include_index, + how=how, + function=piso_intervalarray.contains, + ) + print(result) + print(ia) + print(x) + if include_index: + expected_result = pd.DataFrame(expected, index=ia, columns=np.array(x, ndmin=1)) + pd.testing.assert_frame_equal(result, expected_result, 
check_dtype=False) + else: + expected_result = np.array(expected) + assert (result == expected_result).all() From 3e4b7c02bba9dbd85587b2e7b961c3ca5ef8075e Mon Sep 17 00:00:00 2001 From: Venaturum Date: Tue, 2 Nov 2021 20:55:25 +1100 Subject: [PATCH 07/10] speedup for piso.lookup (#30) --- docs/release_notes/index.rst | 45 +++++++++++++++------------- docs/user_guide/case_studies/tax.rst | 6 ++-- piso/ndframe.py | 13 ++++---- 3 files changed, 33 insertions(+), 31 deletions(-) diff --git a/docs/release_notes/index.rst b/docs/release_notes/index.rst index 0553e79..2c2a928 100644 --- a/docs/release_notes/index.rst +++ b/docs/release_notes/index.rst @@ -4,11 +4,16 @@ Release notes ======================== +Added the following methods + +- :func:`piso.join` for *join operations* with interval indexes +- :func:`piso.contains` +- :meth:`ArrayAccessor.contains() ` + +Performance improvements for -- added :meth:`piso.join` for *join operations* with interval indexes -- added :meth:`piso.contains` -- added :meth:`ArrayAccessor.contains() ` -- faster implementation for :func:`piso.get_indexer` +- :func:`piso.lookup` +- :func:`piso.get_indexer` ADD UNRELEASED CHANGES ABOVE THIS LINE @@ -17,8 +22,8 @@ ADD UNRELEASED CHANGES ABOVE THIS LINE Added the following methods -- :meth:`piso.lookup` -- :meth:`piso.get_indexer` +- :func:`piso.lookup` +- :func:`piso.get_indexer` - :meth:`ArrayAccessor.get_indexer() ` @@ -26,8 +31,8 @@ Added the following methods Added the following methods -- :meth:`piso.coverage` -- :meth:`piso.complement` +- :func:`piso.coverage` +- :func:`piso.complement` - :meth:`ArrayAccessor.coverage() ` - :meth:`ArrayAccessor.complement() ` @@ -36,9 +41,9 @@ Added the following methods Added the following methods -- :meth:`piso.isdisjoint` -- :meth:`piso.issuperset` -- :meth:`piso.issubset` +- :func:`piso.isdisjoint` +- :func:`piso.issuperset` +- :func:`piso.issubset` - :meth:`ArrayAccessor.isdisjoint() ` - :meth:`ArrayAccessor.issuperset() ` - 
:meth:`ArrayAccessor.issubset() ` @@ -50,17 +55,17 @@ Added the following methods The following methods are included in the initial release of `piso` -- :meth:`piso.register_accessors` -- :meth:`piso.union` -- :meth:`piso.intersection` -- :meth:`piso.difference` -- :meth:`piso.symmetric_difference` +- :func:`piso.register_accessors` +- :func:`piso.union` +- :func:`piso.intersection` +- :func:`piso.difference` +- :func:`piso.symmetric_difference` - :meth:`ArrayAccessor.union() ` - :meth:`ArrayAccessor.intersection() ` - :meth:`ArrayAccessor.difference() ` - :meth:`ArrayAccessor.symmetric_difference() ` -- :meth:`piso.interval.union` -- :meth:`piso.interval.intersection` -- :meth:`piso.interval.difference` -- :meth:`piso.interval.symmetric_difference` +- :func:`piso.interval.union` +- :func:`piso.interval.intersection` +- :func:`piso.interval.difference` +- :func:`piso.interval.symmetric_difference` diff --git a/docs/user_guide/case_studies/tax.rst b/docs/user_guide/case_studies/tax.rst index cc2fe7e..a7abedc 100644 --- a/docs/user_guide/case_studies/tax.rst +++ b/docs/user_guide/case_studies/tax.rst @@ -102,7 +102,7 @@ We can then use a vectorised calculation for the tax payable: Alternative approaches ----------------------- -There are a couple of alternative solutions which do not require :mod:`piso` which we detail below. +There are a couple of alternative, straightforward solutions which do not require :mod:`piso` which we detail below. **Alternative 1: pandas.cut** @@ -113,7 +113,7 @@ The `tax_params` dataframe that was produced above by :func:`piso.lookup` can be tax_params = tax_rates.loc[pd.cut(income, tax_brackets)].set_index(income) tax_params -This approach however runs approximately 3 times slower than :func:`piso.lookup`. +This approach however runs approximately 20 times slower than :func:`piso.lookup`. 
**Alternative 2: applying function** @@ -138,7 +138,7 @@ The function can then used with `pandas.Series.apply` income.apply(calc_tax) -This approach is the fastest - approximately 3 times faster than :func:`piso.lookup` - but it does a function to be defined which is relatively cumbersome to implement. This approach becomes increasingly unattractive, and error prone, as the number of tax brackets increases. +This approach runs approximately 3 times slower than :func:`piso.lookup`. It also requires a function to be defined which is relatively cumbersome to implement. This approach becomes increasingly unattractive, and error prone, as the number of tax brackets increases. diff --git a/piso/ndframe.py b/piso/ndframe.py index 5c1a919..7aa0163 100644 --- a/piso/ndframe.py +++ b/piso/ndframe.py @@ -15,14 +15,11 @@ def lookup(frame_or_series, x): if not hasattr(x, "__len__"): x = np.array(x, ndmin=1) indexer = intervalarray.get_indexer(frame_or_series.index, x) - return ( - frame_or_series.__class__( - data=frame_or_series, - index=x, - ) - .iloc[indexer >= 0] - .reindex(x) - ) + result = frame_or_series.copy().iloc[indexer].set_axis(x) + set_nan = indexer == -1 + if set_nan.any(): + result.loc[set_nan] = np.nan + return result def _assert_has_disjoint_interval_index(frame_or_series): From dd1520dd34314ac6c4552dfd28d52d6f49eac3c5 Mon Sep 17 00:00:00 2001 From: Riley Clement Date: Tue, 2 Nov 2021 21:11:16 +1100 Subject: [PATCH 08/10] Updates for football case study --- docs/user_guide/case_studies/football.rst | 11 ++++++----- docs/user_guide/case_studies/football_staircase.rst | 11 ++++++----- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/docs/user_guide/case_studies/football.rst b/docs/user_guide/case_studies/football.rst index bdb58be..fc1ef3d 100644 --- a/docs/user_guide/case_studies/football.rst +++ b/docs/user_guide/case_studies/football.rst @@ -8,11 +8,12 @@ In this example we will look at a football match from 2009: The Champions League 
quarter-final between Chelsea and Liverpool in 2009 is recognised as among the best games of all time. - Liverpool scored twice in the first half at 19'and 28'. Chelsea then - opened their account in the second half with three unanswered goals - at 51', 57' and 76'. Liverpool responded with two goals at 81' and 83' - to put themselves ahead, however Chelsea drew with a last minute goal - at 89' and advanced to the next stage on aggregate. + Liverpool scored twice in the first half in the 19th and 28th minute. + Chelsea then opened their account in the second half with three + unanswered goals in the 51st, 57th and 76th minute. Liverpool + responded with two goals in the 81st and 83rd minute to put themselves + ahead, however Chelsea drew with a goal in the 89th minte and advanced + to the next stage on aggregate. We start by importing :mod:`pandas` and :mod:`piso` diff --git a/docs/user_guide/case_studies/football_staircase.rst b/docs/user_guide/case_studies/football_staircase.rst index fbc2e1e..ba44a69 100644 --- a/docs/user_guide/case_studies/football_staircase.rst +++ b/docs/user_guide/case_studies/football_staircase.rst @@ -16,11 +16,12 @@ and analysis presented in the :ref:`corresponding example with piso Date: Tue, 2 Nov 2021 21:19:04 +1100 Subject: [PATCH 09/10] Typo in football case study --- docs/user_guide/case_studies/football.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user_guide/case_studies/football.rst b/docs/user_guide/case_studies/football.rst index fc1ef3d..0ef0813 100644 --- a/docs/user_guide/case_studies/football.rst +++ b/docs/user_guide/case_studies/football.rst @@ -12,7 +12,7 @@ In this example we will look at a football match from 2009: Chelsea then opened their account in the second half with three unanswered goals in the 51st, 57th and 76th minute. 
Liverpool responded with two goals in the 81st and 83rd minute to put themselves - ahead, however Chelsea drew with a goal in the 89th minte and advanced + ahead, however Chelsea drew with a goal in the 89th minute and advanced to the next stage on aggregate. From 7b69907d16cea6200b010a027818f0fb29c903ae Mon Sep 17 00:00:00 2001 From: Riley Clement Date: Tue, 2 Nov 2021 21:43:49 +1100 Subject: [PATCH 10/10] v0.5.0 --- docs/release_notes/index.rst | 7 +++++-- pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/release_notes/index.rst b/docs/release_notes/index.rst index 2c2a928..7b6f60c 100644 --- a/docs/release_notes/index.rst +++ b/docs/release_notes/index.rst @@ -4,6 +4,11 @@ Release notes ======================== + +ADD UNRELEASED CHANGES ABOVE THIS LINE + +**v0.5.0 2021-11-02** + Added the following methods - :func:`piso.join` for *join operations* with interval indexes @@ -15,8 +20,6 @@ Performance improvements for - :func:`piso.lookup` - :func:`piso.get_indexer` -ADD UNRELEASED CHANGES ABOVE THIS LINE - **v0.4.0 2021-10-30** diff --git a/pyproject.toml b/pyproject.toml index e2d53ec..a0079b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "poetry.masonry.api" [tool.poetry] name = "piso" -version = "0.4.0" +version = "0.5.0" description = "Pandas Interval Set Operations: methods for set operations for pandas' Interval, IntervalArray and IntervalIndex" readme = "README.md" authors = ["Riley Clement "]