diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 9823b010..a660ade6 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,17 @@ Changelog ========= +1.9.3 - 2025.01.13 +------------------ + +**Bug fixes** + +- Fix a bug in :meth:`datajudge.WithinRequirement.add_date_no_overlap_constraint` + and :meth:`datajudge.WithinRequirement.add_date_no_overlap2d_constraint` and + :meth:`datajudge.WithinRequirement.add_numeric_no_overlap_constraint` in which + some overlaps were not detected due to equality of their leftmost bounds. + + 1.9.2 - 2024.09.05 ------------------ diff --git a/docs/source/examples/example_dates.rst b/docs/source/examples/example_dates.rst index 702e5791..4e13b4cb 100644 --- a/docs/source/examples/example_dates.rst +++ b/docs/source/examples/example_dates.rst @@ -4,7 +4,7 @@ Example: Dates This example concerns itself with expressing ``Constraint``\s against data revolving around dates. While date ``Constraint``\s between tables exist, we will only illustrate ``Constraint``\s on a single table and reference values here. As a consequence, we will -only use ``WithinRequirement``, as opposed to ``BetweenRequirement``. +only use :class:`~datajudge.WithinRequirement`, as opposed to :class:`datajudge.BetweenRequirement`. Concretely, we will assume a table containing prices for a given product of id 1. Importantly, these prices are valid for a certain date range only. More precisely, @@ -153,17 +153,15 @@ Assuming that such a table exists in database, we can write a specification agai # up these tests. test_constraint = collect_data_tests(requirements) -Please note that the ``DateNoOverlap`` and ``DateNoGap`` constraints also exist -in a slightly different form: ``DateNoOverlap2d`` and ``DateNoGap2d``. -As the names suggest, these can operate in 'two date dimensions'. +Please note that the ``DateNoOverlap`` Constraint also exists +in a slightly different form: ``DateNoOverlap2d``. +As the names suggest, it can operate in 'two date dimensions'. Said constraint +be added with :meth:`datajudge.WithinRequirement.add_date_no_overlap_2d_constraint`. -For example, let's assume a table with four date columns, representing two +For instance, we could imagine a table with four date columns, representing two ranges in distinct dimensions, respectively: * ``date_from``: Date from when a price is valid * ``date_to``: Date until when a price is valid * ``date_definition_from``: Date when a price definition was inserted * ``date_definition_to``: Date until when a price definition was used - -Analogously to the unidimensional scenario illustrated here, one might care -for certain constraints in two dimensions. diff --git a/src/datajudge/__init__.py b/src/datajudge/__init__.py index 95511241..08d6db66 100644 --- a/src/datajudge/__init__.py +++ b/src/datajudge/__init__.py @@ -2,13 +2,14 @@ information.""" from .constraints.base import Constraint -from .db_access import Condition +from .db_access import Condition, DataSource from .requirements import BetweenRequirement, Requirement, WithinRequirement __all__ = [ "BetweenRequirement", "Condition", "Constraint", + "DataSource", "Requirement", "WithinRequirement", ] diff --git a/src/datajudge/db_access.py b/src/datajudge/db_access.py index a18dff1f..0fdd676f 100644 --- a/src/datajudge/db_access.py +++ b/src/datajudge/db_access.py @@ -478,7 +478,30 @@ def get_interval_overlaps_nd( start_columns: list[str], end_columns: list[str], end_included: bool, -): +) -> tuple[sa.sql.selectable.CompoundSelect, sa.sql.selectable.Select]: + """Create selectables for interval overlaps in n dimensions. + + We define the presence of 'overlap' as presence of a non-empty intersection + between two intervals. + + Given that we care about a single dimension and have two intervals :math:`t1` and :math:`t2`, + we define an overlap follows: + + .. math:: + \\begin{align} \\text{overlap}(t_1, t_2) \\Leftrightarrow + &(min(t_1) \\leq min(t_2) \\land max(t_1) \\geq min(t_2)) \\\\ + &\\lor \\\\ + &(min(t_2) \\leq min(t_1) \\land max(t_2) \\geq min(t_1)) + \\end{align} + + We can drop the second clause of the above disjunction if we define :math:`t_1` to be the 'leftmost' + interval. We do so when building our query. + + Note that the above equations are representative of ``end_included=True`` and the second clause + of the conjunction would use a strict inequality if ``end_included=False``. + + We define an overlap in several dimensions as the conjunction of overlaps in every single dimension. + """ if is_snowflake(engine): if key_columns: key_columns = lowercase_column_names(key_columns) @@ -502,10 +525,20 @@ def get_interval_overlaps_nd( table_key_columns = get_table_columns(table1, key_columns) if key_columns else [] end_operator = operator.ge if end_included else operator.gt - violation_condition = sa.and_( + + # We have a violation in two scenarios: + # 1. At least two entries are exactly equal in key and interval columns + # 2. Two entries are not exactly equal in key and interval_columns and fuilfill violation_condition + + # Scenario 1 + duplicate_selection = duplicates(table1) + + # scenario 2 + naive_violation_condition = sa.and_( *[ sa.and_( - table1.c[start_columns[dimension]] < table2.c[start_columns[dimension]], + table1.c[start_columns[dimension]] + <= table2.c[start_columns[dimension]], end_operator( table1.c[end_columns[dimension]], table2.c[start_columns[dimension]] ), @@ -514,8 +547,24 @@ def get_interval_overlaps_nd( ] ) - join_condition = sa.and_(*key_conditions, violation_condition) - violation_selection = sa.select( + interval_inequality_condition = sa.or_( + *[ + sa.or_( + table1.c[start_columns[dimension]] + != table2.c[start_columns[dimension]], + table2.c[end_columns[dimension]] != table2.c[end_columns[dimension]], + ) + for dimension in range(dimensionality) + ] + ) + + distinct_violation_condition = sa.and_( + naive_violation_condition, + interval_inequality_condition, + ) + + distinct_join_condition = sa.and_(*key_conditions, distinct_violation_condition) + distinct_violation_selection = sa.select( *table_key_columns, *[ table.c[start_column] @@ -527,7 +576,7 @@ def get_interval_overlaps_nd( for table in [table1, table2] for end_column in end_columns ], - ).select_from(table1.join(table2, join_condition)) + ).select_from(table1.join(table2, distinct_join_condition)) # Note, Kevin, 21/12/09 # The following approach would likely be preferable to the approach used @@ -544,6 +593,27 @@ def get_interval_overlaps_nd( # violation_subquery # ) + # Merge scenarios 1 and 2. + # We need to 'impute' the missing columns for the duplicate selection in order for the union between + # both selections to work. + duplicate_selection = sa.select( + *( + # Already existing columns + [ + duplicate_selection.c[column] + for column in distinct_violation_selection.columns.keys() + if column in duplicate_selection.columns.keys() + ] + # Fill all missing columns with NULLs. + + [ + sa.null().label(column) + for column in distinct_violation_selection.columns.keys() + if column not in duplicate_selection.columns.keys() + ] + ) + ) + violation_selection = duplicate_selection.union(distinct_violation_selection) + violation_subquery = violation_selection.subquery() keys = ( @@ -1102,12 +1172,11 @@ def get_row_mismatch(engine, ref, ref2, match_and_compare): return result_mismatch, result_n_rows, [selection_difference, selection_n_rows] -def get_duplicate_sample(engine, ref): - initial_selection = ref.get_selection(engine).alias() +def duplicates(subquery: sa.sql.selectable.Subquery) -> sa.sql.selectable.Select: aggregate_subquery = ( - sa.select(initial_selection, sa.func.count().label("n_copies")) - .select_from(initial_selection) - .group_by(*initial_selection.columns) + sa.select(subquery, sa.func.count().label("n_copies")) + .select_from(subquery) + .group_by(*subquery.columns) .alias() ) duplicate_selection = ( @@ -1121,6 +1190,12 @@ def get_duplicate_sample(engine, ref): .select_from(aggregate_subquery) .where(aggregate_subquery.c.n_copies > 1) ) + return duplicate_selection + + +def get_duplicate_sample(engine: sa.engine.Engine, ref: DataReference) -> tuple: + initial_selection = ref.get_selection(engine).alias() + duplicate_selection = duplicates(initial_selection) result = engine.connect().execute(duplicate_selection).first() return result, [duplicate_selection] diff --git a/src/datajudge/requirements.py b/src/datajudge/requirements.py index 1776c5c7..3343218f 100644 --- a/src/datajudge/requirements.py +++ b/src/datajudge/requirements.py @@ -789,11 +789,11 @@ def add_date_no_overlap_constraint( ): """Constraint expressing that several date range rows may not overlap. - The ``DataSource`` under inspection must consist of at least one but up + The :class:`~datajudge.DataSource` under inspection must consist of at least one but up to many ``key_columns``, identifying an entity, a ``start_column`` and an ``end_column``. - For a given row in this ``DataSource``, ``start_column`` and ``end_column`` indicate a + For a given row in this :class:`~datajudge.DataSource`, ``start_column`` and ``end_column`` indicate a date range. Neither of those columns should contain NULL values. Also, it should hold that for a given row, the value of ``end_column`` is strictly greater than the value of ``start_column``. @@ -847,26 +847,23 @@ def add_date_no_overlap_2d_constraint( condition: Optional[Condition] = None, name: Optional[str] = None, cache_size=None, - ): + ) -> None: """Express that several date range rows do not overlap in two date dimensions. The table under inspection must consist of at least one but up to many key columns, - identifying an entity. Per date dimension, a ``start_column`` and an - ``end_column`` should be provided. - - For a given row in this table, ``start_column1`` and ``end_column1`` - indicate a date range. Moreoever, for that same row, ``start_column2`` - and ``end_column2`` indicate a date range. - These date ranges are expected to represent different date 'dimensions'. - Example: A row indicates a forecasted value used in production. ``start_column1`` - and ``end_column1`` represent the timespan that was forecasted, e.g. the - weather from next Saturday to next Sunday. ``end_column1`` and ``end_column2`` + identifying an entity. Per date dimension, a start column (``start_column1``, ``start_column2``) + and end (``end_column1``, ``end_column2``) column should be provided in order to define + date ranges. + + Date ranges in different date dimensions are expected to represent different kinds + of dates. For example, let's say that a row in a table indicates an averag temperature + forecast. ``start_column1`` and ``end_column1`` could the date span that was forecasted, + e.g. the weather from next Saturday to next Sunday. ``end_column1`` and ``end_column2`` might indicate the timespan when this forceast was used, e.g. from the previous Monday to Wednesday. - Neither of those columns should contain ``NULL`` values. Also it should - hold that for a given row, the value of ``end_column`` is strictly greater - than the value of ``start_column``. + Neither of those columns should contain ``NULL`` values. Also, the value of ``end_column_k`` + should be strictly greater than the value of ``start_column_k``. Note that the values of ``start_column1`` and ``start_column2`` are expected to be included in each date range. By default, the values of ``end_column1`` and diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 4065a3aa..7cc9168b 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -265,6 +265,19 @@ def date_table_overlap(engine, metadata): "date_end": datetime.datetime(2016, 1, 8), }, ] + # Duplicate entries. + data += [ + { + "id1": 6, + "date_start": datetime.datetime(2016, 1, 1), + "date_end": datetime.datetime(2016, 1, 10), + }, + { + "id1": 6, + "date_start": datetime.datetime(2016, 1, 1), + "date_end": datetime.datetime(2016, 1, 10), + }, + ] _handle_table(engine, metadata, table_name, columns, data) return TEST_DB_NAME, SCHEMA, table_name @@ -382,6 +395,23 @@ def date_table_overlap_2d(engine, metadata): "date_end2": datetime.datetime(2017, 1, 7), }, ] + # Duplication overlap. + data += [ + { + "id1": 8, + "date_start1": datetime.datetime(2016, 1, 1), + "date_end1": datetime.datetime(2016, 1, 10), + "date_start2": datetime.datetime(2017, 1, 1), + "date_end2": datetime.datetime(2017, 1, 10), + }, + { + "id1": 8, + "date_start1": datetime.datetime(2016, 1, 1), + "date_end1": datetime.datetime(2016, 1, 10), + "date_start2": datetime.datetime(2017, 1, 1), + "date_end2": datetime.datetime(2017, 1, 10), + }, + ] _handle_table(engine, metadata, table_name, columns, data) return TEST_DB_NAME, SCHEMA, table_name @@ -451,6 +481,19 @@ def integer_table_overlap(engine, metadata): "range_end": 8, }, ] + # Duplicate entries. + data += [ + { + "id1": 6, + "range_start": 1, + "range_end": 10, + }, + { + "id1": 6, + "range_start": 1, + "range_end": 10, + }, + ] _handle_table(engine, metadata, table_name, columns, data) return TEST_DB_NAME, SCHEMA, table_name diff --git a/tests/integration/test_integration.py b/tests/integration/test_integration.py index 95f69e62..3dc7fe60 100644 --- a/tests/integration/test_integration.py +++ b/tests/integration/test_integration.py @@ -2004,6 +2004,8 @@ def test_date_between_within(engine, date_table1, data): (identity, 1, Condition(raw_string="id1 = 4")), (negation, 0, Condition(raw_string="id1 = 5")), (identity, 1, Condition(raw_string="id1 = 5")), + (negation, 0, Condition(raw_string="id1 = 6")), + (identity, 1, Condition(raw_string="id1 = 6")), ], ) @pytest.mark.parametrize("key_columns", [["id1"], [], None]) @@ -2034,6 +2036,8 @@ def test_date_no_overlap_within_varying_key_columns( (identity, 1, Condition(raw_string="id1 = 4")), (negation, 0, Condition(raw_string="id1 = 5")), (identity, 1, Condition(raw_string="id1 = 5")), + (negation, 0, Condition(raw_string="id1 = 6")), + (identity, 1, Condition(raw_string="id1 = 6")), ], ) @pytest.mark.parametrize("key_columns", [["id1"], [], None]) @@ -2056,8 +2060,9 @@ def test_integer_no_overlap_within_varying_key_columns( @pytest.mark.parametrize( "data", [ - (negation, 0.59, None), - (identity, 0.6, None), + # 2/6 ids succeed + (negation, 0.66, None), + (identity, 0.67, None), (identity, 0, Condition(raw_string="id1 IN (1, 2)")), ], ) @@ -2130,6 +2135,8 @@ def test_date_no_overlap_within_inclusion_exclusion(engine, date_table_overlap, (identity, 1, Condition(raw_string="id1 = 6")), (negation, 0, Condition(raw_string="id1 = 7")), (identity, 1, Condition(raw_string="id1 = 7")), + (negation, 0, Condition(raw_string="id1 = 8")), + (identity, 1, Condition(raw_string="id1 = 8")), ], ) @pytest.mark.parametrize("key_columns", [["id1"], [], None]) @@ -2155,9 +2162,9 @@ def test_date_no_overlap_2d_within_varying_key_column( "data", [ (identity, 0, Condition(raw_string="id1 IN (1, 2, 3, 4)")), - # 3/7 ids have violations. - (negation, 0.42, None), - (identity, 0.43, None), + # 4/8 ids have violations. + (negation, 0.49, None), + (identity, 0.50, None), ], ) def test_date_no_overlap_2d_within_fixed_key_column(