Bugfix: NoOverlap2d constraint fails to recognize duplicates (#256)

kklein · web-flow · commit 21eef1bc5be9 · 2025-01-20T14:24:36.000+01:00
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -7,6 +7,17 @@
 Changelog
 =========
 
+1.9.3 - 2025.01.13
+------------------
+
+**Bug fixes**
+
+- Fix a bug in :meth:`datajudge.WithinRequirement.add_date_no_overlap_constraint`
+  and :meth:`datajudge.WithinRequirement.add_date_no_overlap2d_constraint` and
+  :meth:`datajudge.WithinRequirement.add_numeric_no_overlap_constraint` in which
+  some overlaps were not detected due to equality of their leftmost bounds.
+
+
 1.9.2 - 2024.09.05
 ------------------
 
diff --git a/docs/source/examples/example_dates.rst b/docs/source/examples/example_dates.rst
@@ -4,7 +4,7 @@ Example: Dates
 This example concerns itself with expressing ``Constraint``\s against data revolving
 around dates. While date ``Constraint``\s between tables exist, we will only illustrate
 ``Constraint``\s on a single table and reference values here. As a consequence, we will
-only use ``WithinRequirement``, as opposed to ``BetweenRequirement``.
+only use :class:`~datajudge.WithinRequirement`, as opposed to :class:`datajudge.BetweenRequirement`.
 
 Concretely, we will assume a table containing prices for a given product of id 1.
 Importantly, these prices are valid for a certain date range only. More precisely,
@@ -153,17 +153,15 @@ Assuming that such a table exists in database, we can write a specification agai
     # up these tests.
     test_constraint = collect_data_tests(requirements)
 
-Please note that the ``DateNoOverlap`` and ``DateNoGap`` constraints also exist
-in a slightly different form: ``DateNoOverlap2d`` and ``DateNoGap2d``.
-As the names suggest, these can operate in 'two date dimensions'.
+Please note that the ``DateNoOverlap`` Constraint also exists
+in a slightly different form: ``DateNoOverlap2d``.
+As the names suggest, it can operate in 'two date dimensions'. Said constraint
+be added with :meth:`datajudge.WithinRequirement.add_date_no_overlap_2d_constraint`.
 
-For example, let's assume a table with four date columns, representing two
+For instance, we could imagine a table with four date columns, representing two
 ranges in distinct dimensions, respectively:
 
 * ``date_from``: Date from when a price is valid
 * ``date_to``: Date until when a price is valid
 * ``date_definition_from``: Date when a price definition was inserted
 * ``date_definition_to``: Date until when a price definition was used
-
-Analogously to the unidimensional scenario illustrated here, one might care
-for certain constraints in two dimensions.
diff --git a/src/datajudge/__init__.py b/src/datajudge/__init__.py
@@ -2,13 +2,14 @@
 information."""
 
 from .constraints.base import Constraint
-from .db_access import Condition
+from .db_access import Condition, DataSource
 from .requirements import BetweenRequirement, Requirement, WithinRequirement
 
 __all__ = [
     "BetweenRequirement",
     "Condition",
     "Constraint",
+    "DataSource",
     "Requirement",
     "WithinRequirement",
 ]
diff --git a/src/datajudge/db_access.py b/src/datajudge/db_access.py
@@ -478,7 +478,30 @@ def get_interval_overlaps_nd(
     start_columns: list[str],
     end_columns: list[str],
     end_included: bool,
-):
+) -> tuple[sa.sql.selectable.CompoundSelect, sa.sql.selectable.Select]:
+    """Create selectables for interval overlaps in n dimensions.
+
+    We define the presence of 'overlap' as presence of a non-empty intersection
+    between two intervals.
+
+    Given that we care about a single dimension and have two intervals :math:`t1` and :math:`t2`,
+    we define an overlap follows:
+
+     .. math::
+        \\begin{align} \\text{overlap}(t_1, t_2) \\Leftrightarrow
+            &(min(t_1) \\leq min(t_2) \\land max(t_1) \\geq min(t_2)) \\\\
+            &\\lor \\\\
+            &(min(t_2) \\leq min(t_1) \\land max(t_2) \\geq min(t_1))
+        \\end{align}
+
+    We can drop the second clause of the above disjunction if we define :math:`t_1` to be the 'leftmost'
+    interval. We do so when building our query.
+
+    Note that the above equations are representative of ``end_included=True`` and the second clause
+    of the conjunction would use a strict inequality if ``end_included=False``.
+
+    We define an overlap in several dimensions as the conjunction of overlaps in every single dimension.
+    """
     if is_snowflake(engine):
         if key_columns:
             key_columns = lowercase_column_names(key_columns)
@@ -502,10 +525,20 @@ def get_interval_overlaps_nd(
     table_key_columns = get_table_columns(table1, key_columns) if key_columns else []
 
     end_operator = operator.ge if end_included else operator.gt
-    violation_condition = sa.and_(
+
+    # We have a violation in two scenarios:
+    # 1. At least two entries are exactly equal in key and interval columns
+    # 2. Two entries are not exactly equal in key and interval_columns and fuilfill violation_condition
+
+    # Scenario 1
+    duplicate_selection = duplicates(table1)
+
+    # scenario 2
+    naive_violation_condition = sa.and_(
         *[
             sa.and_(
-                table1.c[start_columns[dimension]] < table2.c[start_columns[dimension]],
+                table1.c[start_columns[dimension]]
+                <= table2.c[start_columns[dimension]],
                 end_operator(
                     table1.c[end_columns[dimension]], table2.c[start_columns[dimension]]
                 ),
@@ -514,8 +547,24 @@ def get_interval_overlaps_nd(
         ]
     )
 
-    join_condition = sa.and_(*key_conditions, violation_condition)
-    violation_selection = sa.select(
+    interval_inequality_condition = sa.or_(
+        *[
+            sa.or_(
+                table1.c[start_columns[dimension]]
+                != table2.c[start_columns[dimension]],
+                table2.c[end_columns[dimension]] != table2.c[end_columns[dimension]],
+            )
+            for dimension in range(dimensionality)
+        ]
+    )
+
+    distinct_violation_condition = sa.and_(
+        naive_violation_condition,
+        interval_inequality_condition,
+    )
+
+    distinct_join_condition = sa.and_(*key_conditions, distinct_violation_condition)
+    distinct_violation_selection = sa.select(
         *table_key_columns,
         *[
             table.c[start_column]
@@ -527,7 +576,7 @@ def get_interval_overlaps_nd(
             for table in [table1, table2]
             for end_column in end_columns
         ],
-    ).select_from(table1.join(table2, join_condition))
+    ).select_from(table1.join(table2, distinct_join_condition))
 
     # Note, Kevin, 21/12/09
     # The following approach would likely be preferable to the approach used
@@ -544,6 +593,27 @@ def get_interval_overlaps_nd(
     #     violation_subquery
     # )
 
+    # Merge scenarios 1 and 2.
+    # We need to 'impute' the missing columns for the duplicate selection in order for the union between
+    # both selections to work.
+    duplicate_selection = sa.select(
+        *(
+            # Already existing columns
+            [
+                duplicate_selection.c[column]
+                for column in distinct_violation_selection.columns.keys()
+                if column in duplicate_selection.columns.keys()
+            ]
+            # Fill all missing columns with NULLs.
+            + [
+                sa.null().label(column)
+                for column in distinct_violation_selection.columns.keys()
+                if column not in duplicate_selection.columns.keys()
+            ]
+        )
+    )
+    violation_selection = duplicate_selection.union(distinct_violation_selection)
+
     violation_subquery = violation_selection.subquery()
 
     keys = (
@@ -1102,12 +1172,11 @@ def get_row_mismatch(engine, ref, ref2, match_and_compare):
     return result_mismatch, result_n_rows, [selection_difference, selection_n_rows]
 
 
-def get_duplicate_sample(engine, ref):
-    initial_selection = ref.get_selection(engine).alias()
+def duplicates(subquery: sa.sql.selectable.Subquery) -> sa.sql.selectable.Select:
     aggregate_subquery = (
-        sa.select(initial_selection, sa.func.count().label("n_copies"))
-        .select_from(initial_selection)
-        .group_by(*initial_selection.columns)
+        sa.select(subquery, sa.func.count().label("n_copies"))
+        .select_from(subquery)
+        .group_by(*subquery.columns)
         .alias()
     )
     duplicate_selection = (
@@ -1121,6 +1190,12 @@ def get_duplicate_sample(engine, ref):
         .select_from(aggregate_subquery)
         .where(aggregate_subquery.c.n_copies > 1)
     )
+    return duplicate_selection
+
+
+def get_duplicate_sample(engine: sa.engine.Engine, ref: DataReference) -> tuple:
+    initial_selection = ref.get_selection(engine).alias()
+    duplicate_selection = duplicates(initial_selection)
     result = engine.connect().execute(duplicate_selection).first()
     return result, [duplicate_selection]
 
diff --git a/src/datajudge/requirements.py b/src/datajudge/requirements.py
@@ -789,11 +789,11 @@ def add_date_no_overlap_constraint(
     ):
         """Constraint expressing that several date range rows may not overlap.
 
-        The ``DataSource`` under inspection must consist of at least one but up
+        The :class:`~datajudge.DataSource` under inspection must consist of at least one but up
         to many ``key_columns``, identifying an entity, a ``start_column`` and an
         ``end_column``.
 
-        For a given row in this ``DataSource``, ``start_column`` and ``end_column`` indicate a
+        For a given row in this :class:`~datajudge.DataSource`, ``start_column`` and ``end_column`` indicate a
         date range. Neither of those columns should contain NULL values. Also, it
         should hold that for a given row, the value of ``end_column`` is strictly greater
         than the value of ``start_column``.
@@ -847,26 +847,23 @@ def add_date_no_overlap_2d_constraint(
         condition: Optional[Condition] = None,
         name: Optional[str] = None,
         cache_size=None,
-    ):
+    ) -> None:
         """Express that several date range rows do not overlap in two date dimensions.
 
         The table under inspection must consist of at least one but up to many key columns,
-        identifying an entity. Per date dimension, a ``start_column`` and an
-        ``end_column`` should be provided.
-
-        For a given row in this table, ``start_column1`` and ``end_column1``
-        indicate a date range. Moreoever, for that same row, ``start_column2``
-        and ``end_column2`` indicate a date range.
-        These date ranges are expected to represent different date 'dimensions'.
-        Example: A row indicates a forecasted value used in production. ``start_column1``
-        and ``end_column1`` represent the timespan that was forecasted, e.g. the
-        weather from next Saturday to next Sunday. ``end_column1`` and ``end_column2``
+        identifying an entity. Per date dimension, a start column (``start_column1``, ``start_column2``)
+        and end (``end_column1``, ``end_column2``) column should be provided in order to define
+        date ranges.
+
+        Date ranges in different date dimensions are expected to represent different kinds
+        of dates. For example, let's say that a row in a table indicates an averag temperature
+        forecast. ``start_column1`` and ``end_column1`` could the date span that was forecasted,
+        e.g. the weather from next Saturday to next Sunday. ``end_column1`` and ``end_column2``
         might indicate the timespan when this forceast was used, e.g. from the
         previous Monday to Wednesday.
 
-        Neither of those columns should contain ``NULL`` values. Also it should
-        hold that for a given row, the value of ``end_column`` is strictly greater
-        than the value of ``start_column``.
+        Neither of those columns should contain ``NULL`` values. Also, the value of ``end_column_k``
+        should be strictly greater than the value of ``start_column_k``.
 
         Note that the values of ``start_column1`` and ``start_column2`` are expected to be
         included in each date range. By default, the values of ``end_column1`` and
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
@@ -265,6 +265,19 @@ def date_table_overlap(engine, metadata):
             "date_end": datetime.datetime(2016, 1, 8),
         },
     ]
+    # Duplicate entries.
+    data += [
+        {
+            "id1": 6,
+            "date_start": datetime.datetime(2016, 1, 1),
+            "date_end": datetime.datetime(2016, 1, 10),
+        },
+        {
+            "id1": 6,
+            "date_start": datetime.datetime(2016, 1, 1),
+            "date_end": datetime.datetime(2016, 1, 10),
+        },
+    ]
     _handle_table(engine, metadata, table_name, columns, data)
     return TEST_DB_NAME, SCHEMA, table_name
 
@@ -382,6 +395,23 @@ def date_table_overlap_2d(engine, metadata):
             "date_end2": datetime.datetime(2017, 1, 7),
         },
     ]
+    # Duplication overlap.
+    data += [
+        {
+            "id1": 8,
+            "date_start1": datetime.datetime(2016, 1, 1),
+            "date_end1": datetime.datetime(2016, 1, 10),
+            "date_start2": datetime.datetime(2017, 1, 1),
+            "date_end2": datetime.datetime(2017, 1, 10),
+        },
+        {
+            "id1": 8,
+            "date_start1": datetime.datetime(2016, 1, 1),
+            "date_end1": datetime.datetime(2016, 1, 10),
+            "date_start2": datetime.datetime(2017, 1, 1),
+            "date_end2": datetime.datetime(2017, 1, 10),
+        },
+    ]
     _handle_table(engine, metadata, table_name, columns, data)
     return TEST_DB_NAME, SCHEMA, table_name
 
@@ -451,6 +481,19 @@ def integer_table_overlap(engine, metadata):
             "range_end": 8,
         },
     ]
+    # Duplicate entries.
+    data += [
+        {
+            "id1": 6,
+            "range_start": 1,
+            "range_end": 10,
+        },
+        {
+            "id1": 6,
+            "range_start": 1,
+            "range_end": 10,
+        },
+    ]
     _handle_table(engine, metadata, table_name, columns, data)
     return TEST_DB_NAME, SCHEMA, table_name
 
diff --git a/tests/integration/test_integration.py b/tests/integration/test_integration.py
@@ -2004,6 +2004,8 @@ def test_date_between_within(engine, date_table1, data):
         (identity, 1, Condition(raw_string="id1 = 4")),
         (negation, 0, Condition(raw_string="id1 = 5")),
         (identity, 1, Condition(raw_string="id1 = 5")),
+        (negation, 0, Condition(raw_string="id1 = 6")),
+        (identity, 1, Condition(raw_string="id1 = 6")),
     ],
 )
 @pytest.mark.parametrize("key_columns", [["id1"], [], None])
@@ -2034,6 +2036,8 @@ def test_date_no_overlap_within_varying_key_columns(
         (identity, 1, Condition(raw_string="id1 = 4")),
         (negation, 0, Condition(raw_string="id1 = 5")),
         (identity, 1, Condition(raw_string="id1 = 5")),
+        (negation, 0, Condition(raw_string="id1 = 6")),
+        (identity, 1, Condition(raw_string="id1 = 6")),
     ],
 )
 @pytest.mark.parametrize("key_columns", [["id1"], [], None])
@@ -2056,8 +2060,9 @@ def test_integer_no_overlap_within_varying_key_columns(
 @pytest.mark.parametrize(
     "data",
     [
-        (negation, 0.59, None),
-        (identity, 0.6, None),
+        # 2/6 ids succeed
+        (negation, 0.66, None),
+        (identity, 0.67, None),
         (identity, 0, Condition(raw_string="id1 IN (1, 2)")),
     ],
 )
@@ -2130,6 +2135,8 @@ def test_date_no_overlap_within_inclusion_exclusion(engine, date_table_overlap,
         (identity, 1, Condition(raw_string="id1 = 6")),
         (negation, 0, Condition(raw_string="id1 = 7")),
         (identity, 1, Condition(raw_string="id1 = 7")),
+        (negation, 0, Condition(raw_string="id1 = 8")),
+        (identity, 1, Condition(raw_string="id1 = 8")),
     ],
 )
 @pytest.mark.parametrize("key_columns", [["id1"], [], None])
@@ -2155,9 +2162,9 @@ def test_date_no_overlap_2d_within_varying_key_column(
     "data",
     [
         (identity, 0, Condition(raw_string="id1 IN (1, 2, 3, 4)")),
-        # 3/7 ids have violations.
-        (negation, 0.42, None),
-        (identity, 0.43, None),
+        # 4/8 ids have violations.
+        (negation, 0.49, None),
+        (identity, 0.50, None),
     ],
 )
 def test_date_no_overlap_2d_within_fixed_key_column(