Skip to content

Commit 21eef1b

Browse files
authored
Bugfix: NoOverlap2d constraint fails to recognize duplicates (#256)
1 parent efb059f commit 21eef1b

File tree

7 files changed

+173
-41
lines changed

7 files changed

+173
-41
lines changed

CHANGELOG.rst

+11
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,17 @@
77
Changelog
88
=========
99

10+
1.9.3 - 2025.01.13
11+
------------------
12+
13+
**Bug fixes**
14+
15+
- Fix a bug in :meth:`datajudge.WithinRequirement.add_date_no_overlap_constraint`
16+
and :meth:`datajudge.WithinRequirement.add_date_no_overlap2d_constraint` and
17+
:meth:`datajudge.WithinRequirement.add_numeric_no_overlap_constraint` in which
18+
some overlaps were not detected due to equality of their leftmost bounds.
19+
20+
1021
1.9.2 - 2024.09.05
1122
------------------
1223

docs/source/examples/example_dates.rst

+6-8
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ Example: Dates
44
This example concerns itself with expressing ``Constraint``\s against data revolving
55
around dates. While date ``Constraint``\s between tables exist, we will only illustrate
66
``Constraint``\s on a single table and reference values here. As a consequence, we will
7-
only use ``WithinRequirement``, as opposed to ``BetweenRequirement``.
7+
only use :class:`~datajudge.WithinRequirement`, as opposed to :class:`datajudge.BetweenRequirement`.
88

99
Concretely, we will assume a table containing prices for a given product of id 1.
1010
Importantly, these prices are valid for a certain date range only. More precisely,
@@ -153,17 +153,15 @@ Assuming that such a table exists in database, we can write a specification agai
153153
# up these tests.
154154
test_constraint = collect_data_tests(requirements)
155155
156-
Please note that the ``DateNoOverlap`` and ``DateNoGap`` constraints also exist
157-
in a slightly different form: ``DateNoOverlap2d`` and ``DateNoGap2d``.
158-
As the names suggest, these can operate in 'two date dimensions'.
156+
Please note that the ``DateNoOverlap`` Constraint also exists
157+
in a slightly different form: ``DateNoOverlap2d``.
158+
As the names suggest, it can operate in 'two date dimensions'. Said constraint
159+
be added with :meth:`datajudge.WithinRequirement.add_date_no_overlap_2d_constraint`.
159160

160-
For example, let's assume a table with four date columns, representing two
161+
For instance, we could imagine a table with four date columns, representing two
161162
ranges in distinct dimensions, respectively:
162163

163164
* ``date_from``: Date from when a price is valid
164165
* ``date_to``: Date until when a price is valid
165166
* ``date_definition_from``: Date when a price definition was inserted
166167
* ``date_definition_to``: Date until when a price definition was used
167-
168-
Analogously to the unidimensional scenario illustrated here, one might care
169-
for certain constraints in two dimensions.

src/datajudge/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,14 @@
22
information."""
33

44
from .constraints.base import Constraint
5-
from .db_access import Condition
5+
from .db_access import Condition, DataSource
66
from .requirements import BetweenRequirement, Requirement, WithinRequirement
77

88
__all__ = [
99
"BetweenRequirement",
1010
"Condition",
1111
"Constraint",
12+
"DataSource",
1213
"Requirement",
1314
"WithinRequirement",
1415
]

src/datajudge/db_access.py

+86-11
Original file line numberDiff line numberDiff line change
@@ -478,7 +478,30 @@ def get_interval_overlaps_nd(
478478
start_columns: list[str],
479479
end_columns: list[str],
480480
end_included: bool,
481-
):
481+
) -> tuple[sa.sql.selectable.CompoundSelect, sa.sql.selectable.Select]:
482+
"""Create selectables for interval overlaps in n dimensions.
483+
484+
We define the presence of 'overlap' as presence of a non-empty intersection
485+
between two intervals.
486+
487+
Given that we care about a single dimension and have two intervals :math:`t1` and :math:`t2`,
488+
we define an overlap follows:
489+
490+
.. math::
491+
\\begin{align} \\text{overlap}(t_1, t_2) \\Leftrightarrow
492+
&(min(t_1) \\leq min(t_2) \\land max(t_1) \\geq min(t_2)) \\\\
493+
&\\lor \\\\
494+
&(min(t_2) \\leq min(t_1) \\land max(t_2) \\geq min(t_1))
495+
\\end{align}
496+
497+
We can drop the second clause of the above disjunction if we define :math:`t_1` to be the 'leftmost'
498+
interval. We do so when building our query.
499+
500+
Note that the above equations are representative of ``end_included=True`` and the second clause
501+
of the conjunction would use a strict inequality if ``end_included=False``.
502+
503+
We define an overlap in several dimensions as the conjunction of overlaps in every single dimension.
504+
"""
482505
if is_snowflake(engine):
483506
if key_columns:
484507
key_columns = lowercase_column_names(key_columns)
@@ -502,10 +525,20 @@ def get_interval_overlaps_nd(
502525
table_key_columns = get_table_columns(table1, key_columns) if key_columns else []
503526

504527
end_operator = operator.ge if end_included else operator.gt
505-
violation_condition = sa.and_(
528+
529+
# We have a violation in two scenarios:
530+
# 1. At least two entries are exactly equal in key and interval columns
531+
# 2. Two entries are not exactly equal in key and interval_columns and fuilfill violation_condition
532+
533+
# Scenario 1
534+
duplicate_selection = duplicates(table1)
535+
536+
# scenario 2
537+
naive_violation_condition = sa.and_(
506538
*[
507539
sa.and_(
508-
table1.c[start_columns[dimension]] < table2.c[start_columns[dimension]],
540+
table1.c[start_columns[dimension]]
541+
<= table2.c[start_columns[dimension]],
509542
end_operator(
510543
table1.c[end_columns[dimension]], table2.c[start_columns[dimension]]
511544
),
@@ -514,8 +547,24 @@ def get_interval_overlaps_nd(
514547
]
515548
)
516549

517-
join_condition = sa.and_(*key_conditions, violation_condition)
518-
violation_selection = sa.select(
550+
interval_inequality_condition = sa.or_(
551+
*[
552+
sa.or_(
553+
table1.c[start_columns[dimension]]
554+
!= table2.c[start_columns[dimension]],
555+
table2.c[end_columns[dimension]] != table2.c[end_columns[dimension]],
556+
)
557+
for dimension in range(dimensionality)
558+
]
559+
)
560+
561+
distinct_violation_condition = sa.and_(
562+
naive_violation_condition,
563+
interval_inequality_condition,
564+
)
565+
566+
distinct_join_condition = sa.and_(*key_conditions, distinct_violation_condition)
567+
distinct_violation_selection = sa.select(
519568
*table_key_columns,
520569
*[
521570
table.c[start_column]
@@ -527,7 +576,7 @@ def get_interval_overlaps_nd(
527576
for table in [table1, table2]
528577
for end_column in end_columns
529578
],
530-
).select_from(table1.join(table2, join_condition))
579+
).select_from(table1.join(table2, distinct_join_condition))
531580

532581
# Note, Kevin, 21/12/09
533582
# The following approach would likely be preferable to the approach used
@@ -544,6 +593,27 @@ def get_interval_overlaps_nd(
544593
# violation_subquery
545594
# )
546595

596+
# Merge scenarios 1 and 2.
597+
# We need to 'impute' the missing columns for the duplicate selection in order for the union between
598+
# both selections to work.
599+
duplicate_selection = sa.select(
600+
*(
601+
# Already existing columns
602+
[
603+
duplicate_selection.c[column]
604+
for column in distinct_violation_selection.columns.keys()
605+
if column in duplicate_selection.columns.keys()
606+
]
607+
# Fill all missing columns with NULLs.
608+
+ [
609+
sa.null().label(column)
610+
for column in distinct_violation_selection.columns.keys()
611+
if column not in duplicate_selection.columns.keys()
612+
]
613+
)
614+
)
615+
violation_selection = duplicate_selection.union(distinct_violation_selection)
616+
547617
violation_subquery = violation_selection.subquery()
548618

549619
keys = (
@@ -1102,12 +1172,11 @@ def get_row_mismatch(engine, ref, ref2, match_and_compare):
11021172
return result_mismatch, result_n_rows, [selection_difference, selection_n_rows]
11031173

11041174

1105-
def get_duplicate_sample(engine, ref):
1106-
initial_selection = ref.get_selection(engine).alias()
1175+
def duplicates(subquery: sa.sql.selectable.Subquery) -> sa.sql.selectable.Select:
11071176
aggregate_subquery = (
1108-
sa.select(initial_selection, sa.func.count().label("n_copies"))
1109-
.select_from(initial_selection)
1110-
.group_by(*initial_selection.columns)
1177+
sa.select(subquery, sa.func.count().label("n_copies"))
1178+
.select_from(subquery)
1179+
.group_by(*subquery.columns)
11111180
.alias()
11121181
)
11131182
duplicate_selection = (
@@ -1121,6 +1190,12 @@ def get_duplicate_sample(engine, ref):
11211190
.select_from(aggregate_subquery)
11221191
.where(aggregate_subquery.c.n_copies > 1)
11231192
)
1193+
return duplicate_selection
1194+
1195+
1196+
def get_duplicate_sample(engine: sa.engine.Engine, ref: DataReference) -> tuple:
1197+
initial_selection = ref.get_selection(engine).alias()
1198+
duplicate_selection = duplicates(initial_selection)
11241199
result = engine.connect().execute(duplicate_selection).first()
11251200
return result, [duplicate_selection]
11261201

src/datajudge/requirements.py

+13-16
Original file line numberDiff line numberDiff line change
@@ -789,11 +789,11 @@ def add_date_no_overlap_constraint(
789789
):
790790
"""Constraint expressing that several date range rows may not overlap.
791791
792-
The ``DataSource`` under inspection must consist of at least one but up
792+
The :class:`~datajudge.DataSource` under inspection must consist of at least one but up
793793
to many ``key_columns``, identifying an entity, a ``start_column`` and an
794794
``end_column``.
795795
796-
For a given row in this ``DataSource``, ``start_column`` and ``end_column`` indicate a
796+
For a given row in this :class:`~datajudge.DataSource`, ``start_column`` and ``end_column`` indicate a
797797
date range. Neither of those columns should contain NULL values. Also, it
798798
should hold that for a given row, the value of ``end_column`` is strictly greater
799799
than the value of ``start_column``.
@@ -847,26 +847,23 @@ def add_date_no_overlap_2d_constraint(
847847
condition: Optional[Condition] = None,
848848
name: Optional[str] = None,
849849
cache_size=None,
850-
):
850+
) -> None:
851851
"""Express that several date range rows do not overlap in two date dimensions.
852852
853853
The table under inspection must consist of at least one but up to many key columns,
854-
identifying an entity. Per date dimension, a ``start_column`` and an
855-
``end_column`` should be provided.
856-
857-
For a given row in this table, ``start_column1`` and ``end_column1``
858-
indicate a date range. Moreoever, for that same row, ``start_column2``
859-
and ``end_column2`` indicate a date range.
860-
These date ranges are expected to represent different date 'dimensions'.
861-
Example: A row indicates a forecasted value used in production. ``start_column1``
862-
and ``end_column1`` represent the timespan that was forecasted, e.g. the
863-
weather from next Saturday to next Sunday. ``end_column1`` and ``end_column2``
854+
identifying an entity. Per date dimension, a start column (``start_column1``, ``start_column2``)
855+
and end (``end_column1``, ``end_column2``) column should be provided in order to define
856+
date ranges.
857+
858+
Date ranges in different date dimensions are expected to represent different kinds
859+
of dates. For example, let's say that a row in a table indicates an averag temperature
860+
forecast. ``start_column1`` and ``end_column1`` could the date span that was forecasted,
861+
e.g. the weather from next Saturday to next Sunday. ``end_column1`` and ``end_column2``
864862
might indicate the timespan when this forceast was used, e.g. from the
865863
previous Monday to Wednesday.
866864
867-
Neither of those columns should contain ``NULL`` values. Also it should
868-
hold that for a given row, the value of ``end_column`` is strictly greater
869-
than the value of ``start_column``.
865+
Neither of those columns should contain ``NULL`` values. Also, the value of ``end_column_k``
866+
should be strictly greater than the value of ``start_column_k``.
870867
871868
Note that the values of ``start_column1`` and ``start_column2`` are expected to be
872869
included in each date range. By default, the values of ``end_column1`` and

tests/integration/conftest.py

+43
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,19 @@ def date_table_overlap(engine, metadata):
265265
"date_end": datetime.datetime(2016, 1, 8),
266266
},
267267
]
268+
# Duplicate entries.
269+
data += [
270+
{
271+
"id1": 6,
272+
"date_start": datetime.datetime(2016, 1, 1),
273+
"date_end": datetime.datetime(2016, 1, 10),
274+
},
275+
{
276+
"id1": 6,
277+
"date_start": datetime.datetime(2016, 1, 1),
278+
"date_end": datetime.datetime(2016, 1, 10),
279+
},
280+
]
268281
_handle_table(engine, metadata, table_name, columns, data)
269282
return TEST_DB_NAME, SCHEMA, table_name
270283

@@ -382,6 +395,23 @@ def date_table_overlap_2d(engine, metadata):
382395
"date_end2": datetime.datetime(2017, 1, 7),
383396
},
384397
]
398+
# Duplication overlap.
399+
data += [
400+
{
401+
"id1": 8,
402+
"date_start1": datetime.datetime(2016, 1, 1),
403+
"date_end1": datetime.datetime(2016, 1, 10),
404+
"date_start2": datetime.datetime(2017, 1, 1),
405+
"date_end2": datetime.datetime(2017, 1, 10),
406+
},
407+
{
408+
"id1": 8,
409+
"date_start1": datetime.datetime(2016, 1, 1),
410+
"date_end1": datetime.datetime(2016, 1, 10),
411+
"date_start2": datetime.datetime(2017, 1, 1),
412+
"date_end2": datetime.datetime(2017, 1, 10),
413+
},
414+
]
385415
_handle_table(engine, metadata, table_name, columns, data)
386416
return TEST_DB_NAME, SCHEMA, table_name
387417

@@ -451,6 +481,19 @@ def integer_table_overlap(engine, metadata):
451481
"range_end": 8,
452482
},
453483
]
484+
# Duplicate entries.
485+
data += [
486+
{
487+
"id1": 6,
488+
"range_start": 1,
489+
"range_end": 10,
490+
},
491+
{
492+
"id1": 6,
493+
"range_start": 1,
494+
"range_end": 10,
495+
},
496+
]
454497
_handle_table(engine, metadata, table_name, columns, data)
455498
return TEST_DB_NAME, SCHEMA, table_name
456499

tests/integration/test_integration.py

+12-5
Original file line numberDiff line numberDiff line change
@@ -2004,6 +2004,8 @@ def test_date_between_within(engine, date_table1, data):
20042004
(identity, 1, Condition(raw_string="id1 = 4")),
20052005
(negation, 0, Condition(raw_string="id1 = 5")),
20062006
(identity, 1, Condition(raw_string="id1 = 5")),
2007+
(negation, 0, Condition(raw_string="id1 = 6")),
2008+
(identity, 1, Condition(raw_string="id1 = 6")),
20072009
],
20082010
)
20092011
@pytest.mark.parametrize("key_columns", [["id1"], [], None])
@@ -2034,6 +2036,8 @@ def test_date_no_overlap_within_varying_key_columns(
20342036
(identity, 1, Condition(raw_string="id1 = 4")),
20352037
(negation, 0, Condition(raw_string="id1 = 5")),
20362038
(identity, 1, Condition(raw_string="id1 = 5")),
2039+
(negation, 0, Condition(raw_string="id1 = 6")),
2040+
(identity, 1, Condition(raw_string="id1 = 6")),
20372041
],
20382042
)
20392043
@pytest.mark.parametrize("key_columns", [["id1"], [], None])
@@ -2056,8 +2060,9 @@ def test_integer_no_overlap_within_varying_key_columns(
20562060
@pytest.mark.parametrize(
20572061
"data",
20582062
[
2059-
(negation, 0.59, None),
2060-
(identity, 0.6, None),
2063+
# 2/6 ids succeed
2064+
(negation, 0.66, None),
2065+
(identity, 0.67, None),
20612066
(identity, 0, Condition(raw_string="id1 IN (1, 2)")),
20622067
],
20632068
)
@@ -2130,6 +2135,8 @@ def test_date_no_overlap_within_inclusion_exclusion(engine, date_table_overlap,
21302135
(identity, 1, Condition(raw_string="id1 = 6")),
21312136
(negation, 0, Condition(raw_string="id1 = 7")),
21322137
(identity, 1, Condition(raw_string="id1 = 7")),
2138+
(negation, 0, Condition(raw_string="id1 = 8")),
2139+
(identity, 1, Condition(raw_string="id1 = 8")),
21332140
],
21342141
)
21352142
@pytest.mark.parametrize("key_columns", [["id1"], [], None])
@@ -2155,9 +2162,9 @@ def test_date_no_overlap_2d_within_varying_key_column(
21552162
"data",
21562163
[
21572164
(identity, 0, Condition(raw_string="id1 IN (1, 2, 3, 4)")),
2158-
# 3/7 ids have violations.
2159-
(negation, 0.42, None),
2160-
(identity, 0.43, None),
2165+
# 4/8 ids have violations.
2166+
(negation, 0.49, None),
2167+
(identity, 0.50, None),
21612168
],
21622169
)
21632170
def test_date_no_overlap_2d_within_fixed_key_column(

0 commit comments

Comments
 (0)