Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
643dd20
Initial geospatal data quality checks
tdikland Sep 26, 2025
b9b7b3d
expanded geospatial data checks
tdikland Sep 27, 2025
5e313c6
Merge branch 'main' into feat/geo
tdikland Sep 27, 2025
36f8f77
Merge branch 'main' into feat/geo
mwojtyczka Sep 30, 2025
1554d06
Update src/databricks/labs/dqx/geo/check_funcs.py
mwojtyczka Sep 30, 2025
f13d798
Update src/databricks/labs/dqx/geo/check_funcs.py
mwojtyczka Sep 30, 2025
38d2bfc
Update src/databricks/labs/dqx/geo/check_funcs.py
mwojtyczka Sep 30, 2025
d3b50f2
Update src/databricks/labs/dqx/geo/check_funcs.py
mwojtyczka Sep 30, 2025
d678566
Update src/databricks/labs/dqx/geo/check_funcs.py
mwojtyczka Sep 30, 2025
d2ece65
Update tests/integration/test_row_checks_geo.py
mwojtyczka Sep 30, 2025
8756303
Update tests/integration/test_row_checks_geo.py
mwojtyczka Sep 30, 2025
fd49d00
Update tests/integration/test_row_checks_geo.py
mwojtyczka Sep 30, 2025
66edad2
Update tests/integration/test_row_checks_geo.py
mwojtyczka Sep 30, 2025
bbe8bde
Update tests/integration/test_row_checks_geo.py
mwojtyczka Sep 30, 2025
bfdc650
Update src/databricks/labs/dqx/geo/check_funcs.py
mwojtyczka Sep 30, 2025
2daf11d
Update src/databricks/labs/dqx/geo/check_funcs.py
mwojtyczka Sep 30, 2025
ffc8285
Update src/databricks/labs/dqx/geo/check_funcs.py
mwojtyczka Sep 30, 2025
af715fe
Update src/databricks/labs/dqx/geo/check_funcs.py
mwojtyczka Sep 30, 2025
bb34096
Update src/databricks/labs/dqx/geo/check_funcs.py
mwojtyczka Sep 30, 2025
a5df77f
Update tests/integration/test_row_checks_geo.py
mwojtyczka Sep 30, 2025
1ba578c
Update tests/integration/test_row_checks_geo.py
mwojtyczka Sep 30, 2025
0b4f83a
Update src/databricks/labs/dqx/geo/check_funcs.py
mwojtyczka Sep 30, 2025
e5aab5f
Update src/databricks/labs/dqx/geo/check_funcs.py
mwojtyczka Sep 30, 2025
32d43aa
Update src/databricks/labs/dqx/geo/check_funcs.py
mwojtyczka Sep 30, 2025
69c041e
Update src/databricks/labs/dqx/geo/check_funcs.py
mwojtyczka Sep 30, 2025
8095b29
Update src/databricks/labs/dqx/geo/check_funcs.py
mwojtyczka Sep 30, 2025
0e1eec7
Update src/databricks/labs/dqx/geo/check_funcs.py
mwojtyczka Sep 30, 2025
c9760a3
Update src/databricks/labs/dqx/geo/check_funcs.py
mwojtyczka Sep 30, 2025
775cb84
Update src/databricks/labs/dqx/geo/check_funcs.py
mwojtyczka Sep 30, 2025
91adf6a
Update src/databricks/labs/dqx/geo/check_funcs.py
mwojtyczka Sep 30, 2025
560feeb
Apply suggestion from @mwojtyczka
mwojtyczka Sep 30, 2025
7db6db9
Apply suggestion from @mwojtyczka
mwojtyczka Sep 30, 2025
0027072
Apply suggestion from @mwojtyczka
mwojtyczka Sep 30, 2025
80197d7
Apply suggestion from @mwojtyczka
mwojtyczka Sep 30, 2025
beb3217
Apply suggestion from @mwojtyczka
mwojtyczka Sep 30, 2025
945d031
Apply suggestion from @mwojtyczka
mwojtyczka Sep 30, 2025
0523d67
Apply suggestion from @mwojtyczka
mwojtyczka Sep 30, 2025
0b69618
Apply suggestion from @mwojtyczka
mwojtyczka Sep 30, 2025
7483b6a
Apply suggestion from @mwojtyczka
mwojtyczka Sep 30, 2025
0daae5e
corrected tests, fmt
mwojtyczka Sep 30, 2025
0e4b4fa
remove todos
mwojtyczka Sep 30, 2025
5922211
check if runtime is geo compatible
mwojtyczka Sep 30, 2025
96f0823
expanded integration tests
tdikland Oct 2, 2025
6525f4e
add benchmarks for geo check functions
tdikland Oct 2, 2025
1844672
integation test fixes pt1
tdikland Oct 2, 2025
5381a2b
integation test fixes pt2
tdikland Oct 2, 2025
038677c
Merge branch 'main' into feat/geo
mwojtyczka Oct 2, 2025
22aae45
updated docs
mwojtyczka Oct 2, 2025
1173158
updated tests and docs
mwojtyczka Oct 2, 2025
77f0e48
updated tests
mwojtyczka Oct 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
249 changes: 249 additions & 0 deletions docs/dqx/docs/reference/quality_checks.mdx

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions src/databricks/labs/dqx/checks_resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from contextlib import contextmanager

from databricks.labs.dqx import check_funcs
from databricks.labs.dqx.geo import check_funcs as geo_check_funcs
from databricks.labs.dqx.errors import InvalidCheckError

logger = logging.getLogger(__name__)
Expand All @@ -30,6 +31,8 @@ def resolve_check_function(
"""
logger.debug(f"Resolving function: {function_name}")
func = getattr(check_funcs, function_name, None) # resolve using predefined checks first
if not func:
func = getattr(geo_check_funcs, function_name, None) # resolve using predefined geo checks
if not func and custom_check_functions:
func = custom_check_functions.get(function_name) # returns None if not found
if fail_on_missing and not func:
Expand Down
Empty file.
450 changes: 450 additions & 0 deletions src/databricks/labs/dqx/geo/check_funcs.py

Large diffs are not rendered by default.

85 changes: 85 additions & 0 deletions src/databricks/labs/dqx/llm/resources/yaml_checks_examples.yml
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,91 @@
for_each_column:
- col3
- col5
- criticality: error
check:
function: is_latitude
arguments:
column: col2
- criticality: error
check:
function: is_longitude
arguments:
column: col2
- criticality: error
check:
function: is_geometry
arguments:
column: point_geom
- criticality: error
check:
function: is_geography
arguments:
column: point_geom
- criticality: error
check:
function: is_point
arguments:
column: point_geom
- criticality: error
check:
function: is_linestring
arguments:
column: linestring_geom
- criticality: error
check:
function: is_polygon
arguments:
column: polygon_geom
- criticality: error
check:
function: is_multipoint
arguments:
column: multipoint_geom
- criticality: error
check:
function: is_multilinestring
arguments:
column: multilinestring_geom
- criticality: error
check:
function: is_multipolygon
arguments:
column: multipolygon_geom
- criticality: error
check:
function: is_geometrycollection
arguments:
column: geometrycollection_geom
- criticality: error
check:
function: is_ogc_valid
arguments:
column: point_geom
- criticality: error
check:
function: is_non_empty_geometry
arguments:
column: point_geom
- criticality: error
check:
function: has_dimension
arguments:
column: polygon_geom
dimension: 2
- criticality: error
check:
function: has_x_coordinate_between
arguments:
column: polygon_geom
min_value: 0.0
max_value: 10.0
- criticality: error
check:
function: has_y_coordinate_between
arguments:
column: polygon_geom
min_value: 0.0
max_value: 10.0
- criticality: error
check:
function: is_not_null
Expand Down
39 changes: 39 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import re
from collections.abc import Callable, Generator
from dataclasses import replace
from functools import cached_property
Expand Down Expand Up @@ -40,6 +41,44 @@ def set_utc_timezone():
os.environ.pop("TZ")


@pytest.fixture
def skip_if_runtime_not_geo_compatible(ws, debug_env):
    """
    Skip the requesting test when the target compute lacks the required geo functions.

    Serverless compute is treated as compatible; a standard cluster must run
    runtime 17.1 or above.

    Args:
        ws (WorkspaceClient): Workspace client used to look up the cluster.
        debug_env (dict): Test environment variables.

    Raises:
        ValueError: If DATABRICKS_CLUSTER_ID is missing from debug_env, the cluster
            reports no runtime version, or the version does not start with
            "<major>.<minor>".
    """
    # Serverless compute already has the geo functions, so there is nothing to verify.
    if "DATABRICKS_SERVERLESS_COMPUTE_ID" in debug_env:
        return

    cluster_id = debug_env.get("DATABRICKS_CLUSTER_ID")
    if not cluster_id:
        raise ValueError("DATABRICKS_CLUSTER_ID is not set in debug_env")

    # Look up the cluster and read the runtime (Spark) version it reports.
    spark_version = ws.clusters.get(cluster_id).spark_version
    if not spark_version:
        raise ValueError(f"Unable to retrieve runtime version for cluster {cluster_id}")

    # Only the leading "<major>.<minor>" part of the version string matters here.
    version_match = re.match(r"(\d+)\.(\d+)", spark_version)
    if version_match is None:
        raise ValueError(f"Invalid runtime version format: {spark_version}")

    # Tuple comparison orders by major first, then minor.
    detected = tuple(int(part) for part in version_match.groups())
    if detected < (17, 1):
        pytest.skip("This test requires a cluster with runtime 17.1 or above")


class CommonUtils:
def __init__(self, env_or_skip_fixture: Callable[[str], str], ws: WorkspaceClient):
self._env_or_skip = env_or_skip_fixture
Expand Down
Loading