From 643dd20b8beb27d3bddc035bcc47eff61edfa86a Mon Sep 17 00:00:00 2001 From: Tim Dikland Date: Fri, 26 Sep 2025 08:34:35 +0200 Subject: [PATCH 01/47] Initial geospatal data quality checks --- .github/workflows/acceptance.yml | 12 +- .github/workflows/docs-release.yml | 3 +- .github/workflows/downstreams.yml | 3 +- .github/workflows/nightly.yml | 17 +- .github/workflows/performance.yml | 3 +- .github/workflows/push.yml | 3 +- .github/workflows/release.yml | 3 +- demos/dqx_demo_tool.py | 47 ++- docs/dqx/docs/dev/contributing.mdx | 9 + .../dqx/docs/guide/quality_checks_storage.mdx | 21 +- docs/dqx/docs/installation.mdx | 36 +- docs/dqx/docs/reference/cli.mdx | 5 +- docs/dqx/docs/reference/engine.mdx | 43 +-- pyproject.toml | 2 +- src/databricks/labs/dqx/checks_storage.py | 9 +- src/databricks/labs/dqx/config.py | 5 + src/databricks/labs/dqx/config_loader.py | 46 ++- src/databricks/labs/dqx/engine.py | 22 +- src/databricks/labs/dqx/geo/__init__.py | 0 src/databricks/labs/dqx/geo/check_funcs.py | 345 ++++++++++++++++++ .../labs/dqx/installer/config_provider.py | 20 +- src/databricks/labs/dqx/installer/install.py | 33 +- .../labs/dqx/profiler/profiler_workflow.py | 1 + .../quality_checker_workflow.py | 1 + src/databricks/labs/dqx/workflows_runner.py | 5 +- tests/conftest.py | 21 +- tests/integration/conftest.py | 33 ++ tests/integration/test_config.py | 31 +- tests/integration/test_e2e_workflow.py | 23 ++ tests/integration/test_installation.py | 171 ++++++++- .../test_load_checks_from_workspace_file.py | 17 + tests/integration/test_profiler_workflow.py | 21 ++ .../test_quality_checker_workflow.py | 11 + tests/integration/test_row_checks_geo.py | 256 +++++++++++++ .../test_save_checks_to_workspace_file.py | 23 +- .../integration/test_save_results_in_table.py | 37 ++ 36 files changed, 1240 insertions(+), 98 deletions(-) create mode 100644 src/databricks/labs/dqx/geo/__init__.py create mode 100644 src/databricks/labs/dqx/geo/check_funcs.py create mode 100644 tests/integration/test_row_checks_geo.py diff --git a/.github/workflows/acceptance.yml b/.github/workflows/acceptance.yml index 1409b1d8..58310ee2 100644 --- a/.github/workflows/acceptance.yml +++ b/.github/workflows/acceptance.yml @@ -41,7 +41,8 @@ jobs: python-version: '3.12' - name: Install hatch - run: pip install hatch==1.9.4 + # click 8.3+ introduced bug for hatch + run: pip install "hatch==1.13.0" "click<8.3" - name: Run unit tests and generate test coverage report run: make test @@ -93,7 +94,8 @@ jobs: python-version: '3.12' - name: Install hatch - run: pip install hatch==1.9.4 + # click 8.3+ introduced bug for hatch + run: pip install "hatch==1.13.0" "click<8.3" - name: Run integration tests on serverless cluster uses: databrickslabs/sandbox/acceptance@acceptance/v0.4.4 @@ -125,7 +127,8 @@ jobs: python-version: '3.12' - name: Install hatch - run: pip install hatch==1.9.4 + # click 8.3+ introduced bug for hatch + run: pip install "hatch==1.13.0" "click<8.3" - name: Install Databricks CLI run: | @@ -177,7 +180,8 @@ jobs: python-version: '3.12' - name: Install hatch - run: pip install hatch==1.9.4 + # click 8.3+ introduced bug for hatch + run: pip install "hatch==1.13.0" "click<8.3" - name: Install Databricks CLI run: | diff --git a/.github/workflows/docs-release.yml b/.github/workflows/docs-release.yml index 052a9e59..a08228f5 100644 --- a/.github/workflows/docs-release.yml +++ b/.github/workflows/docs-release.yml @@ -28,7 +28,8 @@ jobs: - name: Install Hatch run: | - pip install hatch==1.9.4 + # click 8.3+ introduced bug for hatch + pip 
install "hatch==1.13.0" "click<8.3" - uses: actions/setup-node@v4 with: diff --git a/.github/workflows/downstreams.yml b/.github/workflows/downstreams.yml index 982f5019..4f497c3d 100644 --- a/.github/workflows/downstreams.yml +++ b/.github/workflows/downstreams.yml @@ -43,7 +43,8 @@ jobs: - name: Install toolchain run: | - pip install hatch==1.9.4 + # click 8.3+ introduced bug for hatch + pip install "hatch==1.13.0" "click<8.3" - name: Check downstream compatibility uses: databrickslabs/sandbox/downstreams@downstreams/v0.0.1 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 835ca416..aa73286b 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -32,7 +32,8 @@ jobs: python-version: '3.12' - name: Install hatch - run: pip install hatch==1.9.4 + # click 8.3+ introduced bug for hatch + run: pip install "hatch==1.13.0" "click<8.3" - name: Run unit tests and generate test coverage report run: make test @@ -81,7 +82,8 @@ jobs: python-version: '3.12' - name: Install hatch - run: pip install hatch==1.9.4 + # click 8.3+ introduced bug for hatch + run: pip install "hatch==1.13.0" "click<8.3" - name: Run integration tests on serverless cluster uses: databrickslabs/sandbox/acceptance@acceptance/v0.4.4 @@ -97,7 +99,6 @@ jobs: DATABRICKS_SERVERLESS_COMPUTE_ID: ${{ env.DATABRICKS_SERVERLESS_COMPUTE_ID }} e2e: - if: github.event_name == 'pull_request' && !github.event.pull_request.draft && !github.event.pull_request.head.repo.fork environment: tool runs-on: larger steps: @@ -114,7 +115,8 @@ jobs: python-version: '3.12' - name: Install hatch - run: pip install hatch==1.9.4 + # click 8.3+ introduced bug for hatch + run: pip install "hatch==1.13.0" "click<8.3" - name: Install Databricks CLI run: | @@ -147,7 +149,6 @@ jobs: ARM_TENANT_ID: ${{ secrets.ARM_TENANT_ID }} e2e_serverless: - if: github.event_name == 'pull_request' && !github.event.pull_request.draft && !github.event.pull_request.head.repo.fork environment: tool runs-on: larger env: @@ -166,7 +167,8 @@ jobs: python-version: '3.12' - name: Install hatch - run: pip install hatch==1.9.4 + # click 8.3+ introduced bug for hatch + run: pip install "hatch==1.13.0" "click<8.3" - name: Install Databricks CLI run: | @@ -219,7 +221,8 @@ jobs: python-version: '3.12' - name: Install hatch - run: pip install hatch==1.9.4 + # click 8.3+ introduced bug for hatch + run: pip install "hatch==1.13.0" "click<8.3" - name: Login to Azure for azure-cli authentication uses: azure/login@v2 diff --git a/.github/workflows/performance.yml b/.github/workflows/performance.yml index cda38de9..cf84000c 100644 --- a/.github/workflows/performance.yml +++ b/.github/workflows/performance.yml @@ -43,7 +43,8 @@ jobs: cache-dependency-path: '**/pyproject.toml' - name: Install hatch - run: pip install hatch==1.9.4 + # click 8.3+ introduced bug for hatch + run: pip install "hatch==1.13.0" "click<8.3" - name: Login to Azure for azure-cli authentication uses: azure/login@v2 diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 6f14655e..d45a722e 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -37,7 +37,8 @@ jobs: - name: Run unit tests run: | - pip install hatch==1.9.4 + # click 8.3+ introduced bug for hatch + pip install "hatch==1.13.0" "click<8.3" make test fmt: diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 02571898..469b80b3 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -27,7 +27,8 @@ jobs: - name: Build wheels 
run: | - pip install hatch==1.9.4 + # click 8.3+ introduced bug for hatch + pip install "hatch==1.13.0" "click<8.3" hatch build - name: Github release diff --git a/demos/dqx_demo_tool.py b/demos/dqx_demo_tool.py index c730d805..fe4538b5 100644 --- a/demos/dqx_demo_tool.py +++ b/demos/dqx_demo_tool.py @@ -44,6 +44,14 @@ # MAGIC summary_stats_file: profile_summary_stats.yml # MAGIC warehouse_id: your-warehouse-id # MAGIC ``` +# MAGIC +# MAGIC If you install DQX using custom installation path you must update `custom_install_path` variable below. Installation using custom path is required when using [group assigned cluster](https://docs.databricks.com/aws/en/compute/group-access)! + +# COMMAND ---------- + +# Updated the installation path if you install DQX in a custom folder! +custom_install_path: str = "" +dbutils.widgets.text("dqx_custom_installation_path", custom_install_path, "DQX Custom Installation Path") # COMMAND ---------- @@ -107,8 +115,14 @@ import glob import os -user_name = spark.sql("select current_user() as user").collect()[0]["user"] -default_dqx_installation_path = f"/Workspace/Users/{user_name}/.dqx" +if custom_install_path: + default_dqx_installation_path = custom_install_path + print(f"Using custom installation path: {custom_install_path}") +else: + user_name = spark.sql("select current_user() as user").collect()[0]["user"] + default_dqx_installation_path = f"/Workspace/Users/{user_name}/.dqx" + print(f"Using default user's home installation path: {default_dqx_installation_path}") + default_dqx_product_name = "dqx" dbutils.widgets.text("dqx_installation_path", default_dqx_installation_path, "DQX Installation Folder") @@ -116,6 +130,7 @@ dqx_wheel_files_path = f"{dbutils.widgets.get('dqx_installation_path')}/wheels/databricks_labs_dqx-*.whl" dqx_wheel_files = glob.glob(dqx_wheel_files_path) + try: dqx_latest_wheel = max(dqx_wheel_files, key=os.path.getctime) except: @@ -126,6 +141,10 @@ # COMMAND ---------- +custom_install_path = dbutils.widgets.get('dqx_custom_installation_path') or None + +# COMMAND ---------- + # MAGIC %md # MAGIC ### Run profiler workflow to generate quality rule candidates # MAGIC @@ -162,7 +181,9 @@ dq_engine = DQEngine(ws) # load the run configuration -run_config = RunConfigLoader(ws).load_run_config(run_config_name="default", product_name=dqx_product_name) +run_config = RunConfigLoader(ws).load_run_config( + run_config_name="default", product_name=dqx_product_name, install_folder=custom_install_path +) # read the input data, limit to 1000 rows for demo purpose input_df = read_input_data(spark, run_config.input_config).limit(1000) @@ -180,7 +201,10 @@ print(yaml.safe_dump(checks)) # save generated checks to location specified in the default run configuration inside workspace installation folder -dq_engine.save_checks(checks, config=InstallationChecksStorageConfig(run_config_name="default", product_name=dqx_product_name)) +dq_engine.save_checks(checks, config=InstallationChecksStorageConfig( + run_config_name="default", product_name=dqx_product_name, install_folder=custom_install_path + ) +) # or save checks in arbitrary workspace location #dq_engine.save_checks(checks, config=WorkspaceFileChecksStorageConfig(location="/Shared/App1/checks.yml")) @@ -245,7 +269,10 @@ dq_engine = DQEngine(WorkspaceClient()) # save checks to location specified in the default run configuration inside workspace installation folder -dq_engine.save_checks(checks, config=InstallationChecksStorageConfig(run_config_name="default", product_name=dqx_product_name)) 
+dq_engine.save_checks(checks, config=InstallationChecksStorageConfig( + run_config_name="default", product_name=dqx_product_name, install_folder=custom_install_path + ) +) # or save checks in arbitrary workspace location #dq_engine.save_checks(checks, config=WorkspaceFileChecksStorageConfig(location="/Shared/App1/checks.yml")) @@ -267,7 +294,9 @@ dq_engine = DQEngine(WorkspaceClient()) # load the run configuration -run_config = RunConfigLoader(ws).load_run_config(run_config_name="default", assume_user=True, product_name=dqx_product_name) +run_config = RunConfigLoader(ws).load_run_config( + run_config_name="default", assume_user=True, product_name=dqx_product_name, install_folder=custom_install_path +) # read the data, limit to 1000 rows for demo purpose bronze_df = read_input_data(spark, run_config.input_config).limit(1000) @@ -276,8 +305,10 @@ bronze_transformed_df = bronze_df.filter("vendor_id in (1, 2)") # load checks from location defined in the run configuration - -checks = dq_engine.load_checks(config=InstallationChecksStorageConfig(assume_user=True, run_config_name="default", product_name=dqx_product_name)) +checks = dq_engine.load_checks(config=InstallationChecksStorageConfig( + assume_user=True, run_config_name="default", product_name=dqx_product_name, install_folder=custom_install_path + ) +) # or load checks from arbitrary workspace file #checks = dq_engine.load_checks(config=WorkspaceFileChecksStorageConfig(location="/Shared/App1/checks.yml")) diff --git a/docs/dqx/docs/dev/contributing.mdx b/docs/dqx/docs/dev/contributing.mdx index afd108a4..4c058b41 100644 --- a/docs/dqx/docs/dev/contributing.mdx +++ b/docs/dqx/docs/dev/contributing.mdx @@ -333,6 +333,15 @@ git push --force-with-lease origin HEAD If you encounter any package dependency errors after `git pull`, run `make clean` +### Resolving Hatch JSON TypeError + +If you encounter an error like: +```text +TypeError: the JSON object must be str, bytes or bytearray, not Sentinel +``` + +you can resolve it by downgrading the Click package to a compatible version that works with hatch: `pip install "click<8.3"` + ### Common fixes for `mypy` errors See https://mypy.readthedocs.io/en/stable/cheat_sheet_py3.html for more details diff --git a/docs/dqx/docs/guide/quality_checks_storage.mdx b/docs/dqx/docs/guide/quality_checks_storage.mdx index a1f858f6..0bf7c7a8 100644 --- a/docs/dqx/docs/guide/quality_checks_storage.mdx +++ b/docs/dqx/docs/guide/quality_checks_storage.mdx @@ -12,11 +12,22 @@ import TabItem from '@theme/TabItem'; DQX provides flexible methods to load and save quality checks (rules) defined as metadata (a list of dictionaries) from different storage backends, making it easier to manage, share, and reuse checks across workflows and environments. Saving and loading methods accept a storage backend configuration as input. The following backend configuration are currently supported: -- `FileChecksStorageConfig`: local files (JSON/YAML), or workspace files if invoked from Databricks notebook or job -- `WorkspaceFileChecksStorageConfig`: workspace files (JSON/YAML) using absolute paths -- `VolumeFileChecksStorageConfig`: Unity Catalog volumes (JSON/YAML file) -- `TableChecksStorageConfig`: Unity Catalog tables -- `InstallationChecksStorageConfig`: installation-managed location from the run config, ignores location and infers it from `checks_location` in the run config +* `FileChecksStorageConfig`: local files (JSON/YAML), or workspace files if invoked from Databricks notebook or job. 
Containing fields: + * `location`: absolute or relative file path in the local filesystem (JSON or YAML); also works with absolute or relative workspace file paths if invoked from Databricks notebook or job. +* `WorkspaceFileChecksStorageConfig`: workspace files (JSON/YAML) using absolute paths. Containing fields: + * `location`: absolute workspace file path (JSON or YAML). +* `TableChecksStorageConfig`: Unity Catalog tables. Containing fields: + * `location`: table fully qualified name. + * `run_config_name`: (optional) run configuration name to load (use "default" if not provided). + * `mode`: (optional) write mode for saving checks (`overwrite` or `append`, default is `overwrite`). The `overwrite` mode will only replace checks for the specific run config and not all checks in the table. +* `VolumeFileChecksStorageConfig`: Unity Catalog volumes (JSON/YAML file). Containing fields: + * `location`: Unity Catalog Volume file path (JSON or YAML). +* `InstallationChecksStorageConfig`: installation-managed location from the run config, ignores location and infers it from `checks_location` in the run config. Containing fields: + * `location` (optional): automatically set based on the `checks_location` field from the run configuration. + * `install_folder`: (optional) installation folder where DQX is installed, only required when custom installation folder is used. + * `run_config_name` (optional) - run configuration name to load (use "default" if not provided). + * `product_name`: (optional) name of the product (use "dqx" if not provided). + * `assume_user`: (optional) if True, assume user installation, otherwise global installation (skipped if `install_folder` is provided). You can find details on how to define checks [here](/docs/guide/quality_checks_definition). diff --git a/docs/dqx/docs/installation.mdx b/docs/dqx/docs/installation.mdx index e3a02d24..0b1854f9 100644 --- a/docs/dqx/docs/installation.mdx +++ b/docs/dqx/docs/installation.mdx @@ -82,8 +82,7 @@ Install a specific version of DQX in your Databricks workspace via Databricks CL databricks labs install dqx@v0.8.0 ``` -You'll be prompted to select a [configuration profile](https://docs.databricks.com/en/dev-tools/auth.html#databricks-client-unified-authentication) created by `databricks auth login` command, -and other configuration options. +You'll be prompted to select a [configuration profile](https://docs.databricks.com/en/dev-tools/auth.html#databricks-client-unified-authentication) created by `databricks auth login` command, and other configuration options. The cli command will install the following components in the workspace installation folder: - A Python [wheel file](https://peps.python.org/pep-0427/) with the library packaged. @@ -98,19 +97,38 @@ It is recommended to use serverless clusters for the workflows, as it allows for If serverless clusters are not used, a default cluster configuration will be used for the workflows. Alternatively, you can override the cluster configuration for each workflow in the `config.yml` file after the installation, to provide existing clusters to use. -#### User vs Global Installation +#### Installation Options -DQX is installed by default in the user home directory (under `/Users//.dqx`). You can also install DQX globally -by setting the 'DQX_FORCE_INSTALL' environment variable. The following options are available: +DQX offers flexible installation options. By default, DQX is installed in the user home directory under `/Users//.dqx`. 
+You can also install DQX in a global folder or any custom workspace folder. -* `DQX_FORCE_INSTALL=global databricks labs install dqx`: will force the installation to be for root only (`/Applications/dqx`) -* `DQX_FORCE_INSTALL=user databricks labs install dqx`: will force the installation to be for user only (`/Users//.dqx`) +**Environment Variable Override:** + +You can force global or user installation using the 'DQX_FORCE_INSTALL' environment variable: + * `DQX_FORCE_INSTALL=global databricks labs install dqx`: forces installation to `/Applications/dqx` + * `DQX_FORCE_INSTALL=user databricks labs install dqx`: forces installation to `/Users//.dqx` (default behaviour) + +**Custom Workspace Folder:** + +If you provide a custom path during installation (e.g., `/Shared/dqx-team` or `/Users/shared-user/dqx-project`), DQX will be installed there. +You will be prompted to optionally enter a workspace path when installing DQX. + +The custom folder installation is required when using [Group Assigned cluster](https://docs.databricks.com/aws/en/compute/group-access), +as the concept of a user home directory does not exist in this setup. + + +If a custom folder is provided during installation, the installation folder will take precedence over any environment variables (e.g. `DQX_FORCE_INSTALL`). + #### Configuration file DQX configuration file can contain multiple run configurations for different pipelines or projects, each defining specific input, output and quarantine locations, etc. -By default, the config is created in the installation directory under `/Users//.dqx/config.yml` or `/Applications/dqx/config.yml` if installed globally. -The "default" run configuration is created during the installation. When DQX is upgraded, the configuration is preserved. +The configuration file is created in the installation directory depending on installation options (see above): +- User home (default): `/Users//.dqx/config.yml` +- Global: `/Applications/dqx/config.yml` (when DQX_FORCE_INSTALL=global) +- Custom folder: `/config.yml` (when provided during installation) + +A "default" run configuration is created during the installation. When DQX is upgraded, the configuration is preserved. The configuration can be updated / extended manually by the user after the installation. Each run config defines configuration for one specific input and output location. Open the configuration file: diff --git a/docs/dqx/docs/reference/cli.mdx b/docs/dqx/docs/reference/cli.mdx index 6e948393..5c885c4f 100644 --- a/docs/dqx/docs/reference/cli.mdx +++ b/docs/dqx/docs/reference/cli.mdx @@ -24,8 +24,9 @@ databricks labs uninstall dqx ``` -By default, DQX is installed under the user's home (for example `/Users//.dqx`). -Use the `DQX_FORCE_INSTALL` env var to force a global or user install. See the installation guide for details. +By default, DQX is installed under the user's home folder (for example `/Users//.dqx`). +Use the `DQX_FORCE_INSTALL` env var to force a global or user install. Provide a custom installation +folder during installation to override the default location. See the installation guide for details. ## Configuration helpers diff --git a/docs/dqx/docs/reference/engine.mdx b/docs/dqx/docs/reference/engine.mdx index 84ab7e44..8195a433 100644 --- a/docs/dqx/docs/reference/engine.mdx +++ b/docs/dqx/docs/reference/engine.mdx @@ -49,20 +49,20 @@ The following table outlines the available methods of the `DQEngine` and their f
**Available DQX engine methods** -| Method | Description | Arguments | Supports local execution | -| ---------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------- | -| `apply_checks` | Applies quality checks to the DataFrame and returns a DataFrame with result columns. | `df`: DataFrame to check; `checks`: List of checks defined using DQX classes, each check is an instance of the DQRule class; `ref_dfs`: Reference dataframes to use in the checks, if applicable. | Yes | -| `apply_checks_and_split` | Applies quality checks to the DataFrame and returns valid and invalid (quarantine) DataFrames with result columns. | `df`: DataFrame to check; `checks`: List of checks defined using DQX classes, each check is an instance of the DQRule class; `ref_dfs`: Reference dataframes to use in the checks, if applicable. | Yes | -| `apply_checks_and_save_in_table` | Applies quality checks using DQRule objects and writes results to valid and invalid Delta table(s) with result columns. | `input_config`: `InputConfig` object with the table name and options for reading the input data; `checks`: List of checks defined using DQX classes, each check is an instance of the DQRule class; `output_config`: `OutputConfig` object with the table name, output mode, and options for the output data; `quarantine_config`: `OutputConfig` object with the table name, output mode, and options for the quarantine data - if provided, data will be split; `ref_dfs`: Reference dataframes to use in the checks, if applicable. | No | -| `apply_checks_by_metadata` | Applies quality checks defined as a dictionary to the DataFrame and returns a DataFrame with result columns. | `df`: DataFrame to check; `checks`: List of checks defined as dictionary; `custom_check_functions`: (optional) dictionary with custom check functions (e.g., globals() of the calling module); `ref_dfs`: Reference dataframes to use in the checks, if applicable. | Yes | -| `apply_checks_by_metadata_and_split` | Applies quality checks defined as a dictionary and returns valid and invalid (quarantine) DataFrames. | `df`: DataFrame to check; `checks`: List of checks defined as dictionary; `custom_check_functions`: (optional) dictionary with custom check functions (e.g., globals() of the calling module); `ref_dfs`: Reference dataframes to use in the checks, if applicable. | Yes | -| `apply_checks_by_metadata_and_save_in_table` | Applies quality checks defined as a dictionary and writes results to valid and invalid Delta table(s) with result columns. 
| `input_config`: `InputConfig` object with the table name and options for reading the input data; `checks`: List of checks defined as dictionary; `output_config`: `OutputConfig` object with the table name, output mode, and options for the output data; `quarantine_config`: `OutputConfig` object with the table name, output mode, and options for the quarantine data - if provided, data will be split; `custom_check_functions`: (optional) dictionary with custom check functions; `ref_dfs`: Reference dataframes to use in the checks, if applicable. | No | -| `validate_checks` | Validates the provided quality checks to ensure they conform to the expected structure and types. | `checks`: List of checks to validate; `custom_check_functions`: (optional) dictionary of custom check functions that can be used; `validate_custom_check_functions`: (optional) if set to True, validates custom check functions (defaults to True). | Yes | -| `get_invalid` | Retrieves records from the DataFrame that violate data quality checks (records with warnings and errors). | `df`: Input DataFrame. | Yes | -| `get_valid` | Retrieves records from the DataFrame that pass all data quality checks. | `df`: Input DataFrame. | Yes | -| `load_checks` | Loads quality rules (checks) from storage backend. Multiple storage backends are supported including tables, files or workspace files, installation-managed sources where the location is inferred automatically from run config. | `config`: Configuration for loading checks from a storage backend, i.e. `FileChecksStorageConfig`: file in a local filesystem (YAML or JSON), or workspace files if invoked from Databricks notebook or job; `WorkspaceFileChecksStorageConfig`: file in a workspace (YAML or JSON) using absolute paths; `VolumeFileChecksStorageConfig`: file in a Unity Catalog Volume (YAML or JSON); `TableChecksStorageConfig`: a table; `InstallationChecksStorageConfig`: installation-managed storage backend, using the `checks_location` field from the run configuration. See more details below. | Yes (only with `FileChecksStorageConfig`) | -| `save_checks` | Saves quality rules (checks) to storage backend. Multiple storage backends are supported including tables, files or workspace files, installation-managed targets where the location is inferred automatically from run config. | `checks`: List of checks defined as dictionary; `config`: Configuration for saving checks in a storage backend, i.e. `FileChecksStorageConfig`: file in a local filesystem (YAML or JSON), or workspace files if invoked from Databricks notebook or job; `WorkspaceFileChecksStorageConfig`: file in a workspace (YAML or JSON); `VolumeFileChecksStorageConfig`: file in a Unity Catalog Volume (YAML or JSON); `TableChecksStorageConfig`: a table; `InstallationChecksStorageConfig`: storage defined in the installation context, using the `checks_location` field from the run configuration. See more details below. | Yes (only with `FileChecksStorageConfig`) | -| `save_results_in_table` | Save quality checking results in delta table(s). | `output_df`: (optional) Dataframe containing the output data; `quarantine_df`: (optional) Dataframe containing the output data; `output_config`: `OutputConfig` object with the table name, output mode, and options for the output data; `quarantine_config`: `OutputConfig` object with the table name, output mode, and options for the quarantine data - if provided, data will be split; `run_config_name`: Name of the run config to use; `assume_user`: If True, assume user installation. 
| No | +| Method | Description | Arguments | Supports local execution | +| ---------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------- | +| `apply_checks` | Applies quality checks to the DataFrame and returns a DataFrame with result columns. | `df`: DataFrame to check; `checks`: List of checks defined using DQX classes, each check is an instance of the DQRule class; `ref_dfs`: Reference dataframes to use in the checks, if applicable. | Yes | +| `apply_checks_and_split` | Applies quality checks to the DataFrame and returns valid and invalid (quarantine) DataFrames with result columns. | `df`: DataFrame to check; `checks`: List of checks defined using DQX classes, each check is an instance of the DQRule class; `ref_dfs`: Reference dataframes to use in the checks, if applicable. | Yes | +| `apply_checks_and_save_in_table` | Applies quality checks using DQRule objects and writes results to valid and invalid Delta table(s) with result columns. | `input_config`: `InputConfig` object with the table name and options for reading the input data; `checks`: List of checks defined using DQX classes, each check is an instance of the DQRule class; `output_config`: `OutputConfig` object with the table name, output mode, and options for the output data; `quarantine_config`: `OutputConfig` object with the table name, output mode, and options for the quarantine data - if provided, data will be split; `ref_dfs`: Reference dataframes to use in the checks, if applicable. | No | +| `apply_checks_by_metadata` | Applies quality checks defined as a dictionary to the DataFrame and returns a DataFrame with result columns. | `df`: DataFrame to check; `checks`: List of checks defined as dictionary; `custom_check_functions`: (optional) dictionary with custom check functions (e.g., globals() of the calling module); `ref_dfs`: Reference dataframes to use in the checks, if applicable. | Yes | +| `apply_checks_by_metadata_and_split` | Applies quality checks defined as a dictionary and returns valid and invalid (quarantine) DataFrames. | `df`: DataFrame to check; `checks`: List of checks defined as dictionary; `custom_check_functions`: (optional) dictionary with custom check functions (e.g., globals() of the calling module); `ref_dfs`: Reference dataframes to use in the checks, if applicable. | Yes | +| `apply_checks_by_metadata_and_save_in_table` | Applies quality checks defined as a dictionary and writes results to valid and invalid Delta table(s) with result columns. 
| `input_config`: `InputConfig` object with the table name and options for reading the input data; `checks`: List of checks defined as dictionary; `output_config`: `OutputConfig` object with the table name, output mode, and options for the output data; `quarantine_config`: `OutputConfig` object with the table name, output mode, and options for the quarantine data - if provided, data will be split; `custom_check_functions`: (optional) dictionary with custom check functions; `ref_dfs`: Reference dataframes to use in the checks, if applicable. | No | +| `validate_checks` | Validates the provided quality checks to ensure they conform to the expected structure and types. | `checks`: List of checks to validate; `custom_check_functions`: (optional) dictionary of custom check functions that can be used; `validate_custom_check_functions`: (optional) if set to True, validates custom check functions (defaults to True). | Yes | +| `get_invalid` | Retrieves records from the DataFrame that violate data quality checks (records with warnings and errors). | `df`: Input DataFrame. | Yes | +| `get_valid` | Retrieves records from the DataFrame that pass all data quality checks. | `df`: Input DataFrame. | Yes | +| `load_checks` | Loads quality rules (checks) from storage backend. Multiple storage backends are supported including tables, files or workspace files, installation-managed sources where the location is inferred automatically from run config. | `config`: Configuration for loading checks from a storage backend, i.e. `FileChecksStorageConfig`: file in a local filesystem (YAML or JSON), or workspace files if invoked from Databricks notebook or job; `WorkspaceFileChecksStorageConfig`: file in a workspace (YAML or JSON) using absolute paths; `VolumeFileChecksStorageConfig`: file in a Unity Catalog Volume (YAML or JSON); `TableChecksStorageConfig`: a table; `InstallationChecksStorageConfig`: installation-managed storage backend, using the `checks_location` field from the run configuration. See more details below. | Yes (only with `FileChecksStorageConfig`) | +| `save_checks` | Saves quality rules (checks) to storage backend. Multiple storage backends are supported including tables, files or workspace files, installation-managed targets where the location is inferred automatically from run config. | `checks`: List of checks defined as dictionary; `config`: Configuration for saving checks in a storage backend, i.e. `FileChecksStorageConfig`: file in a local filesystem (YAML or JSON), or workspace files if invoked from Databricks notebook or job; `WorkspaceFileChecksStorageConfig`: file in a workspace (YAML or JSON); `VolumeFileChecksStorageConfig`: file in a Unity Catalog Volume (YAML or JSON); `TableChecksStorageConfig`: a table; `InstallationChecksStorageConfig`: storage defined in the installation context, using the `checks_location` field from the run configuration. See more details below. | Yes (only with `FileChecksStorageConfig`) | +| `save_results_in_table` | Save quality checking results in delta table(s). 
| `output_df`: (optional) Dataframe containing the output data; `quarantine_df`: (optional) Dataframe containing the output data; `output_config`: `OutputConfig` object with the table name, output mode, and options for the output data; `quarantine_config`: `OutputConfig` object with the table name, output mode, and options for the quarantine data - if provided, data will be split; `run_config_name`: Name of the run config to use; `install_folder`: (optional) installation folder where DQX is installed, only required when custom installation folder is used; `assume_user`: (optional) If True, assume user installation, otherwise global installation (skipped if `install_folder` is provided). | No | The 'Supports local execution' in the above table indicates which methods can be used for local testing without a Databricks workspace (see the usage in [local testing section](/docs/reference/testing/#local-execution-and-testing-with-dqengine)). @@ -82,20 +82,21 @@ The 'Supports local execution' in the above table indicates which methods can be Supported storage backend configurations (implementations of `BaseChecksStorageConfig`) for `load_checks` and `save_checks` methods: * `FileChecksStorageConfig` can be used to save or load checks from a local filesystem, or workspace file if invoked from Databricks notebook or job, with fields: - * `location`: absolute or relative file path in the local filesystem (JSON or YAML); also works with absolute or relative workspace file paths if invoked from Databricks notebook or job + * `location`: absolute or relative file path in the local filesystem (JSON or YAML); also works with absolute or relative workspace file paths if invoked from Databricks notebook or job. * `WorkspaceFileChecksStorageConfig` can be used to save or load checks from a workspace file, with fields: - * `location`: absolute workspace file path (JSON or YAML) + * `location`: absolute workspace file path (JSON or YAML). * `TableChecksStorageConfig` can be used to save or load checks from a table, with fields: - * `location`: table fully qualified name - * `run_config_name`: (optional) run configuration name to load (use "default" if not provided) + * `location`: table fully qualified name. + * `run_config_name`: (optional) run configuration name to load (use "default" if not provided). * `mode`: (optional) write mode for saving checks (`overwrite` or `append`, default is `overwrite`). The `overwrite` mode will only replace checks for the specific run config and not all checks in the table. * `VolumeFileChecksStorageConfig` can be used to save or load checks from a Unity Catalog Volume file, with fields: - * `location`: Unity Catalog Volume file path (JSON or YAML) + * `location`: Unity Catalog Volume file path (JSON or YAML). * `InstallationChecksStorageConfig` can be used to save or load checks from workspace installation, with fields: * `location` (optional): automatically set based on the `checks_location` field from the run configuration. + * `install_folder`: (optional) installation folder where DQX is installed, only required when custom installation folder is used. * `run_config_name` (optional) - run configuration name to load (use "default" if not provided). - * `product_name`: name of the product/installation directory - * `assume_user`: if True, assume user installation + * `product_name`: (optional) name of the product (use "dqx" if not provided). 
+ * `assume_user`: (optional) if True, assume user installation, otherwise global installation (skipped if `install_folder` is provided). For details on how to prepare reference DataFrames (`ref_dfs`) and custom check function mapping (`custom_check_functions`) refer to [Quality Checks Reference](/docs/reference/quality_checks).
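To make the new `install_folder` option above concrete, here is a minimal illustrative sketch (not part of the patch) of loading and saving checks against a custom installation folder; the folder path is a placeholder, and the calls mirror the `DQEngine` API documented in the table above:

```python
from databricks.sdk import WorkspaceClient
from databricks.labs.dqx.engine import DQEngine
from databricks.labs.dqx.config import InstallationChecksStorageConfig

dq_engine = DQEngine(WorkspaceClient())

# Resolve the checks location from the "default" run config stored under a custom
# installation folder (placeholder path; omit install_folder for user/global installs).
storage_config = InstallationChecksStorageConfig(
    run_config_name="default",
    product_name="dqx",
    install_folder="/Shared/dqx-team",
)

checks = dq_engine.load_checks(config=storage_config)
dq_engine.save_checks(checks, config=storage_config)
```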
diff --git a/pyproject.toml b/pyproject.toml index 22651cb6..2d44164e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -95,7 +95,7 @@ dependencies = [ "pyspark~=3.5.0", "dbldatagen~=0.4.0", "pyparsing~=3.2.3", - "jmespath~=1.0.1" + "jmespath~=1.0.1", ] python="3.12" # must match the version required by databricks-connect and python version on the test clusters diff --git a/src/databricks/labs/dqx/checks_storage.py b/src/databricks/labs/dqx/checks_storage.py index 180d81e6..9607db02 100644 --- a/src/databricks/labs/dqx/checks_storage.py +++ b/src/databricks/labs/dqx/checks_storage.py @@ -262,9 +262,14 @@ def _get_storage_handler_and_config( self, config: InstallationChecksStorageConfig ) -> tuple[ChecksStorageHandler, InstallationChecksStorageConfig]: run_config = self._run_config_loader.load_run_config( - config.run_config_name, config.assume_user, config.product_name + run_config_name=config.run_config_name, + assume_user=config.assume_user, + product_name=config.product_name, + install_folder=config.install_folder, + ) + installation = self._run_config_loader.get_installation( + config.assume_user, config.product_name, config.install_folder ) - installation = self._run_config_loader.get_installation(config.assume_user, config.product_name) config.location = run_config.checks_location diff --git a/src/databricks/labs/dqx/config.py b/src/databricks/labs/dqx/config.py index 4eaf67a8..d2df1398 100644 --- a/src/databricks/labs/dqx/config.py +++ b/src/databricks/labs/dqx/config.py @@ -219,9 +219,14 @@ class InstallationChecksStorageConfig( run_config_name: The name of the run configuration to use for checks (default is 'default'). product_name: The product name for retrieving checks from the installation (default is 'dqx'). assume_user: Whether to assume the user is the owner of the checks (default is True). + install_folder: The installation folder where DQX is installed. + DQX will be installed in a default directory if no custom folder is provided: + * User's home directory: "/Users//.dqx" + * Global directory if `DQX_FORCE_INSTALL=global`: "/Applications/dqx" """ location: str = "installation" # retrieved from the installation config run_config_name: str = "default" # to retrieve run config product_name: str = "dqx" assume_user: bool = True + install_folder: str | None = None diff --git a/src/databricks/labs/dqx/config_loader.py b/src/databricks/labs/dqx/config_loader.py index b42d7fd0..6bad5877 100644 --- a/src/databricks/labs/dqx/config_loader.py +++ b/src/databricks/labs/dqx/config_loader.py @@ -1,5 +1,6 @@ from databricks.labs.blueprint.installation import Installation from databricks.sdk import WorkspaceClient +from databricks.sdk.errors import NotFound from databricks.labs.dqx.config import RunConfig, WorkspaceConfig @@ -13,36 +14,67 @@ def __init__(self, workspace_client: WorkspaceClient): self.ws = workspace_client def load_run_config( - self, run_config_name: str | None, assume_user: bool = True, product_name: str = "dqx" + self, + run_config_name: str | None, + install_folder: str | None = None, + assume_user: bool = True, + product_name: str = "dqx", ) -> RunConfig: """ Load run configuration from the installation. Args: - run_config_name: name of the run configuration to use - assume_user: if True, assume user installation - product_name: name of the product + run_config_name: Name of the run configuration to use. + install_folder: Custom workspace installation folder. Required if DQX is installed in a custom folder. 
+ assume_user: Whether to assume a per-user installation when loading the run configuration (True as default, skipped if install_folder is provided). + product_name: Product/installation identifier used to resolve installation paths for config loading in install_folder is not provided ("dqx" as default). """ - installation = self.get_installation(assume_user, product_name) + installation = self.get_installation(assume_user, product_name, install_folder) return self._load_run_config(installation, run_config_name) - def get_installation(self, assume_user: bool, product_name: str) -> Installation: + def get_installation(self, assume_user: bool, product_name: str, install_folder: str | None = None) -> Installation: """ Get the installation for the given product name. Args: assume_user: if True, assume user installation product_name: name of the product + install_folder: optional installation folder """ + + if install_folder: + installation = self.get_custom_installation(self.ws, product_name, install_folder) + return installation + if assume_user: installation = Installation.assume_user_home(self.ws, product_name) else: installation = Installation.assume_global(self.ws, product_name) - # verify the installation installation.current(self.ws, product_name, assume_user=assume_user) return installation + @staticmethod + def get_custom_installation(ws: WorkspaceClient, product_name: str, install_folder: str) -> Installation: + """ + Creates an Installation instance for a custom installation folder, similar to assume_user_home and assume_global. + This ensures the custom folder is created in the workspace when the installation is accessed. + + Args: + ws: Databricks SDK `WorkspaceClient` + product_name: The product name + install_folder: The custom installation folder path + + Returns: + An Installation instance for the custom folder + """ + try: + ws.workspace.get_status(install_folder) + except NotFound: + ws.workspace.mkdirs(install_folder) + + return Installation(ws, product_name, install_folder=install_folder) + @staticmethod def _load_run_config(installation: Installation, run_config_name: str | None) -> RunConfig: """ diff --git a/src/databricks/labs/dqx/engine.py b/src/databricks/labs/dqx/engine.py index 3bb14b34..9f1525a0 100644 --- a/src/databricks/labs/dqx/engine.py +++ b/src/databricks/labs/dqx/engine.py @@ -608,6 +608,7 @@ def save_results_in_table( run_config_name: str | None = "default", product_name: str = "dqx", assume_user: bool = True, + install_folder: str | None = None, ): """Persist result DataFrames using explicit configs or the named run configuration. @@ -620,20 +621,31 @@ def save_results_in_table( output_df: DataFrame with valid rows to be saved (optional). quarantine_df: DataFrame with invalid rows to be saved (optional). output_config: Configuration describing where/how to write the valid rows. If omitted, falls back to the run config. - quarantine_config: Configuration describing where/how to write the invalid rows. If omitted, falls back to the run config. + quarantine_config: Configuration describing where/how to write the invalid rows (optional). If omitted, falls back to the run config. run_config_name: Name of the run configuration to load when a config parameter is omitted. - product_name: Product/installation identifier used to resolve installation paths for config loading. - assume_user: Whether to assume a per-user installation when loading the run configuration. 
+ product_name: Product/installation identifier used to resolve installation paths for config loading in install_folder is not provided ("dqx" as default). + assume_user: Whether to assume a per-user installation when loading the run configuration (True as default, skipped if install_folder is provided). + install_folder: Custom workspace installation folder. Required if DQX is installed in a custom folder. Returns: None """ if output_df is not None and output_config is None: - run_config = self._run_config_loader.load_run_config(run_config_name, assume_user, product_name) + run_config = self._run_config_loader.load_run_config( + run_config_name=run_config_name, + assume_user=assume_user, + product_name=product_name, + install_folder=install_folder, + ) output_config = run_config.output_config if quarantine_df is not None and quarantine_config is None: - run_config = self._run_config_loader.load_run_config(run_config_name, assume_user, product_name) + run_config = self._run_config_loader.load_run_config( + run_config_name=run_config_name, + assume_user=assume_user, + product_name=product_name, + install_folder=install_folder, + ) quarantine_config = run_config.quarantine_config if output_df is not None and output_config is not None: diff --git a/src/databricks/labs/dqx/geo/__init__.py b/src/databricks/labs/dqx/geo/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py new file mode 100644 index 00000000..720794c3 --- /dev/null +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -0,0 +1,345 @@ +from pyspark.sql import Column +import pyspark.sql.functions as F +from databricks.labs.dqx.rule import register_rule +from databricks.labs.dqx.check_funcs import make_condition, _get_normalized_column_and_expr + +POINT_TYPE = "ST_Point" +LINESTRING_TYPE = "ST_LineString" +POLYGON_TYPE = "ST_Polygon" +MULTIPOINT_TYPE = "ST_MultiPoint" +MULTILINESTRING_TYPE = "ST_MultiLineString" +MULTIPOLYGON_TYPE = "ST_MultiPolygon" +GEOMETRYCOLLECTION_TYPE = "ST_GeometryCollection" + + +@register_rule("row") +def is_geometry(column: str | Column) -> Column: + """Checks whether the values in the input column are valid geometries. + + Args: + column: column to check; can be a string column name or a column expression + + Returns: + Column object indicating whether the values in the input column are valid geometries + + Note: + This function requires Databricks runtime 17.1 or above. + """ + col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) + # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in + # Databricks SQL, due to the use of the `try_to_geometry` function. + # TODO: `pyspark.sql.functions.try_to_geometry` is not (yet) available. Replace with + # `pyspark.sql.functions.try_to_geometry` when available in OSS PySpark. + geometry_col = F.expr(f"try_to_geometry({col_str_norm})") + condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geometry_col.isNull()) + condition_str = f"` in column `{col_expr_str}` is not a geometry" + + return make_condition( + condition, + F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), + f"{col_str_norm}_is_not_a_geometry", + ) + + +@register_rule("row") +def is_geography(column: str | Column) -> Column: + """Checks whether the values in the input column are valid geographies. 
+ + Args: + column: column to check; can be a string column name or a column expression + + Returns: + Column object indicating whether the values in the input column are valid geographies + + Note: + This function requires Databricks runtime 17.1 or above. + """ + col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) + # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in + # Databricks SQL, due to the use of the `try_to_geography` function. + # TODO: `pyspark.sql.functions.try_to_geography` is not (yet) available. Replace with + # `pyspark.sql.functions.try_to_geography` when available in OSS PySpark. + geometry_col = F.expr(f"try_to_geography({col_str_norm})") + condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geometry_col.isNull()) + condition_str = f"` in column `{col_expr_str}` is not a geography" + + return make_condition( + condition, + F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), + f"{col_str_norm}_is_not_a_geography", + ) + + +@register_rule("row") +def is_point(column: str | Column) -> Column: + """Checks whether the values in the input column are point geometries. + + Args: + column: column to check; can be a string column name or a column expression + + Returns: + Column object indicating whether the values in the input column are point geometries + + Note: + This function requires Databricks runtime 17.1 or above. + """ + col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) + # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in + # Databricks SQL, due to the use of the `try_to_geometry` and `st_geometrytype` functions. + # TODO: Above mentioned functions are not (yet) available. Replace with equivalent functions + # when available in OSS PySpark. + geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") + geom_type_cond = F.expr(f"st_geometrytype(try_to_geometry({col_str_norm})) <> '{POINT_TYPE}'") + condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond) + condition_str = f"` in column `{col_expr_str}` is not a point geometry" + return make_condition( + condition, + F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), + f"{col_str_norm}_is_not_a_point", + ) + + +@register_rule("row") +def is_linestring(column: str | Column) -> Column: + """Checks whether the values in the input column are linestring geometries. + + Args: + column: column to check; can be a string column name or a column expression + + Returns: + Column object indicating whether the values in the input column are linestring geometries + + Note: + This function requires Databricks runtime 17.1 or above. + """ + col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) + # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in + # Databricks SQL, due to the use of the `try_to_geometry` and `st_geometrytype` functions. + # TODO: Above mentioned functions are not (yet) available. Replace with equivalent functions + # when available in OSS PySpark. 
+ geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") + geom_type_cond = F.expr(f"st_geometrytype(try_to_geometry({col_str_norm})) <> '{LINESTRING_TYPE}'") + condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond) + condition_str = f"` in column `{col_expr_str}` is not a linestring geometry" + return make_condition( + condition, + F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), + f"{col_str_norm}_is_not_a_linestring", + ) + + +@register_rule("row") +def is_polygon(column: str | Column) -> Column: + """Checks whether the values in the input column are polygon geometries. + + Args: + column: column to check; can be a string column name or a column expression + + Returns: + Column object indicating whether the values in the input column are polygon geometries + + Note: + This function requires Databricks runtime 17.1 or above. + """ + col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) + # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in + # Databricks SQL, due to the use of the `try_to_geometry` and `st_geometrytype` functions. + # TODO: Above mentioned functions are not (yet) available. Replace with equivalent functions + # when available in OSS PySpark. + geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") + geom_type_cond = F.expr(f"st_geometrytype(try_to_geometry({col_str_norm})) <> '{POLYGON_TYPE}'") + condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond) + condition_str = f"` in column `{col_expr_str}` is not a polygon geometry" + return make_condition( + condition, + F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), + f"{col_str_norm}_is_not_a_polygon", + ) + + +@register_rule("row") +def is_multipoint(column: str | Column) -> Column: + """Checks whether the values in the input column are multipoint geometries. + + Args: + column: column to check; can be a string column name or a column expression + + Returns: + Column object indicating whether the values in the input column are multipoint geometries + + Note: + This function requires Databricks runtime 17.1 or above. + """ + col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) + # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in + # Databricks SQL, due to the use of the `try_to_geometry` and `st_geometrytype` functions. + # TODO: Above mentioned functions are not (yet) available. Replace with equivalent functions + # when available in OSS PySpark. + geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") + geom_type_cond = F.expr(f"st_geometrytype(try_to_geometry({col_str_norm})) <> '{MULTIPOINT_TYPE}'") + condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond) + condition_str = f"` in column `{col_expr_str}` is not a multipoint geometry" + return make_condition( + condition, + F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), + f"{col_str_norm}_is_not_a_multipoint", + ) + + +@register_rule("row") +def is_multilinestring(column: str | Column) -> Column: + """Checks whether the values in the input column are multilinestring geometries. 
+ + Args: + column: column to check; can be a string column name or a column expression + + Returns: + Column object indicating whether the values in the input column are multilinestring geometries + + Note: + This function requires Databricks runtime 17.1 or above. + """ + col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) + # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in + # Databricks SQL, due to the use of the `try_to_geometry` and `st_geometrytype` functions. + # TODO: Above mentioned functions are not (yet) available. Replace with equivalent functions + # when available in OSS PySpark. + geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") + geom_type_cond = F.expr(f"st_geometrytype(try_to_geometry({col_str_norm})) <> '{MULTILINESTRING_TYPE}'") + condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond) + condition_str = f"` in column `{col_expr_str}` is not a multilinestring geometry" + return make_condition( + condition, + F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), + f"{col_str_norm}_is_not_a_multilinestring", + ) + + +@register_rule("row") +def is_multipolygon(column: str | Column) -> Column: + """Checks whether the values in the input column are multipolygon geometries. + + Args: + column: column to check; can be a string column name or a column expression + + Returns: + Column object indicating whether the values in the input column are multipolygon geometries + + Note: + This function requires Databricks runtime 17.1 or above. + """ + col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) + # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in + # Databricks SQL, due to the use of the `try_to_geometry` and `st_geometrytype` functions. + # TODO: Above mentioned functions are not (yet) available. Replace with equivalent functions + # when available in OSS PySpark. + geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") + geom_type_cond = F.expr(f"st_geometrytype(try_to_geometry({col_str_norm})) <> '{MULTIPOLYGON_TYPE}'") + condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond) + condition_str = f"` in column `{col_expr_str}` is not a multipolygon geometry" + return make_condition( + condition, + F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), + f"{col_str_norm}_is_not_a_multipolygon", + ) + + +@register_rule("row") +def is_geometrycollection(column: str | Column) -> Column: + """Checks whether the values in the input column are geometrycollection geometries. + + Args: + column: column to check; can be a string column name or a column expression + + Returns: + Column object indicating whether the values in the input column are geometrycollection geometries + + Note: + This function requires Databricks runtime 17.1 or above. + """ + col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) + # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in + # Databricks SQL, due to the use of the `try_to_geometry` and `st_geometrytype` functions. + # TODO: Above mentioned functions are not (yet) available. Replace with equivalent functions + # when available in OSS PySpark. 
+ geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") + geom_type_cond = F.expr(f"st_geometrytype(try_to_geometry({col_str_norm})) <> '{GEOMETRYCOLLECTION_TYPE}'") + condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond) + condition_str = f"` in column `{col_expr_str}` is not a geometrycollection geometry" + return make_condition( + condition, + F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), + f"{col_str_norm}_is_not_a_geometrycollection", + ) + + +@register_rule("row") +def is_ogc_valid(column: str | Column) -> Column: + """Checks whether the values in the input column are valid geometries in the OGC sense. + + Args: + column: column to check; can be a string column name or a column expression + + Returns: + Column object indicating whether the values in the input column are valid geometries + + Note: + This function requires Databricks runtime 17.1 or above. + """ + col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) + # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in + # Databricks SQL, due to the use of the `try_to_geometry` and `st_isvalid` functions. + # TODO: Above mentioned functions are not (yet) available. Replace with equivalent functions + # when available in OSS PySpark. + geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") + geom_type_cond = F.expr(f"NOT st_isvalid(try_to_geometry({col_str_norm}))") + condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond) + condition_str = f"` in column `{col_expr_str}` is not a valid geometry (in the OGC sense)" + + return make_condition( + condition, + F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), + f"{col_str_norm}_is_not_a_valid_geometry", + ) + + +@register_rule("row") +def is_latitude(column: str | Column) -> Column: + """Checks whether the values in the input column are valid latitudes. + + Args: + column: column to check; can be a string column name or a column expression + + Returns: + Column object indicating whether the values in the input column are valid latitudes + """ + col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) + condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(F.col(col_str_norm).between(-90.0, 90.0)) + condition_str = f"' in Column '{col_expr_str}' is not a valid latitude must be between -90 and 90" + + return make_condition( + condition, + F.concat_ws("", F.lit("Value '"), col_expr.cast("string"), F.lit(condition_str)), + f"{col_str_norm}_is_not_valid_latitude", + ) + + +@register_rule("row") +def is_longitude(column: str | Column) -> Column: + """Checks whether the values in the input column are valid longitudes. 
+ + Args: + column: column to check; can be a string column name or a column expression + + Returns: + Column object indicating whether the values in the input column are valid longitudes + """ + col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) + condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(F.col(col_str_norm).between(-180.0, 180.0)) + condition_str = f"' in Column '{col_expr_str}' is not a valid longitude (must be between -180 and 180)" + + return make_condition( + condition, + F.concat_ws("", F.lit("Value '"), col_expr.cast("string"), F.lit(condition_str)), + f"{col_str_norm}_is_not_valid_longitude", + ) diff --git a/src/databricks/labs/dqx/installer/config_provider.py b/src/databricks/labs/dqx/installer/config_provider.py index 9be82f6e..e5540552 100644 --- a/src/databricks/labs/dqx/installer/config_provider.py +++ b/src/databricks/labs/dqx/installer/config_provider.py @@ -1,3 +1,4 @@ +import os import json import logging from databricks.labs.blueprint.tui import Prompts @@ -5,24 +6,31 @@ from databricks.labs.dqx.config import WorkspaceConfig, RunConfig, InputConfig, OutputConfig, ProfilerConfig -logger = logging.getLogger(__name__) - - class ConfigProvider: """ Collects configuration from the user interactively. """ - def __init__(self, prompts: Prompts, warehouse_configurator: WarehouseInstaller): + def __init__(self, prompts: Prompts, warehouse_configurator: WarehouseInstaller, logger: logging.Logger): self._prompts = prompts self._warehouse_configurator = warehouse_configurator + self.logger = logger - def prompt_new_installation(self) -> WorkspaceConfig: - logger.info( + def prompt_new_installation(self, install_folder: str | None = None) -> WorkspaceConfig: + self.logger.info( "Please answer a couple of questions to provide default DQX run configuration. " "The configuration can also be updated manually after the installation." 
) + # Show installation folder information + if install_folder: + self.logger.info(f"DQX will be installed in folder '{install_folder}'") + else: + install_path = ( + "/Applications/dqx" if os.getenv("DQX_FORCE_INSTALL") == "global" else "/Users//.dqx" + ) + self.logger.info(f"DQX will be installed in the default location: '{install_path}'") + log_level = self._prompts.question("Log level", default="INFO").upper() is_streaming = self._prompts.confirm("Should the input data be read using streaming?") input_config = self._prompt_input_config(is_streaming) diff --git a/src/databricks/labs/dqx/installer/install.py b/src/databricks/labs/dqx/installer/install.py index a0d8d3b4..fb4a2239 100644 --- a/src/databricks/labs/dqx/installer/install.py +++ b/src/databricks/labs/dqx/installer/install.py @@ -13,7 +13,7 @@ from databricks.labs.blueprint.parallel import ManyError, Threads from databricks.labs.blueprint.tui import Prompts from databricks.labs.blueprint.upgrades import Upgrades -from databricks.labs.blueprint.wheels import ProductInfo +from databricks.labs.blueprint.wheels import ProductInfo, WheelsV2 from databricks.sdk import WorkspaceClient from databricks.sdk.core import with_user_agent_extra from databricks.sdk.errors import ( @@ -22,6 +22,7 @@ PermissionDenied, ) +from databricks.labs.dqx.config_loader import RunConfigLoader from databricks.labs.dqx.installer.config_provider import ConfigProvider from databricks.labs.dqx.installer.dashboard_installer import DashboardInstaller from databricks.labs.dqx.installer.version_checker import VersionChecker @@ -45,14 +46,16 @@ class WorkspaceInstaller(WorkspaceContext): Args: environ: Optional dictionary of environment variables. ws: The WorkspaceClient instance. + install_folder: Optional custom workspace folder path for installation. """ - def __init__(self, ws: WorkspaceClient, environ: dict[str, str] | None = None): + def __init__(self, ws: WorkspaceClient, environ: dict[str, str] | None = None, install_folder: str | None = None): super().__init__(ws) if not environ: environ = dict(os.environ.items()) self._force_install = environ.get("DQX_FORCE_INSTALL") + self._install_folder = install_folder if "DATABRICKS_RUNTIME_VERSION" in environ: msg = "WorkspaceInstaller is not supposed to be executed in Databricks Runtime" @@ -79,6 +82,11 @@ def installation(self): Raises: NotFound: If the installation is not found. 
""" + if self._install_folder: + return RunConfigLoader.get_custom_installation( + self.workspace_client, self.product_info.product_name(), self._install_folder + ) + try: return self.product_info.current_installation(self.workspace_client) except NotFound: @@ -203,8 +211,8 @@ def _is_testing(self): def _prompt_for_new_installation(self) -> WorkspaceConfig: configurator = WarehouseInstaller(self.workspace_client, self.prompts) - prompter = ConfigProvider(self.prompts, configurator) - return prompter.prompt_new_installation() + prompter = ConfigProvider(self.prompts, configurator, logger) + return prompter.prompt_new_installation(self._install_folder) def _confirm_force_install(self) -> bool: if not self._force_install: @@ -263,7 +271,7 @@ def __init__( self._ws = ws self._prompts = prompts self._product_info = product_info - self._wheels = product_info.wheels(ws) + self._wheels = WheelsV2(self._installation, product_info) @classmethod def current(cls, ws: WorkspaceClient): @@ -282,7 +290,7 @@ def current(cls, ws: WorkspaceClient): config = installation.load(WorkspaceConfig) run_config_name = config.get_run_config().name prompts = Prompts() - wheels = product_info.wheels(ws) + wheels = WheelsV2(installation, product_info) tasks = WorkflowsRunner.all(config).tasks() workflow_installer = WorkflowDeployment( config, run_config_name, installation, install_state, ws, wheels, product_info, tasks @@ -366,5 +374,16 @@ def _create_dashboard(self) -> None: if is_in_debug(): logging.getLogger("databricks").setLevel(logging.DEBUG) - workspace_installer = WorkspaceInstaller(WorkspaceClient(product="dqx", product_version=__version__)) + installer_prompts = Prompts() + custom_folder = installer_prompts.question( + "Enter a workspace path for DQX installation (leave empty to install in user's home or global directory)", + default="empty", + valid_regex=r"^(/.*)?$", + ).strip() + + custom_install_folder = custom_folder if custom_folder and custom_folder != "empty" else None + + workspace_installer = WorkspaceInstaller( + WorkspaceClient(product="dqx", product_version=__version__), install_folder=custom_install_folder + ) workspace_installer.run() diff --git a/src/databricks/labs/dqx/profiler/profiler_workflow.py b/src/databricks/labs/dqx/profiler/profiler_workflow.py index 6c178e1a..d52d3cbd 100644 --- a/src/databricks/labs/dqx/profiler/profiler_workflow.py +++ b/src/databricks/labs/dqx/profiler/profiler_workflow.py @@ -35,6 +35,7 @@ def profile(self, ctx: WorkflowContext): run_config_name=run_config.name, assume_user=True, product_name=ctx.installation.product(), + install_folder=ctx.installation.install_folder(), ) ctx.profiler.save(checks, profile_summary_stats, storage_config, run_config.profiler_config.summary_stats_file) diff --git a/src/databricks/labs/dqx/quality_checker/quality_checker_workflow.py b/src/databricks/labs/dqx/quality_checker/quality_checker_workflow.py index fe51803a..a652d4b9 100644 --- a/src/databricks/labs/dqx/quality_checker/quality_checker_workflow.py +++ b/src/databricks/labs/dqx/quality_checker/quality_checker_workflow.py @@ -34,6 +34,7 @@ def apply_checks(self, ctx: WorkflowContext): location=run_config.checks_location, run_config_name=run_config.name, product_name=ctx.product_info.product_name(), + install_folder=ctx.installation.install_folder(), ) ) diff --git a/src/databricks/labs/dqx/workflows_runner.py b/src/databricks/labs/dqx/workflows_runner.py index 2d7add0d..bfaea26e 100644 --- a/src/databricks/labs/dqx/workflows_runner.py +++ 
b/src/databricks/labs/dqx/workflows_runner.py @@ -36,10 +36,11 @@ def __init__(self, workflows: list[Workflow]): self._tasks.append(with_workflow) @classmethod - def all(cls, config: WorkspaceConfig): + def all(cls, config: WorkspaceConfig) -> "WorkflowsRunner": """Return all workflows.""" profiler = ProfilerWorkflow( - spark_conf=config.profiler_spark_conf, override_clusters=config.profiler_override_clusters + spark_conf=config.profiler_spark_conf, + override_clusters=config.profiler_override_clusters, ) quality_checker = DataQualityWorkflow( spark_conf=config.quality_checker_spark_conf, diff --git a/tests/conftest.py b/tests/conftest.py index 7070e709..5766c084 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,7 +6,7 @@ import pytest from databricks.labs.blueprint.installation import Installation, MockInstallation from databricks.labs.blueprint.tui import MockPrompts -from databricks.labs.blueprint.wheels import ProductInfo +from databricks.labs.blueprint.wheels import ProductInfo, WheelsV2 from databricks.labs.dqx.__about__ import __version__ from databricks.labs.dqx.config import WorkspaceConfig, RunConfig from databricks.labs.dqx.contexts.workflow_context import WorkflowContext @@ -78,14 +78,16 @@ def __init__( ws: WorkspaceClient, checks_location, serverless_clusters: bool = True, + install_folder: str | None = None, ): super().__init__(env_or_skip_fixture, ws) self.checks_location = checks_location self.serverless_clusters = serverless_clusters + self.install_folder = install_folder @cached_property def installation(self): - return Installation(self.workspace_client, self.product_info.product_name()) + return Installation(self.workspace_client, self.product_info.product_name(), install_folder=self.install_folder) @cached_property def environ(self) -> dict[str, str]: @@ -96,6 +98,7 @@ def workspace_installer(self): return WorkspaceInstaller( self.workspace_client, self.environ, + self.install_folder, ).replace(prompts=self.prompts, installation=self.installation, product_info=self.product_info) @cached_property @@ -130,7 +133,7 @@ def workflows_deployment(self) -> WorkflowDeployment: self.installation, self.install_state, self.workspace_client, - self.product_info.wheels(self.workspace_client), + WheelsV2(self.installation, self.product_info), self.product_info, self.tasks, ) @@ -188,6 +191,18 @@ def serverless_installation_ctx( ctx.installation_service.uninstall() +@pytest.fixture +def installation_ctx_custom_install_folder( + ws: WorkspaceClient, make_directory, env_or_skip: Callable[[str], str], checks_location="checks.yml" +) -> Generator[MockInstallationContext, None, None]: + custom_folder = str(make_directory().absolute()) + ctx = MockInstallationContext( + env_or_skip, ws, checks_location, serverless_clusters=False, install_folder=custom_folder + ) + yield ctx.replace(workspace_client=ws) + ctx.installation_service.uninstall() + + @pytest.fixture def checks_yaml_content(): return """- criticality: error diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index d833aa19..4e36ec60 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -91,6 +91,38 @@ def delete(resource) -> None: yield from factory("workflows", lambda **kw: create(spark, **kw), delete) +@pytest.fixture +def setup_workflows_with_custom_folder( + ws, spark, installation_ctx_custom_install_folder, make_schema, make_table, make_random +): + """ + Set up the workflows with installation in the custom install folder. 
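+    Each call to the yielded factory runs the installation into the custom folder and returns the
+    installation context together with the generated run config.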
+ """ + + def create(_spark, **kwargs): + installation_ctx_custom_install_folder.installation_service.run() + + quarantine = False + if "quarantine" in kwargs and kwargs["quarantine"]: + quarantine = True + + checks_location = None + if "checks" in kwargs and kwargs["checks"]: + checks_location = _setup_quality_checks(installation_ctx_custom_install_folder, _spark, ws) + + run_config = _setup_workflows_deps( + installation_ctx_custom_install_folder, make_schema, make_table, make_random, checks_location, quarantine + ) + return installation_ctx_custom_install_folder, run_config + + def delete(resource) -> None: + ctx, run_config = resource + checks_location = f"{ctx.installation.install_folder()}/{run_config.checks_location}" + ws.workspace.delete(checks_location) + + yield from factory("workflows", lambda **kw: create(spark, **kw), delete) + + def _setup_workflows_deps( ctx, make_schema, @@ -228,6 +260,7 @@ def _setup_quality_checks(ctx, spark, ws): config = InstallationChecksStorageConfig( location=checks_location, product_name=ctx.installation.product(), + install_folder=ctx.installation.install_folder(), ) InstallationChecksStorageHandler(ws, spark).save(checks=checks, config=config) diff --git a/tests/integration/test_config.py b/tests/integration/test_config.py index 9d5daffa..b336e0ab 100644 --- a/tests/integration/test_config.py +++ b/tests/integration/test_config.py @@ -1,10 +1,13 @@ from unittest.mock import patch from databricks.labs.blueprint.installation import Installation +from databricks.labs.blueprint.wheels import ProductInfo + +from databricks.labs.dqx.config import WorkspaceConfig from databricks.labs.dqx.config_loader import RunConfigLoader -def test_load_run_config_from_user_installation(ws, installation_ctx, spark): +def test_load_run_config_from_user_installation(ws, installation_ctx): installation_ctx.installation.save(installation_ctx.config) product_name = installation_ctx.product_info.product_name() @@ -29,3 +32,29 @@ def test_load_run_config_from_global_installation(ws, installation_ctx): ) assert run_config == expected_run_config + + +def test_load_run_config_from_custom_folder_installation(ws, installation_ctx_custom_install_folder): + installation_ctx_custom_install_folder.installation.save(installation_ctx_custom_install_folder.config) + product_name = installation_ctx_custom_install_folder.product_info.product_name() + + run_config = RunConfigLoader(ws).load_run_config( + run_config_name="default", + assume_user=True, + product_name=product_name, + install_folder=installation_ctx_custom_install_folder.install_folder, + ) + expected_run_config = installation_ctx_custom_install_folder.config.get_run_config("default") + + assert run_config == expected_run_config + + +def test_get_custom_installation(ws, make_directory): + product_info = ProductInfo.for_testing(WorkspaceConfig) + custom_folder = str(make_directory().absolute()) + + custom_installation = RunConfigLoader.get_custom_installation(ws, product_info.product_name(), custom_folder) + custom_installation.install_folder() + + assert custom_installation.install_folder() == custom_folder + assert ws.workspace.get_status(custom_folder) diff --git a/tests/integration/test_e2e_workflow.py b/tests/integration/test_e2e_workflow.py index 4bfb4f70..29dd4498 100644 --- a/tests/integration/test_e2e_workflow.py +++ b/tests/integration/test_e2e_workflow.py @@ -40,3 +40,26 @@ def test_e2e_workflow_serverless(ws, spark, setup_serverless_workflows, expected quarantine_df = 
spark.table(run_config.quarantine_config.location) assert quarantine_df.count() > 0, "Output table is empty" + + +def test_e2e_workflow_with_custom_install_folder( + ws, spark, setup_workflows_with_custom_folder, expected_quality_checking_output +): + installation_ctx, run_config = setup_workflows_with_custom_folder() + + installation_ctx.deployed_workflows.run_workflow("e2e", run_config.name) + + config = InstallationChecksStorageConfig( + run_config_name=run_config.name, + assume_user=True, + product_name=installation_ctx.installation.product(), + install_folder=installation_ctx.installation.install_folder(), + ) + checks = DQEngine(ws, spark).load_checks(config=config) + assert checks, "Checks were not loaded correctly" + + checked_df = spark.table(run_config.output_config.location) + input_df = spark.table(run_config.input_config.location) + + # this is sanity check only, we cannot predict the exact output as it depends on the generated rules + assert checked_df.count() == input_df.count(), "Output table is empty" diff --git a/tests/integration/test_installation.py b/tests/integration/test_installation.py index 8a767091..b88508a4 100644 --- a/tests/integration/test_installation.py +++ b/tests/integration/test_installation.py @@ -3,6 +3,7 @@ from unittest.mock import patch, create_autospec import pytest +from databricks.labs.dqx.config_loader import RunConfigLoader from databricks.labs.dqx.installer.install import WorkspaceInstaller from tests.integration.conftest import contains_expected_workflows import databricks @@ -33,6 +34,7 @@ def factory( product_info: ProductInfo | None = None, environ: dict[str, str] | None = None, extend_prompts: dict[str, str] | None = None, + install_folder: str | None = None, ): logger.debug("Creating new installation...") if not product_info: @@ -53,13 +55,14 @@ def factory( if not installation: installation = Installation(ws, product_info.product_name()) - installer = WorkspaceInstaller(ws, environ).replace( + installer = WorkspaceInstaller(ws, environ, install_folder=install_folder).replace( installation=installation, product_info=product_info, prompts=prompts, ) workspace_config = installer.configure() - installation = product_info.current_installation(ws) + if not install_folder: + installation = product_info.current_installation(ws) installation.save(workspace_config) cleanup.append(installation) return installation @@ -90,12 +93,26 @@ def test_fresh_user_config_installation(ws, installation_ctx): ) +def test_fresh_custom_folder_config_installation(ws, installation_ctx_custom_install_folder): + installation_ctx_custom_install_folder.installation.save(installation_ctx_custom_install_folder.config) + assert ( + installation_ctx_custom_install_folder.installation_service.install_folder + != f"/Users/{ws.current_user.me().user_name}/.{installation_ctx_custom_install_folder.product_info.product_name()}" + ) + + def test_complete_installation(ws, installation_ctx): installation_ctx.workspace_installer.run(installation_ctx.config) assert installation_ctx.workspace_installer.installation assert installation_ctx.deployed_workflows.latest_job_status() +def test_complete_installation_with_custom_folder(ws, installation_ctx_custom_install_folder): + installation_ctx_custom_install_folder.workspace_installer.run(installation_ctx_custom_install_folder.config) + assert installation_ctx_custom_install_folder.workspace_installer.installation + assert installation_ctx_custom_install_folder.deployed_workflows.latest_job_status() + + def test_installation(ws, 
installation_ctx): installation_ctx.installation_service.run() workflows = installation_ctx.deployed_workflows.latest_job_status() @@ -106,6 +123,16 @@ def test_installation(ws, installation_ctx): assert contains_expected_workflows(workflows, state) +def test_installation_with_custom_folder(ws, installation_ctx_custom_install_folder): + installation_ctx_custom_install_folder.installation_service.run() + workflows = installation_ctx_custom_install_folder.deployed_workflows.latest_job_status() + expected_workflows_state = [{'workflow': 'profiler', 'state': 'UNKNOWN', 'started': ''}] + + assert ws.workspace.get_status(installation_ctx_custom_install_folder.installation_service.install_folder) + for state in expected_workflows_state: + assert contains_expected_workflows(workflows, state) + + def test_dashboard_state_installation(ws, installation_ctx): installation_ctx.installation_service.run() dashboard_id = list(installation_ctx.install_state.dashboards.values())[0] @@ -113,6 +140,13 @@ def test_dashboard_state_installation(ws, installation_ctx): assert dashboard_id is not None +def test_dashboard_state_installation_with_custom_folder(ws, installation_ctx_custom_install_folder): + installation_ctx_custom_install_folder.installation_service.run() + dashboard_id = list(installation_ctx_custom_install_folder.install_state.dashboards.values())[0] + + assert dashboard_id is not None + + def test_dashboard_workspace_installation(ws, installation_ctx): installation_ctx.installation_service.run() dashboard_id = list(installation_ctx.install_state.dashboards.values())[0] @@ -121,6 +155,14 @@ def test_dashboard_workspace_installation(ws, installation_ctx): assert dashboard.lifecycle_state == LifecycleState.ACTIVE +def test_dashboard_workspace_installation_with_custom_folder(ws, installation_ctx_custom_install_folder): + installation_ctx_custom_install_folder.installation_service.run() + dashboard_id = list(installation_ctx_custom_install_folder.install_state.dashboards.values())[0] + dashboard = ws.lakeview.get(dashboard_id) + + assert dashboard.lifecycle_state == LifecycleState.ACTIVE + + def test_dashboard_repeated_workspace_installation(ws, installation_ctx): installation_ctx.installation_service.run() installation_ctx.installation_service.run() @@ -130,6 +172,15 @@ def test_dashboard_repeated_workspace_installation(ws, installation_ctx): assert dashboard.lifecycle_state == LifecycleState.ACTIVE +def test_dashboard_repeated_workspace_installation_with_custom_folder(ws, installation_ctx_custom_install_folder): + installation_ctx_custom_install_folder.installation_service.run() + installation_ctx_custom_install_folder.installation_service.run() + dashboard_id = list(installation_ctx_custom_install_folder.install_state.dashboards.values())[0] + dashboard = ws.lakeview.get(dashboard_id) + + assert dashboard.lifecycle_state == LifecycleState.ACTIVE + + def test_installation_when_dashboard_is_trashed(ws, installation_ctx): """A dashboard might be trashed (manually), the upgrade should handle this.""" installation_ctx.installation_service.run() @@ -142,6 +193,18 @@ def test_installation_when_dashboard_is_trashed(ws, installation_ctx): assert True, "Installation succeeded when dashboard was trashed" +def test_installation_with_custom_folder_when_dashboard_is_trashed(ws, installation_ctx_custom_install_folder): + """A dashboard might be trashed (manually), the upgrade should handle this.""" + installation_ctx_custom_install_folder.installation_service.run() + dashboard_id = 
list(installation_ctx_custom_install_folder.install_state.dashboards.values())[0] + ws.lakeview.trash(dashboard_id) + try: + installation_ctx_custom_install_folder.installation_service.run() + except NotFound: + assert False, "Installation failed when dashboard was trashed" + assert True, "Installation succeeded when dashboard was trashed" + + def test_installation_when_dashboard_state_missing(ws, installation_ctx): installation_ctx.installation_service.run() state_file = installation_ctx.install_state.install_folder() + "/" + RawState.__file__ @@ -153,6 +216,17 @@ def test_installation_when_dashboard_state_missing(ws, installation_ctx): assert dashboard.lifecycle_state == LifecycleState.ACTIVE +def test_installation_with_custom_folder_when_dashboard_state_missing(ws, installation_ctx_custom_install_folder): + installation_ctx_custom_install_folder.installation_service.run() + state_file = installation_ctx_custom_install_folder.install_state.install_folder() + "/" + RawState.__file__ + ws.workspace.delete(state_file) + installation_ctx_custom_install_folder.installation_service.run() # check that dashboard can be overwritten + dashboard_id = list(installation_ctx_custom_install_folder.install_state.dashboards.values())[0] + dashboard = ws.lakeview.get(dashboard_id) + + assert dashboard.lifecycle_state == LifecycleState.ACTIVE + + def test_uninstallation(ws, installation_ctx): installation_ctx.installation_service.run() job_id = list(installation_ctx.install_state.jobs.values())[0] @@ -166,6 +240,19 @@ def test_uninstallation(ws, installation_ctx): ws.dashboards.get(dashboard_id) +def test_uninstallation_with_custom_folder(ws, installation_ctx_custom_install_folder): + installation_ctx_custom_install_folder.installation_service.run() + job_id = list(installation_ctx_custom_install_folder.install_state.jobs.values())[0] + dashboard_id = list(installation_ctx_custom_install_folder.install_state.dashboards.values())[0] + installation_ctx_custom_install_folder.installation_service.uninstall() + with pytest.raises(NotFound): + ws.workspace.get_status(installation_ctx_custom_install_folder.installation_service.install_folder) + with pytest.raises(NotFound): + ws.jobs.get(job_id) + with pytest.raises(NotFound): + ws.dashboards.get(dashboard_id) + + def test_uninstallation_dashboard_does_not_exist_anymore(ws, installation_ctx): installation_ctx.installation_service.run() dashboard_id = list(installation_ctx.install_state.dashboards.values())[0] @@ -173,6 +260,13 @@ def test_uninstallation_dashboard_does_not_exist_anymore(ws, installation_ctx): installation_ctx.installation_service.uninstall() +def test_uninstallation_with_custom_folder_dashboard_does_not_exist_anymore(ws, installation_ctx_custom_install_folder): + installation_ctx_custom_install_folder.installation_service.run() + dashboard_id = list(installation_ctx_custom_install_folder.install_state.dashboards.values())[0] + ws.lakeview.trash(dashboard_id) + installation_ctx_custom_install_folder.installation_service.uninstall() + + def test_uninstallation_job_does_not_exist_anymore(ws, installation_ctx): installation_ctx.installation_service.run() job_id = list(installation_ctx.install_state.jobs.values())[0] @@ -180,6 +274,13 @@ def test_uninstallation_job_does_not_exist_anymore(ws, installation_ctx): installation_ctx.installation_service.uninstall() +def test_uninstallation_with_custom_folder_job_does_not_exist_anymore(ws, installation_ctx_custom_install_folder): + installation_ctx_custom_install_folder.installation_service.run() + job_id = 
list(installation_ctx_custom_install_folder.install_state.jobs.values())[0] + ws.jobs.delete(job_id) + installation_ctx_custom_install_folder.installation_service.uninstall() + + def test_global_installation_on_existing_global_install(ws, installation_ctx): product_name = installation_ctx.product_info.product_name() # patch the global installation to existing install_folder to avoid access permission issues in the workspace @@ -290,6 +391,61 @@ def test_global_installation_on_existing_user_install(ws, new_installation): ) +def test_custom_folder_installation_on_existing_user_installation(ws, make_directory, new_installation): + product_info = ProductInfo.for_testing(WorkspaceConfig) + + existing_user_installation = new_installation( + product_info=product_info, installation=Installation.assume_user_home(ws, product_info.product_name()) + ) + assert ( + existing_user_installation.install_folder() + == f"/Users/{ws.current_user.me().user_name}/.{product_info.product_name()}" + ) + + custom_folder = str(make_directory().absolute()) + custom_installation = RunConfigLoader.get_custom_installation(ws, product_info.product_name(), custom_folder) + installation = new_installation( + product_info=product_info, + installation=custom_installation, + install_folder=custom_folder, + ) + + assert installation.install_folder() == custom_folder + assert ws.workspace.get_status(custom_folder) + + +def test_custom_folder_installation_upgrade(ws, installation_ctx_custom_install_folder, new_installation): + product_info = ProductInfo.for_testing(WorkspaceConfig) + installation_ctx_custom_install_folder.installation.save(installation_ctx_custom_install_folder.config) + + custom_folder = installation_ctx_custom_install_folder.installation_service.install_folder + custom_installation = RunConfigLoader.get_custom_installation(ws, product_info.product_name(), custom_folder) + second_installation = new_installation( + product_info=product_info, + installation=custom_installation, + install_folder=custom_folder, + ) + + assert second_installation.install_folder() == custom_folder + assert ws.workspace.get_status(custom_folder) + + +def test_custom_folder_installation_with_environment_variable(ws, make_directory, new_installation): + product_info = ProductInfo.for_testing(WorkspaceConfig) + custom_folder = str(make_directory().absolute()) + + custom_installation = RunConfigLoader.get_custom_installation(ws, product_info.product_name(), custom_folder) + installation = new_installation( + product_info=product_info, + installation=custom_installation, + install_folder=custom_folder, + environ={'DQX_FORCE_INSTALL': 'global'}, # environment variable should not override the install folder + ) + + assert installation.install_folder() == custom_folder + assert ws.workspace.get_status(custom_folder) + + @skip("New tag version must be created in git") def test_compare_remote_local_install_versions(ws, installation_ctx): installation_ctx.installation_service.run() @@ -320,6 +476,17 @@ def test_installation_stores_install_state_keys(ws, installation_ctx): assert getattr(install_state, key), f"Installation state is empty: {key}" +def test_installation_with_custom_folder_stores_install_state_keys(ws, installation_ctx_custom_install_folder): + """The installation should store the keys in the installation state.""" + expected_keys = ["jobs", "dashboards"] + installation_ctx_custom_install_folder.installation_service.run() + # Refresh the installation state since the installation context uses `@cached_property` + install_state = 
InstallState.from_installation(installation_ctx_custom_install_folder.installation) + for key in expected_keys: + assert hasattr(install_state, key), f"Missing key in install state: {key}" + assert getattr(install_state, key), f"Installation state is empty: {key}" + + def side_effect_remove_after_in_tags_settings(**settings) -> CreateResponse: tags = settings.get("tags", {}) _ = tags["RemoveAfter"] # KeyError side effect diff --git a/tests/integration/test_load_checks_from_workspace_file.py b/tests/integration/test_load_checks_from_workspace_file.py index 22125243..29ee069e 100644 --- a/tests/integration/test_load_checks_from_workspace_file.py +++ b/tests/integration/test_load_checks_from_workspace_file.py @@ -96,6 +96,23 @@ def test_load_checks_from_user_installation(ws, installation_ctx, make_check_fil assert checks == expected_checks, "Checks were not loaded correctly" +def test_load_checks_from_custom_folder_installation( + ws, installation_ctx_custom_install_folder, make_check_file_as_yaml, expected_checks, spark +): + installation_ctx_custom_install_folder.installation.save(installation_ctx_custom_install_folder.config) + make_check_file_as_yaml(install_dir=installation_ctx_custom_install_folder.installation.install_folder()) + + config = InstallationChecksStorageConfig( + run_config_name="default", + assume_user=True, + product_name=installation_ctx_custom_install_folder.installation.product(), + install_folder=installation_ctx_custom_install_folder.installation.install_folder(), + ) + checks = DQEngine(ws, spark).load_checks(config=config) + + assert checks == expected_checks, "Checks were not loaded correctly" + + def test_load_checks_from_absolute_path(ws, installation_ctx, make_check_file_as_yaml, expected_checks, spark): checks_location = make_check_file_as_yaml() config = installation_ctx.config diff --git a/tests/integration/test_profiler_workflow.py b/tests/integration/test_profiler_workflow.py index ffec805f..cb426bce 100644 --- a/tests/integration/test_profiler_workflow.py +++ b/tests/integration/test_profiler_workflow.py @@ -82,3 +82,24 @@ def test_profiler_workflow_serverless(ws, spark, setup_serverless_workflows): install_folder = installation_ctx.installation.install_folder() status = ws.workspace.get_status(f"{install_folder}/{run_config.profiler_config.summary_stats_file}") assert status, f"Profile summary stats file {run_config.profiler_config.summary_stats_file} does not exist." + + +def test_profiler_workflow_with_custom_install_folder(ws, spark, setup_workflows_with_custom_folder): + installation_ctx, run_config = setup_workflows_with_custom_folder() + + installation_ctx.deployed_workflows.run_workflow("profiler", run_config.name) + + config = InstallationChecksStorageConfig( + run_config_name=run_config.name, + assume_user=True, + product_name=installation_ctx.installation.product(), + install_folder=installation_ctx.installation.install_folder(), + ) + + dq_engine = DQEngine(ws, spark) + checks = dq_engine.load_checks(config=config) + assert checks, "Checks were not loaded correctly" + + install_folder = installation_ctx.installation.install_folder() + status = ws.workspace.get_status(f"{install_folder}/{run_config.profiler_config.summary_stats_file}") + assert status, f"Profile summary stats file {run_config.profiler_config.summary_stats_file} does not exist." 
diff --git a/tests/integration/test_quality_checker_workflow.py b/tests/integration/test_quality_checker_workflow.py index 49443203..d6bf3182 100644 --- a/tests/integration/test_quality_checker_workflow.py +++ b/tests/integration/test_quality_checker_workflow.py @@ -29,6 +29,17 @@ def test_quality_checker_workflow_serverless(ws, spark, setup_serverless_workflo assert_df_equality(checked_df, expected_quality_checking_output, ignore_nullable=True) +def test_quality_checker_workflow_with_custom_install_folder( + ws, spark, setup_workflows_with_custom_folder, expected_quality_checking_output +): + installation_ctx, run_config = setup_workflows_with_custom_folder(checks=True) + + installation_ctx.deployed_workflows.run_workflow("quality-checker", run_config.name) + + checked_df = spark.table(run_config.output_config.location) + assert_df_equality(checked_df, expected_quality_checking_output, ignore_nullable=True) + + def test_quality_checker_workflow_streaming(ws, spark, setup_serverless_workflows, expected_quality_checking_output): installation_ctx, run_config = setup_serverless_workflows(checks=True, is_streaming=True) diff --git a/tests/integration/test_row_checks_geo.py b/tests/integration/test_row_checks_geo.py new file mode 100644 index 00000000..372a4c31 --- /dev/null +++ b/tests/integration/test_row_checks_geo.py @@ -0,0 +1,256 @@ +import pytest +from chispa.dataframe_comparer import assert_df_equality # type: ignore +from databricks.labs.dqx.geo.check_funcs import ( + is_geometry, + is_geography, + is_geometrycollection, + is_linestring, + is_multilinestring, + is_multipoint, + is_multipolygon, + is_point, + is_polygon, + is_ogc_valid, +) + + +def test_is_geometry(spark): + input_schema = "geom_string: string, geom_binary: binary, geom_int: int" + test_df = spark.createDataFrame( + [ + ["POINT(1 1)", None, None], # valid WKT + ["not-a-geometry", None, None], # invalid (not valid WKT) + [None, bytes.fromhex("01E9030000000000000000F03F00000000000000400000000000005940"), None], # valid WKB + [None, None, 42], # invalid (wrong data type) + ], + input_schema, + ) + + actual = test_df.select(is_geometry("geom_string"), is_geometry("geom_binary"), is_geometry("geom_int")) + + checked_schema = "geom_string_is_not_a_geometry: string, geom_binary_is_not_a_geometry: string, geom_int_is_not_a_geometry: string" + expected = spark.createDataFrame( + [ + [None, None, None], + ["value `not-a-geometry` in column `geom_string` is not a geometry", None, None], + [None, None, None], + [None, None, "value `42` in column `geom_int` is not a geometry"], + ], + checked_schema, + ) + + assert_df_equality(actual, expected, ignore_nullable=True) + + +def test_is_geography(spark): + input_schema = "geography_string: string, geography_binary: binary, geography_int: int" + test_df = spark.createDataFrame( + [ + ["POINT(1 1)", None, None], # valid WKT + ["POINT(181 91)", None, None], # invalid (lat/lon out of range) + ["not-a-geography", None, None], # invalid (not valid WKT) + [None, bytes.fromhex("0101000000000000000000f03f0000000000000040"), None], # valid WKB + [None, None, 42], # invalid (wrong data type) + ], + input_schema, + ) + + actual = test_df.select( + is_geography("geography_string"), is_geography("geography_binary"), is_geography("geography_int") + ) + + checked_schema = "geography_string_is_not_a_geography: string, geography_binary_is_not_a_geography: string, geography_int_is_not_a_geography: string" + expected = spark.createDataFrame( + [ + [None, None, None], + ["value `POINT(181 91)` in column 
`geography_string` is not a geography", None, None], + ["value `not-a-geography` in column `geography_string` is not a geography", None, None], + [None, None, None], + [None, None, "value `42` in column `geography_int` is not a geography"], + ], + checked_schema, + ) + + assert_df_equality(actual, expected, ignore_nullable=True) + + +def test_is_point(spark): + input_schema = "geom: string" + test_df = spark.createDataFrame( + [["POINT(1 1)"], ["nonsense"], ["POLYGON((1 1, 2 2, 3 3, 1 1))"], [None]], + input_schema, + ) + + actual = test_df.select(is_point("geom")) + + checked_schema = "geom_is_not_a_point: string" + expected = spark.createDataFrame( + [ + [None], + ["value `nonsense` in column `geom` is not a point geometry"], + ["value `POLYGON((1 1, 2 2, 3 3, 1 1))` in column `geom` is not a point geometry"], + [None], + ], + checked_schema, + ) + + assert_df_equality(actual, expected, ignore_nullable=True) + + +def test_is_linestring(spark): + input_schema = "geom: string" + test_df = spark.createDataFrame( + [["LINESTRING(1 1, 2 2)"], ["nonsense"], ["POLYGON((1 1, 2 2, 3 3, 1 1))"], [None]], + input_schema, + ) + + actual = test_df.select(is_linestring("geom")) + + checked_schema = "geom_is_not_a_linestring: string" + expected = spark.createDataFrame( + [ + [None], + ["value `nonsense` in column `geom` is not a linestring geometry"], + ["value `POLYGON((1 1, 2 2, 3 3, 1 1))` in column `geom` is not a linestring geometry"], + [None], + ], + checked_schema, + ) + assert_df_equality(actual, expected, ignore_nullable=True) + + +def test_is_polygon(spark): + input_schema = "geom: string" + test_df = spark.createDataFrame( + [["POLYGON((1 1, 2 2, 3 3, 1 1))"], ["nonsense"], ["LINESTRING(1 1, 2 2)"], [None]], + input_schema, + ) + + actual = test_df.select(is_polygon("geom")) + + checked_schema = "geom_is_not_a_polygon: string" + expected = spark.createDataFrame( + [ + [None], + ["value `nonsense` in column `geom` is not a polygon geometry"], + ["value `LINESTRING(1 1, 2 2)` in column `geom` is not a polygon geometry"], + [None], + ], + checked_schema, + ) + assert_df_equality(actual, expected, ignore_nullable=True) + + +def test_is_multipoint(spark): + input_schema = "geom: string" + test_df = spark.createDataFrame( + [["MULTIPOINT(1 1, 2 2)"], ["nonsense"], ["LINESTRING(1 1, 2 2)"], [None]], + input_schema, + ) + + actual = test_df.select(is_multipoint("geom")) + + checked_schema = "geom_is_not_a_multipoint: string" + expected = spark.createDataFrame( + [ + [None], + ["value `nonsense` in column `geom` is not a multipoint geometry"], + ["value `LINESTRING(1 1, 2 2)` in column `geom` is not a multipoint geometry"], + [None], + ], + checked_schema, + ) + assert_df_equality(actual, expected, ignore_nullable=True) + + +def test_is_multilinestring(spark): + input_schema = "geom: string" + test_df = spark.createDataFrame( + [["MULTILINESTRING((1 1, 2 2), (3 3, 4 4))"], ["nonsense"], ["POLYGON((1 1, 2 2, 3 3, 1 1))"], [None]], + input_schema, + ) + + actual = test_df.select(is_multilinestring("geom")) + + checked_schema = "geom_is_not_a_multilinestring: string" + expected = spark.createDataFrame( + [ + [None], + ["value `nonsense` in column `geom` is not a multilinestring geometry"], + ["value `POLYGON((1 1, 2 2, 3 3, 1 1))` in column `geom` is not a multilinestring geometry"], + [None], + ], + checked_schema, + ) + assert_df_equality(actual, expected, ignore_nullable=True) + + +def test_is_multipolygon(spark): + input_schema = "geom: string" + test_df = spark.createDataFrame( + 
[["MULTIPOLYGON(((1 1, 2 2, 3 3, 1 1)))"], ["nonsense"], ["LINESTRING(1 1, 2 2)"], [None]], + input_schema, + ) + + actual = test_df.select(is_multipolygon("geom")) + + checked_schema = "geom_is_not_a_multipolygon: string" + expected = spark.createDataFrame( + [ + [None], + ["value `nonsense` in column `geom` is not a multipolygon geometry"], + ["value `LINESTRING(1 1, 2 2)` in column `geom` is not a multipolygon geometry"], + [None], + ], + checked_schema, + ) + assert_df_equality(actual, expected, ignore_nullable=True) + + +def test_is_geometrycollection(spark): + input_schema = "geom: string" + test_df = spark.createDataFrame( + [ + ["GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(2 2, 3 3))"], + ["nonsense"], + ["POLYGON((1 1, 2 2, 3 3, 1 1))"], + [None], + ], + input_schema, + ) + + actual = test_df.select(is_geometrycollection("geom")) + + checked_schema = "geom_is_not_a_geometrycollection: string" + expected = spark.createDataFrame( + [ + [None], + ["value `nonsense` in column `geom` is not a geometrycollection geometry"], + ["value `POLYGON((1 1, 2 2, 3 3, 1 1))` in column `geom` is not a geometrycollection geometry"], + [None], + ], + checked_schema, + ) + assert_df_equality(actual, expected, ignore_nullable=True) + + +def test_is_ogc_valid(spark): + input_schema = "geom: string" + test_df = spark.createDataFrame( + [["POLYGON((0 0,10 0,0 10,0 0))"], ["nonsense"], ["POLYGON((0 0,10 10,10 0,0 10,0 0))"], [None]], + input_schema, + ) + + actual = test_df.select(is_ogc_valid("geom")) + + checked_schema = "geom_is_not_a_valid_geometry: string" + expected = spark.createDataFrame( + [ + [None], + ["value `nonsense` in column `geom` is not a valid geometry (in the OGC sense)"], + ["value `POLYGON((0 0,10 10,10 0,0 10,0 0))` in column `geom` is not a valid geometry (in the OGC sense)"], + [None], + ], + checked_schema, + ) + assert_df_equality(actual, expected, ignore_nullable=True) diff --git a/tests/integration/test_save_checks_to_workspace_file.py b/tests/integration/test_save_checks_to_workspace_file.py index 9883fd76..93ced82b 100644 --- a/tests/integration/test_save_checks_to_workspace_file.py +++ b/tests/integration/test_save_checks_to_workspace_file.py @@ -111,7 +111,28 @@ def test_save_checks_when_global_installation_missing(ws, spark): DQEngine(ws, spark).save_checks(TEST_CHECKS, config=config) -def test_load_checks_when_user_installation_missing(ws, spark): +def test_save_checks_in_custom_folder_installation_in_yaml_file(ws, spark, installation_ctx_custom_install_folder): + installation_ctx_custom_install_folder.installation.save(installation_ctx_custom_install_folder.config) + product_name = installation_ctx_custom_install_folder.product_info.product_name() + + dq_engine = DQEngine(ws, spark) + config = InstallationChecksStorageConfig( + run_config_name="default", + assume_user=True, + product_name=product_name, + install_folder=installation_ctx_custom_install_folder.installation.install_folder(), + ) + dq_engine.save_checks(TEST_CHECKS, config=config) + + install_dir = installation_ctx_custom_install_folder.installation.install_folder() + checks_path = f"{install_dir}/{installation_ctx_custom_install_folder.config.get_run_config().checks_location}" + _verify_workspace_file_is_valid(ws, checks_path, file_format="yaml") + + checks = dq_engine.load_checks(config=config) + assert TEST_CHECKS == checks, "Checks were not saved correctly" + + +def test_save_checks_when_user_installation_missing(ws, spark): with pytest.raises(NotFound): config = 
InstallationChecksStorageConfig(run_config_name="default", assume_user=True) DQEngine(ws, spark).save_checks(TEST_CHECKS, config=config) diff --git a/tests/integration/test_save_results_in_table.py b/tests/integration/test_save_results_in_table.py index f8d9c660..a0454ad1 100644 --- a/tests/integration/test_save_results_in_table.py +++ b/tests/integration/test_save_results_in_table.py @@ -287,6 +287,43 @@ def test_save_results_in_table_in_user_installation_missing_output_and_quarantin ), "Quarantine table should not have been saved" +def test_save_results_in_table_in_custom_folder_installation( + ws, spark, installation_ctx_custom_install_folder, make_schema, make_random +): + catalog_name = "main" + schema = make_schema(catalog_name=catalog_name) + output_table = f"{catalog_name}.{schema.name}.{make_random(6).lower()}" + quarantine_table = f"{catalog_name}.{schema.name}.{make_random(6).lower()}" + + config = installation_ctx_custom_install_folder.config + run_config = config.get_run_config() + run_config.output_config = OutputConfig(location=output_table) + run_config.quarantine_config = OutputConfig(location=quarantine_table) + installation_ctx_custom_install_folder.installation.save(installation_ctx_custom_install_folder.config) + product_name = installation_ctx_custom_install_folder.product_info.product_name() + install_folder = installation_ctx_custom_install_folder.install_folder + + schema = "a: int, b: int" + output_df = spark.createDataFrame([[1, 2]], schema) + quarantine_df = spark.createDataFrame([[3, 4]], schema) + + engine = DQEngine(ws, spark) + engine.save_results_in_table( + output_df=output_df, + quarantine_df=quarantine_df, + run_config_name=run_config.name, + product_name=product_name, + assume_user=True, + install_folder=install_folder, + ) + + output_df_loaded = spark.table(output_table) + quarantine_df_loaded = spark.table(quarantine_table) + + assert_df_equality(output_df, output_df_loaded) + assert_df_equality(quarantine_df, quarantine_df_loaded) + + def test_save_streaming_results_in_table(ws, spark, make_schema, make_random, make_volume): catalog_name = "main" schema = make_schema(catalog_name=catalog_name) From b9b7b3d41552f945df8f12eb8ce82995545cf177 Mon Sep 17 00:00:00 2001 From: Tim Dikland Date: Sat, 27 Sep 2025 10:48:09 +0200 Subject: [PATCH 02/47] expanded geospatial data checks --- src/databricks/labs/dqx/geo/check_funcs.py | 153 +++++++++++++++++++-- tests/integration/test_row_checks_geo.py | 142 +++++++++++++++++++ 2 files changed, 280 insertions(+), 15 deletions(-) diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index 720794c3..160f0d1a 100644 --- a/src/databricks/labs/dqx/geo/check_funcs.py +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -11,6 +11,50 @@ MULTIPOLYGON_TYPE = "ST_MultiPolygon" GEOMETRYCOLLECTION_TYPE = "ST_GeometryCollection" +@register_rule("row") +def is_latitude(column: str | Column) -> Column: + """Checks whether the values in the input column are valid latitudes. 
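+    Values are cast to double before the range check; NULL inputs are not flagged.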
+ + Args: + column: column to check; can be a string column name or a column expression + + Returns: + Column object indicating whether the values in the input column are valid latitudes + """ + col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) + condition = ~F.when(col_expr.isNull(), F.lit(None)).otherwise( + F.col(col_str_norm).try_cast("double").between(-90.0, 90.0) + ) + condition_str = f"` in column `{col_expr_str}` is not a valid latitude (must be between -90 and 90)" + + return make_condition( + condition, + F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), + f"{col_str_norm}_is_not_a_valid_latitude", + ) + + +@register_rule("row") +def is_longitude(column: str | Column) -> Column: + """Checks whether the values in the input column are valid longitudes. + + Args: + column: column to check; can be a string column name or a column expression + + Returns: + Column object indicating whether the values in the input column are valid longitudes + """ + col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) + condition = ~F.when(col_expr.isNull(), F.lit(None)).otherwise( + F.col(col_str_norm).try_cast("double").between(-180.0, 180.0) + ) + condition_str = f"` in column `{col_expr_str}` is not a valid longitude (must be between -180 and 180)" + + return make_condition( + condition, + F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), + f"{col_str_norm}_is_not_a_valid_longitude", + ) @register_rule("row") def is_geometry(column: str | Column) -> Column: @@ -302,44 +346,123 @@ def is_ogc_valid(column: str | Column) -> Column: f"{col_str_norm}_is_not_a_valid_geometry", ) - @register_rule("row") -def is_latitude(column: str | Column) -> Column: - """Checks whether the values in the input column are valid latitudes. +def is_non_empty_geometry(column: str | Column) -> Column: + """Checks whether the values in the input column are empty geometries. Args: column: column to check; can be a string column name or a column expression Returns: - Column object indicating whether the values in the input column are valid latitudes + Column object indicating whether the values in the input column are empty geometries + + Note: + This function requires Databricks runtime 17.1 or above. """ col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) - condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(F.col(col_str_norm).between(-90.0, 90.0)) - condition_str = f"' in Column '{col_expr_str}' is not a valid latitude must be between -90 and 90" + # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in + # Databricks SQL, due to the use of the `try_to_geometry` and `st_isempty` functions. + # TODO: Above mentioned functions are not (yet) available. Replace with equivalent functions + # when available in OSS PySpark. 
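+    # Flags values that cannot be parsed as a geometry, and parsed geometries that are empty
+    # according to st_isempty; NULL inputs are not flagged.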
+ geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") + geom_type_cond = F.expr(f"st_isempty(try_to_geometry({col_str_norm}))") + condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond) + condition_str = f"` in column `{col_expr_str}` is an empty geometry" return make_condition( condition, - F.concat_ws("", F.lit("Value '"), col_expr.cast("string"), F.lit(condition_str)), - f"{col_str_norm}_is_not_valid_latitude", + F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), + f"{col_str_norm}_is_an_empty_geometry", ) +@register_rule("row") +def has_dimension(column: str | Column, dimension: int) -> Column: + """Checks whether the geometries/geographies in the input column have a given dimension. + + Args: + column: column to check; can be a string column name or a column expression + dimension: required dimension of the geometries/geographies + + Returns: + Column object indicating whether the geometries/geographies in the input column have a given dimension + + Note: + This function requires Databricks runtime 17.1 or above. + """ + col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) + # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in + # Databricks SQL, due to the use of the `try_to_geometry` and `st_dimension` functions. + # TODO: Above mentioned functions are not (yet) available. Replace with equivalent functions + # when available in OSS PySpark. + geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") + geom_type_cond = F.expr(f"st_dimension(try_to_geometry({col_str_norm})) <> {dimension}") + condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond) + condition_str = f"` in column `{col_expr_str}` does not have the required dimension ({dimension})" + + return make_condition( + condition, + F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), + f"{col_str_norm}_does_not_have_the_required_dimension", + ) @register_rule("row") -def is_longitude(column: str | Column) -> Column: - """Checks whether the values in the input column are valid longitudes. +def has_x_coordinate_between(column: str | Column, min_value: float, max_value: float) -> Column: + """Checks whether the x coordinates of the geometries in the input column are between a given range. Args: column: column to check; can be a string column name or a column expression + min_value: minimum value of the x coordinates + max_value: maximum value of the x coordinates Returns: - Column object indicating whether the values in the input column are valid longitudes + Column object indicating whether the x coordinates of the geometries in the input column are between a given range + + Note: + This function requires Databricks runtime 17.1 or above. """ col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) - condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(F.col(col_str_norm).between(-180.0, 180.0)) - condition_str = f"' in Column '{col_expr_str}' is not a valid longitude (must be between -180 and 180)" + # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in + # Databricks SQL, due to the use of the `try_to_geometry`, `st_xmax` and `st_xmin` functions. + # TODO: Above mentioned functions are not (yet) available. Replace with equivalent functions + # when available in OSS PySpark. 
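+    # Flags values that cannot be parsed as a geometry, and parsed geometries whose x extent
+    # (st_xmin/st_xmax) reaches outside [min_value, max_value]; NULL inputs are not flagged.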
+ geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") + geom_type_cond = F.expr(f"st_xmax(try_to_geometry({col_str_norm})) > {max_value} OR st_xmin(try_to_geometry({col_str_norm})) < {min_value}") + condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond) + condition_str = f"` in column `{col_expr_str}` has x coordinates outside the range [{min_value}, {max_value}]" return make_condition( condition, - F.concat_ws("", F.lit("Value '"), col_expr.cast("string"), F.lit(condition_str)), - f"{col_str_norm}_is_not_valid_longitude", + F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), + f"{col_str_norm}_has_x_coordinates_outside_range", ) + +@register_rule("row") +def has_y_coordinate_between(column: str | Column, min_value: float, max_value: float) -> Column: + """Checks whether the y coordinates of the geometries in the input column are between a given range. + + Args: + column: column to check; can be a string column name or a column expression + min_value: minimum value of the y coordinates + max_value: maximum value of the y coordinates + + Returns: + Column object indicating whether the y coordinates of the geometries in the input column are between a given range + + Note: + This function requires Databricks runtime 17.1 or above. + """ + col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) + # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in + # Databricks SQL, due to the use of the `try_to_geometry`, `st_ymax` and `st_ymin` functions. + # TODO: Above mentioned functions are not (yet) available. Replace with equivalent functions + # when available in OSS PySpark. + geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") + geom_type_cond = F.expr(f"st_ymax(try_to_geometry({col_str_norm})) > {max_value} OR st_ymin(try_to_geometry({col_str_norm})) < {min_value}") + condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond) + condition_str = f"` in column `{col_expr_str}` has y coordinates outside the range [{min_value}, {max_value}]" + + return make_condition( + condition, + F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), + f"{col_str_norm}_has_y_coordinates_outside_range", + ) \ No newline at end of file diff --git a/tests/integration/test_row_checks_geo.py b/tests/integration/test_row_checks_geo.py index 372a4c31..b54024a2 100644 --- a/tests/integration/test_row_checks_geo.py +++ b/tests/integration/test_row_checks_geo.py @@ -1,10 +1,16 @@ import pytest from chispa.dataframe_comparer import assert_df_equality # type: ignore from databricks.labs.dqx.geo.check_funcs import ( + has_dimension, + has_x_coordinate_between, + has_y_coordinate_between, + is_non_empty_geometry, is_geometry, is_geography, is_geometrycollection, + is_latitude, is_linestring, + is_longitude, is_multilinestring, is_multipoint, is_multipolygon, @@ -254,3 +260,139 @@ def test_is_ogc_valid(spark): checked_schema, ) assert_df_equality(actual, expected, ignore_nullable=True) + + +def test_is_longitude(spark): + input_schema = "long_string: string, long_int: int, long_double: double" + test_df = spark.createDataFrame( + [["1", 120, 180.0], ["-181", None, 180.01]], + input_schema, + ) + + actual = test_df.select(is_longitude("long_string"), is_longitude("long_int"), is_longitude("long_double")) + + checked_schema = "long_string_is_not_a_valid_longitude: string, long_int_is_not_a_valid_longitude: string, 
long_double_is_not_a_valid_longitude: string" + expected = spark.createDataFrame( + [ + [None, None, None], + [ + "value `-181` in column `long_string` is not a valid longitude (must be between -180 and 180)", + None, + "value `180.01` in column `long_double` is not a valid longitude (must be between -180 and 180)", + ], + ], + checked_schema, + ) + assert_df_equality(actual, expected, ignore_nullable=True) + + +def test_is_latitude(spark): + input_schema = "lat_string: string, lat_int: int, lat_double: double" + test_df = spark.createDataFrame( + [["1", 60, 90.0], ["-91", None, 90.01]], + input_schema, + ) + + actual = test_df.select(is_latitude("lat_string"), is_latitude("lat_int"), is_latitude("lat_double")) + + checked_schema = "lat_string_is_not_a_valid_latitude: string, lat_int_is_not_a_valid_latitude: string, lat_double_is_not_a_valid_latitude: string" + expected = spark.createDataFrame( + [ + [None, None, None], + [ + "value `-91` in column `lat_string` is not a valid latitude (must be between -90 and 90)", + None, + "value `90.01` in column `lat_double` is not a valid latitude (must be between -90 and 90)", + ], + ], + checked_schema, + ) + assert_df_equality(actual, expected, ignore_nullable=True) + + +def test_is_non_empty_geometry(spark): + input_schema = "geom: string" + test_df = spark.createDataFrame( + [["POINT(1 1)"], ["nonsense"], ["POLYGON EMPTY"], [None]], + input_schema, + ) + + actual = test_df.select(is_non_empty_geometry("geom")) + + checked_schema = "geom_is_an_empty_geometry: string" + expected = spark.createDataFrame( + [ + [None], + ["value `nonsense` in column `geom` is an empty geometry"], + ["value `POLYGON EMPTY` in column `geom` is an empty geometry"], + [None], + ], + checked_schema, + ) + assert_df_equality(actual, expected, ignore_nullable=True) + + +def test_has_dimension(spark): + input_schema = "geom: string" + test_df = spark.createDataFrame( + [["POINT(1 1)"], ["nonsense"], ["POLYGON((0 0, 2 0, 0 2, 0 0))"], [None]], + input_schema, + ) + + actual = test_df.select(has_dimension("geom", 0)) + + checked_schema = "geom_does_not_have_the_required_dimension: string" + expected = spark.createDataFrame( + [ + [None], + ["value `nonsense` in column `geom` does not have the required dimension (0)"], + ["value `POLYGON((0 0, 2 0, 0 2, 0 0))` in column `geom` does not have the required dimension (0)"], + [None], + ], + checked_schema, + ) + assert_df_equality(actual, expected, ignore_nullable=True) + +def test_has_x_coordinate_between(spark): + input_schema = "geom: string" + test_df = spark.createDataFrame( + [["POINT(1 1)"], ["nonsense"], ["POLYGON((0 0, 2 0, 0 2, 0 0))"], [None]], + input_schema, + ) + + actual = test_df.select(has_x_coordinate_between("geom", 0, 1)) + + checked_schema = "geom_has_x_coordinates_outside_range: string" + expected = spark.createDataFrame( + [ + [None], + ["value `nonsense` in column `geom` has x coordinates outside the range [0, 1]"], + ["value `POLYGON((0 0, 2 0, 0 2, 0 0))` in column `geom` has x coordinates outside the range [0, 1]"], + [None], + ], + checked_schema, + ) + + assert_df_equality(actual, expected, ignore_nullable=True) + +def test_has_y_coordinate_between(spark): + input_schema = "geom: string" + test_df = spark.createDataFrame( + [["POINT(1 1)"], ["nonsense"], ["POLYGON((0 0, 2 0, 0 2, 0 0))"], [None]], + input_schema, + ) + + actual = test_df.select(has_y_coordinate_between("geom", 0, 1)) + + checked_schema = "geom_has_y_coordinates_outside_range: string" + expected = spark.createDataFrame( + [ + [None], + 
["value `nonsense` in column `geom` has y coordinates outside the range [0, 1]"], + ["value `POLYGON((0 0, 2 0, 0 2, 0 0))` in column `geom` has y coordinates outside the range [0, 1]"], + [None], + ], + checked_schema, + ) + + assert_df_equality(actual, expected, ignore_nullable=True) \ No newline at end of file From 1554d06fa2a0cb4bd6082a9fedc68629080479f9 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 12:58:11 +0200 Subject: [PATCH 03/47] Update src/databricks/labs/dqx/geo/check_funcs.py --- src/databricks/labs/dqx/geo/check_funcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index 160f0d1a..7aecaa91 100644 --- a/src/databricks/labs/dqx/geo/check_funcs.py +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -30,7 +30,7 @@ def is_latitude(column: str | Column) -> Column: return make_condition( condition, F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), - f"{col_str_norm}_is_not_a_valid_latitude", + f"{col_str_norm}_is_not_valid_latitude", ) From f13d7985aa9bc476c1a275d6907467da961eeaf6 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 12:58:20 +0200 Subject: [PATCH 04/47] Update src/databricks/labs/dqx/geo/check_funcs.py --- src/databricks/labs/dqx/geo/check_funcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index 7aecaa91..af1e95a2 100644 --- a/src/databricks/labs/dqx/geo/check_funcs.py +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -53,7 +53,7 @@ def is_longitude(column: str | Column) -> Column: return make_condition( condition, F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), - f"{col_str_norm}_is_not_a_valid_longitude", + f"{col_str_norm}_is_not_valid_longitude", ) @register_rule("row") From 38d2bfc6fb8e4f1714e72ef0bc683a7a8395f93b Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 12:58:28 +0200 Subject: [PATCH 05/47] Update src/databricks/labs/dqx/geo/check_funcs.py --- src/databricks/labs/dqx/geo/check_funcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index af1e95a2..351ebc14 100644 --- a/src/databricks/labs/dqx/geo/check_funcs.py +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -81,7 +81,7 @@ def is_geometry(column: str | Column) -> Column: return make_condition( condition, F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), - f"{col_str_norm}_is_not_a_geometry", + f"{col_str_norm}_is_not_geometry", ) From d3b50f2b7079092807f4b83019b3cd0ddccf0028 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 12:58:36 +0200 Subject: [PATCH 06/47] Update src/databricks/labs/dqx/geo/check_funcs.py --- src/databricks/labs/dqx/geo/check_funcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index 351ebc14..480c83cc 100644 --- a/src/databricks/labs/dqx/geo/check_funcs.py +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -139,7 +139,7 @@ def is_point(column: str | Column) -> Column: return make_condition( condition, F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), - f"{col_str_norm}_is_not_a_point", + f"{col_str_norm}_is_not_point", ) From 
d678566c9e425c357d3a77c9df0ed0ed51f28f05 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 12:58:43 +0200 Subject: [PATCH 07/47] Update src/databricks/labs/dqx/geo/check_funcs.py --- src/databricks/labs/dqx/geo/check_funcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index 480c83cc..b15cb706 100644 --- a/src/databricks/labs/dqx/geo/check_funcs.py +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -110,7 +110,7 @@ def is_geography(column: str | Column) -> Column: return make_condition( condition, F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), - f"{col_str_norm}_is_not_a_geography", + f"{col_str_norm}_is_not_geography", ) From d2ece65348f49f52168e34096392a826ed7f75c1 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 12:58:49 +0200 Subject: [PATCH 08/47] Update tests/integration/test_row_checks_geo.py --- tests/integration/test_row_checks_geo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_row_checks_geo.py b/tests/integration/test_row_checks_geo.py index b54024a2..84dff706 100644 --- a/tests/integration/test_row_checks_geo.py +++ b/tests/integration/test_row_checks_geo.py @@ -249,7 +249,7 @@ def test_is_ogc_valid(spark): actual = test_df.select(is_ogc_valid("geom")) - checked_schema = "geom_is_not_a_valid_geometry: string" + checked_schema = "geom_is_not_valid_geometry: string" expected = spark.createDataFrame( [ [None], From 875630350a8094b3010cd266c0ce58a3ae305b4e Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 12:59:05 +0200 Subject: [PATCH 09/47] Update tests/integration/test_row_checks_geo.py --- tests/integration/test_row_checks_geo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_row_checks_geo.py b/tests/integration/test_row_checks_geo.py index 84dff706..baca2842 100644 --- a/tests/integration/test_row_checks_geo.py +++ b/tests/integration/test_row_checks_geo.py @@ -271,7 +271,7 @@ def test_is_longitude(spark): actual = test_df.select(is_longitude("long_string"), is_longitude("long_int"), is_longitude("long_double")) - checked_schema = "long_string_is_not_a_valid_longitude: string, long_int_is_not_a_valid_longitude: string, long_double_is_not_a_valid_longitude: string" + checked_schema = "long_string_is_not_valid_longitude: string, long_int_is_not_valid_longitude: string, long_double_is_not_valid_longitude: string" expected = spark.createDataFrame( [ [None, None, None], From fd49d006ea8515175d4344612b6e1e62c3931f0a Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 12:59:13 +0200 Subject: [PATCH 10/47] Update tests/integration/test_row_checks_geo.py --- tests/integration/test_row_checks_geo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_row_checks_geo.py b/tests/integration/test_row_checks_geo.py index baca2842..7f43d288 100644 --- a/tests/integration/test_row_checks_geo.py +++ b/tests/integration/test_row_checks_geo.py @@ -295,7 +295,7 @@ def test_is_latitude(spark): actual = test_df.select(is_latitude("lat_string"), is_latitude("lat_int"), is_latitude("lat_double")) - checked_schema = "lat_string_is_not_a_valid_latitude: string, lat_int_is_not_a_valid_latitude: string, lat_double_is_not_a_valid_latitude: string" + checked_schema = "lat_string_is_not_valid_latitude: string, lat_int_is_not_valid_latitude: string, 
lat_double_is_not_valid_latitude: string" expected = spark.createDataFrame( [ [None, None, None], From 66edad24fbcf8fec2c6eefcafd23c9db81096693 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 12:59:21 +0200 Subject: [PATCH 11/47] Update tests/integration/test_row_checks_geo.py --- tests/integration/test_row_checks_geo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_row_checks_geo.py b/tests/integration/test_row_checks_geo.py index 7f43d288..706535b6 100644 --- a/tests/integration/test_row_checks_geo.py +++ b/tests/integration/test_row_checks_geo.py @@ -341,7 +341,7 @@ def test_has_dimension(spark): actual = test_df.select(has_dimension("geom", 0)) - checked_schema = "geom_does_not_have_the_required_dimension: string" + checked_schema = "geom_does_not_have_required_dimension: string" expected = spark.createDataFrame( [ [None], From bbe8bde9fad78f59bd7dcd060291413b8fbc55af Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 12:59:32 +0200 Subject: [PATCH 12/47] Update tests/integration/test_row_checks_geo.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/integration/test_row_checks_geo.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_row_checks_geo.py b/tests/integration/test_row_checks_geo.py index 706535b6..4a179279 100644 --- a/tests/integration/test_row_checks_geo.py +++ b/tests/integration/test_row_checks_geo.py @@ -374,7 +374,6 @@ def test_has_x_coordinate_between(spark): ) assert_df_equality(actual, expected, ignore_nullable=True) - def test_has_y_coordinate_between(spark): input_schema = "geom: string" test_df = spark.createDataFrame( From bfdc650273a2050342c1067e15ec6cc3dda167c4 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 12:59:39 +0200 Subject: [PATCH 13/47] Update src/databricks/labs/dqx/geo/check_funcs.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/databricks/labs/dqx/geo/check_funcs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index b15cb706..5c0f4115 100644 --- a/src/databricks/labs/dqx/geo/check_funcs.py +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -345,7 +345,6 @@ def is_ogc_valid(column: str | Column) -> Column: F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), f"{col_str_norm}_is_not_a_valid_geometry", ) - @register_rule("row") def is_non_empty_geometry(column: str | Column) -> Column: """Checks whether the values in the input column are empty geometries. 
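The geometry checks in this series all build their failure condition the same way: a NULL input is passed through as NULL (missing values are not flagged), and the actual condition combines try_to_geometry with an ST_* predicate. A minimal standalone sketch of that pattern, assuming a cluster where try_to_geometry and st_isempty exist (Databricks Runtime 17.1+ or Databricks SQL); the sample data and column name are illustrative only:

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([["POINT(1 1)"], ["nonsense"], [None]], "geom: string")

    # NULL input stays NULL; otherwise flag values that do not parse as a
    # geometry or that parse to an empty geometry.
    not_a_geometry = F.expr("try_to_geometry(geom) IS NULL")
    empty_geometry = F.expr("st_isempty(try_to_geometry(geom))")
    fails_non_empty_check = F.when(F.col("geom").isNull(), F.lit(None)).otherwise(not_a_geometry | empty_geometry)

    df.withColumn("fails_non_empty_check", fails_non_empty_check).show(truncate=False)

In the library itself this boolean condition is then wrapped by make_condition, which turns it into a message column named after the check.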
From 2daf11d0d976500ed7b1bf46ba0513e4d6a9cd6d Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 12:59:46 +0200 Subject: [PATCH 14/47] Update src/databricks/labs/dqx/geo/check_funcs.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/databricks/labs/dqx/geo/check_funcs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index 5c0f4115..5a7bb199 100644 --- a/src/databricks/labs/dqx/geo/check_funcs.py +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -403,7 +403,6 @@ def has_dimension(column: str | Column, dimension: int) -> Column: F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), f"{col_str_norm}_does_not_have_the_required_dimension", ) - @register_rule("row") def has_x_coordinate_between(column: str | Column, min_value: float, max_value: float) -> Column: """Checks whether the x coordinates of the geometries in the input column are between a given range. From ffc82856f9b11977cf0967d7b693f36880f0d5fa Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 12:59:55 +0200 Subject: [PATCH 15/47] Update src/databricks/labs/dqx/geo/check_funcs.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/databricks/labs/dqx/geo/check_funcs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index 5a7bb199..cddf03d8 100644 --- a/src/databricks/labs/dqx/geo/check_funcs.py +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -433,7 +433,6 @@ def has_x_coordinate_between(column: str | Column, min_value: float, max_value: F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), f"{col_str_norm}_has_x_coordinates_outside_range", ) - @register_rule("row") def has_y_coordinate_between(column: str | Column, min_value: float, max_value: float) -> Column: """Checks whether the y coordinates of the geometries in the input column are between a given range. From af715fea2317e28ba595e271814c7135219884ec Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 13:00:03 +0200 Subject: [PATCH 16/47] Update src/databricks/labs/dqx/geo/check_funcs.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/databricks/labs/dqx/geo/check_funcs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index cddf03d8..9aedd544 100644 --- a/src/databricks/labs/dqx/geo/check_funcs.py +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -373,7 +373,6 @@ def is_non_empty_geometry(column: str | Column) -> Column: F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), f"{col_str_norm}_is_an_empty_geometry", ) - @register_rule("row") def has_dimension(column: str | Column, dimension: int) -> Column: """Checks whether the geometries/geographies in the input column have a given dimension. 
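Because each check returns an ordinary Column that is NULL when the row passes and carries a failure message otherwise, the new functions can be exercised directly with select(), the way the integration tests in this series do. A small sketch, assuming Databricks Runtime 17.1+ and the package from this change installed; the data is illustrative:

    from pyspark.sql import SparkSession
    from databricks.labs.dqx.geo.check_funcs import has_dimension, has_x_coordinate_between, is_point

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([["POINT(1 1)"], ["POINT(200 1)"], ["nonsense"], [None]], "geom: string")

    # One message column per check: NULL means the row passed that check.
    df.select(
        is_point("geom"),
        has_dimension("geom", 0),
        has_x_coordinate_between("geom", -180.0, 180.0),
    ).show(truncate=False)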
From bb3409687445b612a992e4efc9b45a8f49278eba Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 13:00:10 +0200 Subject: [PATCH 17/47] Update src/databricks/labs/dqx/geo/check_funcs.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/databricks/labs/dqx/geo/check_funcs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index 9aedd544..e5d63acd 100644 --- a/src/databricks/labs/dqx/geo/check_funcs.py +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -55,7 +55,6 @@ def is_longitude(column: str | Column) -> Column: F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), f"{col_str_norm}_is_not_valid_longitude", ) - @register_rule("row") def is_geometry(column: str | Column) -> Column: """Checks whether the values in the input column are valid geometries. From a5df77f870b398753ce8a6a27a3b6384b02bb65d Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 13:00:16 +0200 Subject: [PATCH 18/47] Update tests/integration/test_row_checks_geo.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/integration/test_row_checks_geo.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_row_checks_geo.py b/tests/integration/test_row_checks_geo.py index 4a179279..4b12ded7 100644 --- a/tests/integration/test_row_checks_geo.py +++ b/tests/integration/test_row_checks_geo.py @@ -352,7 +352,6 @@ def test_has_dimension(spark): checked_schema, ) assert_df_equality(actual, expected, ignore_nullable=True) - def test_has_x_coordinate_between(spark): input_schema = "geom: string" test_df = spark.createDataFrame( From 1ba578c1e6ed66e60db359b518d1593e419738e7 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 13:00:32 +0200 Subject: [PATCH 19/47] Update tests/integration/test_row_checks_geo.py --- tests/integration/test_row_checks_geo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_row_checks_geo.py b/tests/integration/test_row_checks_geo.py index 4b12ded7..f5f726fd 100644 --- a/tests/integration/test_row_checks_geo.py +++ b/tests/integration/test_row_checks_geo.py @@ -319,7 +319,7 @@ def test_is_non_empty_geometry(spark): actual = test_df.select(is_non_empty_geometry("geom")) - checked_schema = "geom_is_an_empty_geometry: string" + checked_schema = "geom_is_empty_geometry: string" expected = spark.createDataFrame( [ [None], From 0b4f83a6903829704f95eb5d70c154df6c95b473 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 13:00:54 +0200 Subject: [PATCH 20/47] Update src/databricks/labs/dqx/geo/check_funcs.py --- src/databricks/labs/dqx/geo/check_funcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index e5d63acd..81f393b9 100644 --- a/src/databricks/labs/dqx/geo/check_funcs.py +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -167,7 +167,7 @@ def is_linestring(column: str | Column) -> Column: return make_condition( condition, F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), - f"{col_str_norm}_is_not_a_linestring", + f"{col_str_norm}_is_not_linestring", ) From e5aab5ff7a02e47e740938f5f63b0fbdad998eba Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 13:01:10 +0200 Subject: [PATCH 21/47] Update src/databricks/labs/dqx/geo/check_funcs.py --- 
src/databricks/labs/dqx/geo/check_funcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index 81f393b9..e1450aa9 100644 --- a/src/databricks/labs/dqx/geo/check_funcs.py +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -196,7 +196,7 @@ def is_polygon(column: str | Column) -> Column: return make_condition( condition, F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), - f"{col_str_norm}_is_not_a_polygon", + f"{col_str_norm}_is_not_polygon", ) From 32d43aaf90a61ffe5c02f3be59b81218d1820e08 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 13:01:25 +0200 Subject: [PATCH 22/47] Update src/databricks/labs/dqx/geo/check_funcs.py --- src/databricks/labs/dqx/geo/check_funcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index e1450aa9..5ca74448 100644 --- a/src/databricks/labs/dqx/geo/check_funcs.py +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -225,7 +225,7 @@ def is_multipoint(column: str | Column) -> Column: return make_condition( condition, F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), - f"{col_str_norm}_is_not_a_multipoint", + f"{col_str_norm}_is_not_multipoint", ) From 69c041e31a4400c1ff7c8d315857c3d80f8000ed Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 13:01:38 +0200 Subject: [PATCH 23/47] Update src/databricks/labs/dqx/geo/check_funcs.py --- src/databricks/labs/dqx/geo/check_funcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index 5ca74448..d7339f5a 100644 --- a/src/databricks/labs/dqx/geo/check_funcs.py +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -254,7 +254,7 @@ def is_multilinestring(column: str | Column) -> Column: return make_condition( condition, F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), - f"{col_str_norm}_is_not_a_multilinestring", + f"{col_str_norm}_is_not_multilinestring", ) From 8095b296699639ba2d57898f08c213838e4ff6d4 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 13:01:52 +0200 Subject: [PATCH 24/47] Update src/databricks/labs/dqx/geo/check_funcs.py --- src/databricks/labs/dqx/geo/check_funcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index d7339f5a..81386594 100644 --- a/src/databricks/labs/dqx/geo/check_funcs.py +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -283,7 +283,7 @@ def is_multipolygon(column: str | Column) -> Column: return make_condition( condition, F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), - f"{col_str_norm}_is_not_a_multipolygon", + f"{col_str_norm}_is_not_multipolygon", ) From 0e1eec704c06953a72952c395a0932c7da0efa59 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 13:02:13 +0200 Subject: [PATCH 25/47] Update src/databricks/labs/dqx/geo/check_funcs.py --- src/databricks/labs/dqx/geo/check_funcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index 81386594..6519d4a5 100644 --- a/src/databricks/labs/dqx/geo/check_funcs.py +++ 
b/src/databricks/labs/dqx/geo/check_funcs.py @@ -342,7 +342,7 @@ def is_ogc_valid(column: str | Column) -> Column: return make_condition( condition, F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), - f"{col_str_norm}_is_not_a_valid_geometry", + f"{col_str_norm}_is_not_valid_geometry", ) @register_rule("row") def is_non_empty_geometry(column: str | Column) -> Column: From c9760a3b9a78d515d7225685de0a00550179cbf0 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 13:02:31 +0200 Subject: [PATCH 26/47] Update src/databricks/labs/dqx/geo/check_funcs.py --- src/databricks/labs/dqx/geo/check_funcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index 6519d4a5..254f5edc 100644 --- a/src/databricks/labs/dqx/geo/check_funcs.py +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -312,7 +312,7 @@ def is_geometrycollection(column: str | Column) -> Column: return make_condition( condition, F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), - f"{col_str_norm}_is_not_a_geometrycollection", + f"{col_str_norm}_is_not_geometrycollection", ) From 775cb84b796bf73af66503ba1bdd85a2d8e6af64 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 13:02:50 +0200 Subject: [PATCH 27/47] Update src/databricks/labs/dqx/geo/check_funcs.py --- src/databricks/labs/dqx/geo/check_funcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index 254f5edc..1c7f0452 100644 --- a/src/databricks/labs/dqx/geo/check_funcs.py +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -370,7 +370,7 @@ def is_non_empty_geometry(column: str | Column) -> Column: return make_condition( condition, F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), - f"{col_str_norm}_is_an_empty_geometry", + f"{col_str_norm}_is_empty_geometry", ) @register_rule("row") def has_dimension(column: str | Column, dimension: int) -> Column: From 91adf6a2863ff48f2de23fb536ef95f0ef19b300 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 13:03:05 +0200 Subject: [PATCH 28/47] Update src/databricks/labs/dqx/geo/check_funcs.py --- src/databricks/labs/dqx/geo/check_funcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index 1c7f0452..c63064c1 100644 --- a/src/databricks/labs/dqx/geo/check_funcs.py +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -399,7 +399,7 @@ def has_dimension(column: str | Column, dimension: int) -> Column: return make_condition( condition, F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), - f"{col_str_norm}_does_not_have_the_required_dimension", + f"{col_str_norm}_does_not_have_required_geo_dimension", ) @register_rule("row") def has_x_coordinate_between(column: str | Column, min_value: float, max_value: float) -> Column: From 560feebfea4b5fa9abe7834171f08ab9b7b8fb53 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 13:03:27 +0200 Subject: [PATCH 29/47] Apply suggestion from @mwojtyczka --- tests/integration/test_row_checks_geo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_row_checks_geo.py b/tests/integration/test_row_checks_geo.py index f5f726fd..7986a353 100644 --- 
a/tests/integration/test_row_checks_geo.py +++ b/tests/integration/test_row_checks_geo.py @@ -34,7 +34,7 @@ def test_is_geometry(spark): actual = test_df.select(is_geometry("geom_string"), is_geometry("geom_binary"), is_geometry("geom_int")) - checked_schema = "geom_string_is_not_a_geometry: string, geom_binary_is_not_a_geometry: string, geom_int_is_not_a_geometry: string" + checked_schema = "geom_string_is_not_geometry: string, geom_binary_is_not_geometry: string, geom_int_is_not_geometry: string" expected = spark.createDataFrame( [ [None, None, None], From 7db6db94fbf2391d3e3c9d7d6ac07e98b3ee7cc2 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 13:03:35 +0200 Subject: [PATCH 30/47] Apply suggestion from @mwojtyczka --- tests/integration/test_row_checks_geo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_row_checks_geo.py b/tests/integration/test_row_checks_geo.py index 7986a353..d27a03f7 100644 --- a/tests/integration/test_row_checks_geo.py +++ b/tests/integration/test_row_checks_geo.py @@ -65,7 +65,7 @@ def test_is_geography(spark): is_geography("geography_string"), is_geography("geography_binary"), is_geography("geography_int") ) - checked_schema = "geography_string_is_not_a_geography: string, geography_binary_is_not_a_geography: string, geography_int_is_not_a_geography: string" + checked_schema = "geography_string_is_not_geography: string, geography_binary_is_not_geography: string, geography_int_is_not_geography: string" expected = spark.createDataFrame( [ [None, None, None], From 00270722f7a580b2a0f53dd0622f100eaadb2818 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 13:03:43 +0200 Subject: [PATCH 31/47] Apply suggestion from @mwojtyczka --- tests/integration/test_row_checks_geo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_row_checks_geo.py b/tests/integration/test_row_checks_geo.py index d27a03f7..990ab143 100644 --- a/tests/integration/test_row_checks_geo.py +++ b/tests/integration/test_row_checks_geo.py @@ -89,7 +89,7 @@ def test_is_point(spark): actual = test_df.select(is_point("geom")) - checked_schema = "geom_is_not_a_point: string" + checked_schema = "geom_is_not_point: string" expected = spark.createDataFrame( [ [None], From 80197d7db9a549023cd9b111f921893ad21581b4 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 13:03:52 +0200 Subject: [PATCH 32/47] Apply suggestion from @mwojtyczka --- tests/integration/test_row_checks_geo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_row_checks_geo.py b/tests/integration/test_row_checks_geo.py index 990ab143..67cbec78 100644 --- a/tests/integration/test_row_checks_geo.py +++ b/tests/integration/test_row_checks_geo.py @@ -112,7 +112,7 @@ def test_is_linestring(spark): actual = test_df.select(is_linestring("geom")) - checked_schema = "geom_is_not_a_linestring: string" + checked_schema = "geom_is_not_linestring: string" expected = spark.createDataFrame( [ [None], From beb321776db3342f94245a5b93d3461c01ae10a8 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 13:04:01 +0200 Subject: [PATCH 33/47] Apply suggestion from @mwojtyczka --- tests/integration/test_row_checks_geo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_row_checks_geo.py b/tests/integration/test_row_checks_geo.py index 67cbec78..fb4709c1 100644 --- a/tests/integration/test_row_checks_geo.py +++ 
b/tests/integration/test_row_checks_geo.py @@ -134,7 +134,7 @@ def test_is_polygon(spark): actual = test_df.select(is_polygon("geom")) - checked_schema = "geom_is_not_a_polygon: string" + checked_schema = "geom_is_not_polygon: string" expected = spark.createDataFrame( [ [None], From 945d03193339a18e53c826a6480fd542bf9ef938 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 13:04:09 +0200 Subject: [PATCH 34/47] Apply suggestion from @mwojtyczka --- tests/integration/test_row_checks_geo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_row_checks_geo.py b/tests/integration/test_row_checks_geo.py index fb4709c1..aad1111b 100644 --- a/tests/integration/test_row_checks_geo.py +++ b/tests/integration/test_row_checks_geo.py @@ -156,7 +156,7 @@ def test_is_multipoint(spark): actual = test_df.select(is_multipoint("geom")) - checked_schema = "geom_is_not_a_multipoint: string" + checked_schema = "geom_is_not_multipoint: string" expected = spark.createDataFrame( [ [None], From 0523d678f84341d725802592fbdcaca8b3652294 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 13:04:19 +0200 Subject: [PATCH 35/47] Apply suggestion from @mwojtyczka --- tests/integration/test_row_checks_geo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_row_checks_geo.py b/tests/integration/test_row_checks_geo.py index aad1111b..56601f7b 100644 --- a/tests/integration/test_row_checks_geo.py +++ b/tests/integration/test_row_checks_geo.py @@ -227,7 +227,7 @@ def test_is_geometrycollection(spark): actual = test_df.select(is_geometrycollection("geom")) - checked_schema = "geom_is_not_a_geometrycollection: string" + checked_schema = "geom_is_not_geometrycollection: string" expected = spark.createDataFrame( [ [None], From 0b69618c2c07481c1d0eb30c94c21665f9311f23 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 13:04:30 +0200 Subject: [PATCH 36/47] Apply suggestion from @mwojtyczka --- tests/integration/test_row_checks_geo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_row_checks_geo.py b/tests/integration/test_row_checks_geo.py index 56601f7b..7b1e0ff4 100644 --- a/tests/integration/test_row_checks_geo.py +++ b/tests/integration/test_row_checks_geo.py @@ -200,7 +200,7 @@ def test_is_multipolygon(spark): actual = test_df.select(is_multipolygon("geom")) - checked_schema = "geom_is_not_a_multipolygon: string" + checked_schema = "geom_is_not_multipolygon: string" expected = spark.createDataFrame( [ [None], From 7483b6ae8d7219d575d979a2769f39bddddae056 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 13:04:37 +0200 Subject: [PATCH 37/47] Apply suggestion from @mwojtyczka --- tests/integration/test_row_checks_geo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_row_checks_geo.py b/tests/integration/test_row_checks_geo.py index 7b1e0ff4..3f651320 100644 --- a/tests/integration/test_row_checks_geo.py +++ b/tests/integration/test_row_checks_geo.py @@ -178,7 +178,7 @@ def test_is_multilinestring(spark): actual = test_df.select(is_multilinestring("geom")) - checked_schema = "geom_is_not_a_multilinestring: string" + checked_schema = "geom_is_not_multilinestring: string" expected = spark.createDataFrame( [ [None], From 0daae5e95ffc4d7a31f9f01fb5f96b0005d66bc9 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 13:26:38 +0200 Subject: [PATCH 38/47] corrected tests, fmt --- 
src/databricks/labs/dqx/geo/check_funcs.py | 23 ++++++++++++++++++---- tests/integration/test_row_checks_geo.py | 14 ++++++++----- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index c63064c1..4b83420e 100644 --- a/src/databricks/labs/dqx/geo/check_funcs.py +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -11,6 +11,7 @@ MULTIPOLYGON_TYPE = "ST_MultiPolygon" GEOMETRYCOLLECTION_TYPE = "ST_GeometryCollection" + @register_rule("row") def is_latitude(column: str | Column) -> Column: """Checks whether the values in the input column are valid latitudes. @@ -55,6 +56,8 @@ def is_longitude(column: str | Column) -> Column: F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), f"{col_str_norm}_is_not_valid_longitude", ) + + @register_rule("row") def is_geometry(column: str | Column) -> Column: """Checks whether the values in the input column are valid geometries. @@ -344,6 +347,8 @@ def is_ogc_valid(column: str | Column) -> Column: F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), f"{col_str_norm}_is_not_valid_geometry", ) + + @register_rule("row") def is_non_empty_geometry(column: str | Column) -> Column: """Checks whether the values in the input column are empty geometries. @@ -372,6 +377,8 @@ def is_non_empty_geometry(column: str | Column) -> Column: F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), f"{col_str_norm}_is_empty_geometry", ) + + @register_rule("row") def has_dimension(column: str | Column, dimension: int) -> Column: """Checks whether the geometries/geographies in the input column have a given dimension. @@ -382,7 +389,7 @@ def has_dimension(column: str | Column, dimension: int) -> Column: Returns: Column object indicating whether the geometries/geographies in the input column have a given dimension - + Note: This function requires Databricks runtime 17.1 or above. """ @@ -401,6 +408,8 @@ def has_dimension(column: str | Column, dimension: int) -> Column: F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), f"{col_str_norm}_does_not_have_required_geo_dimension", ) + + @register_rule("row") def has_x_coordinate_between(column: str | Column, min_value: float, max_value: float) -> Column: """Checks whether the x coordinates of the geometries in the input column are between a given range. @@ -422,7 +431,9 @@ def has_x_coordinate_between(column: str | Column, min_value: float, max_value: # TODO: Above mentioned functions are not (yet) available. Replace with equivalent functions # when available in OSS PySpark. 
geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") - geom_type_cond = F.expr(f"st_xmax(try_to_geometry({col_str_norm})) > {max_value} OR st_xmin(try_to_geometry({col_str_norm})) < {min_value}") + geom_type_cond = F.expr( + f"st_xmax(try_to_geometry({col_str_norm})) > {max_value} OR st_xmin(try_to_geometry({col_str_norm})) < {min_value}" + ) condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond) condition_str = f"` in column `{col_expr_str}` has x coordinates outside the range [{min_value}, {max_value}]" @@ -431,6 +442,8 @@ def has_x_coordinate_between(column: str | Column, min_value: float, max_value: F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), f"{col_str_norm}_has_x_coordinates_outside_range", ) + + @register_rule("row") def has_y_coordinate_between(column: str | Column, min_value: float, max_value: float) -> Column: """Checks whether the y coordinates of the geometries in the input column are between a given range. @@ -452,7 +465,9 @@ def has_y_coordinate_between(column: str | Column, min_value: float, max_value: # TODO: Above mentioned functions are not (yet) available. Replace with equivalent functions # when available in OSS PySpark. geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") - geom_type_cond = F.expr(f"st_ymax(try_to_geometry({col_str_norm})) > {max_value} OR st_ymin(try_to_geometry({col_str_norm})) < {min_value}") + geom_type_cond = F.expr( + f"st_ymax(try_to_geometry({col_str_norm})) > {max_value} OR st_ymin(try_to_geometry({col_str_norm})) < {min_value}" + ) condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond) condition_str = f"` in column `{col_expr_str}` has y coordinates outside the range [{min_value}, {max_value}]" @@ -460,4 +475,4 @@ def has_y_coordinate_between(column: str | Column, min_value: float, max_value: condition, F.concat_ws("", F.lit("value `"), col_expr.cast("string"), F.lit(condition_str)), f"{col_str_norm}_has_y_coordinates_outside_range", - ) \ No newline at end of file + ) diff --git a/tests/integration/test_row_checks_geo.py b/tests/integration/test_row_checks_geo.py index 3f651320..dd2bf686 100644 --- a/tests/integration/test_row_checks_geo.py +++ b/tests/integration/test_row_checks_geo.py @@ -1,4 +1,3 @@ -import pytest from chispa.dataframe_comparer import assert_df_equality # type: ignore from databricks.labs.dqx.geo.check_funcs import ( has_dimension, @@ -34,7 +33,9 @@ def test_is_geometry(spark): actual = test_df.select(is_geometry("geom_string"), is_geometry("geom_binary"), is_geometry("geom_int")) - checked_schema = "geom_string_is_not_geometry: string, geom_binary_is_not_geometry: string, geom_int_is_not_geometry: string" + checked_schema = ( + "geom_string_is_not_geometry: string, geom_binary_is_not_geometry: string, geom_int_is_not_geometry: string" + ) expected = spark.createDataFrame( [ [None, None, None], @@ -340,8 +341,7 @@ def test_has_dimension(spark): ) actual = test_df.select(has_dimension("geom", 0)) - - checked_schema = "geom_does_not_have_required_dimension: string" + checked_schema = "geom_does_not_have_required_geo_dimension: string" expected = spark.createDataFrame( [ [None], @@ -352,6 +352,8 @@ def test_has_dimension(spark): checked_schema, ) assert_df_equality(actual, expected, ignore_nullable=True) + + def test_has_x_coordinate_between(spark): input_schema = "geom: string" test_df = spark.createDataFrame( @@ -373,6 +375,8 @@ def test_has_x_coordinate_between(spark): ) assert_df_equality(actual, 
expected, ignore_nullable=True) + + def test_has_y_coordinate_between(spark): input_schema = "geom: string" test_df = spark.createDataFrame( @@ -393,4 +397,4 @@ def test_has_y_coordinate_between(spark): checked_schema, ) - assert_df_equality(actual, expected, ignore_nullable=True) \ No newline at end of file + assert_df_equality(actual, expected, ignore_nullable=True) From 0e4b4fa4e40b39dfdafa348ede28f6cdf92890e0 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 13:28:29 +0200 Subject: [PATCH 39/47] remove todos --- src/databricks/labs/dqx/geo/check_funcs.py | 28 ---------------------- 1 file changed, 28 deletions(-) diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index 4b83420e..81c3ffe8 100644 --- a/src/databricks/labs/dqx/geo/check_funcs.py +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -74,8 +74,6 @@ def is_geometry(column: str | Column) -> Column: col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in # Databricks SQL, due to the use of the `try_to_geometry` function. - # TODO: `pyspark.sql.functions.try_to_geometry` is not (yet) available. Replace with - # `pyspark.sql.functions.try_to_geometry` when available in OSS PySpark. geometry_col = F.expr(f"try_to_geometry({col_str_norm})") condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geometry_col.isNull()) condition_str = f"` in column `{col_expr_str}` is not a geometry" @@ -103,8 +101,6 @@ def is_geography(column: str | Column) -> Column: col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in # Databricks SQL, due to the use of the `try_to_geography` function. - # TODO: `pyspark.sql.functions.try_to_geography` is not (yet) available. Replace with - # `pyspark.sql.functions.try_to_geography` when available in OSS PySpark. geometry_col = F.expr(f"try_to_geography({col_str_norm})") condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geometry_col.isNull()) condition_str = f"` in column `{col_expr_str}` is not a geography" @@ -132,8 +128,6 @@ def is_point(column: str | Column) -> Column: col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in # Databricks SQL, due to the use of the `try_to_geometry` and `st_geometrytype` functions. - # TODO: Above mentioned functions are not (yet) available. Replace with equivalent functions - # when available in OSS PySpark. geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") geom_type_cond = F.expr(f"st_geometrytype(try_to_geometry({col_str_norm})) <> '{POINT_TYPE}'") condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond) @@ -161,8 +155,6 @@ def is_linestring(column: str | Column) -> Column: col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in # Databricks SQL, due to the use of the `try_to_geometry` and `st_geometrytype` functions. - # TODO: Above mentioned functions are not (yet) available. Replace with equivalent functions - # when available in OSS PySpark. 
geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") geom_type_cond = F.expr(f"st_geometrytype(try_to_geometry({col_str_norm})) <> '{LINESTRING_TYPE}'") condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond) @@ -190,8 +182,6 @@ def is_polygon(column: str | Column) -> Column: col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in # Databricks SQL, due to the use of the `try_to_geometry` and `st_geometrytype` functions. - # TODO: Above mentioned functions are not (yet) available. Replace with equivalent functions - # when available in OSS PySpark. geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") geom_type_cond = F.expr(f"st_geometrytype(try_to_geometry({col_str_norm})) <> '{POLYGON_TYPE}'") condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond) @@ -219,8 +209,6 @@ def is_multipoint(column: str | Column) -> Column: col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in # Databricks SQL, due to the use of the `try_to_geometry` and `st_geometrytype` functions. - # TODO: Above mentioned functions are not (yet) available. Replace with equivalent functions - # when available in OSS PySpark. geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") geom_type_cond = F.expr(f"st_geometrytype(try_to_geometry({col_str_norm})) <> '{MULTIPOINT_TYPE}'") condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond) @@ -248,8 +236,6 @@ def is_multilinestring(column: str | Column) -> Column: col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in # Databricks SQL, due to the use of the `try_to_geometry` and `st_geometrytype` functions. - # TODO: Above mentioned functions are not (yet) available. Replace with equivalent functions - # when available in OSS PySpark. geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") geom_type_cond = F.expr(f"st_geometrytype(try_to_geometry({col_str_norm})) <> '{MULTILINESTRING_TYPE}'") condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond) @@ -277,8 +263,6 @@ def is_multipolygon(column: str | Column) -> Column: col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in # Databricks SQL, due to the use of the `try_to_geometry` and `st_geometrytype` functions. - # TODO: Above mentioned functions are not (yet) available. Replace with equivalent functions - # when available in OSS PySpark. geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") geom_type_cond = F.expr(f"st_geometrytype(try_to_geometry({col_str_norm})) <> '{MULTIPOLYGON_TYPE}'") condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond) @@ -306,8 +290,6 @@ def is_geometrycollection(column: str | Column) -> Column: col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in # Databricks SQL, due to the use of the `try_to_geometry` and `st_geometrytype` functions. - # TODO: Above mentioned functions are not (yet) available. 
Replace with equivalent functions - # when available in OSS PySpark. geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") geom_type_cond = F.expr(f"st_geometrytype(try_to_geometry({col_str_norm})) <> '{GEOMETRYCOLLECTION_TYPE}'") condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond) @@ -335,8 +317,6 @@ def is_ogc_valid(column: str | Column) -> Column: col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in # Databricks SQL, due to the use of the `try_to_geometry` and `st_isvalid` functions. - # TODO: Above mentioned functions are not (yet) available. Replace with equivalent functions - # when available in OSS PySpark. geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") geom_type_cond = F.expr(f"NOT st_isvalid(try_to_geometry({col_str_norm}))") condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond) @@ -365,8 +345,6 @@ def is_non_empty_geometry(column: str | Column) -> Column: col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in # Databricks SQL, due to the use of the `try_to_geometry` and `st_isempty` functions. - # TODO: Above mentioned functions are not (yet) available. Replace with equivalent functions - # when available in OSS PySpark. geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") geom_type_cond = F.expr(f"st_isempty(try_to_geometry({col_str_norm}))") condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond) @@ -396,8 +374,6 @@ def has_dimension(column: str | Column, dimension: int) -> Column: col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in # Databricks SQL, due to the use of the `try_to_geometry` and `st_dimension` functions. - # TODO: Above mentioned functions are not (yet) available. Replace with equivalent functions - # when available in OSS PySpark. geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") geom_type_cond = F.expr(f"st_dimension(try_to_geometry({col_str_norm})) <> {dimension}") condition = F.when(col_expr.isNull(), F.lit(None)).otherwise(geom_cond | geom_type_cond) @@ -428,8 +404,6 @@ def has_x_coordinate_between(column: str | Column, min_value: float, max_value: col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in # Databricks SQL, due to the use of the `try_to_geometry`, `st_xmax` and `st_xmin` functions. - # TODO: Above mentioned functions are not (yet) available. Replace with equivalent functions - # when available in OSS PySpark. geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") geom_type_cond = F.expr( f"st_xmax(try_to_geometry({col_str_norm})) > {max_value} OR st_xmin(try_to_geometry({col_str_norm})) < {min_value}" @@ -462,8 +436,6 @@ def has_y_coordinate_between(column: str | Column, min_value: float, max_value: col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in # Databricks SQL, due to the use of the `try_to_geometry`, `st_ymax` and `st_ymin` functions. - # TODO: Above mentioned functions are not (yet) available. 
Replace with equivalent functions - # when available in OSS PySpark. geom_cond = F.expr(f"try_to_geometry({col_str_norm}) IS NULL") geom_type_cond = F.expr( f"st_ymax(try_to_geometry({col_str_norm})) > {max_value} OR st_ymin(try_to_geometry({col_str_norm})) < {min_value}" From 592221197fc3902092578946bbbcfa03b5648fc5 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Tue, 30 Sep 2025 14:10:42 +0200 Subject: [PATCH 40/47] check if runtime is geo compatible --- tests/integration/conftest.py | 39 ++++++++++++++++++++++++ tests/integration/test_row_checks_geo.py | 28 ++++++++--------- 2 files changed, 53 insertions(+), 14 deletions(-) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 4e36ec60..75b8f63d 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -1,4 +1,5 @@ import logging +import re from datetime import datetime, timezone from unittest.mock import patch from pyspark.sql import DataFrame @@ -272,3 +273,41 @@ def contains_expected_workflows(workflows, state): if all(item in workflow.items() for item in state.items()): return True return False + + +@pytest.fixture +def skip_if_runtime_not_geo_compatible(ws, debug_env): + """ + Skip the test if the cluster runtime does not support the required geo functions, i.e. + * serverless clusters have the required geo functions + * standard clusters require runtime 17.1 or above + + Args: + ws (WorkspaceClient): Workspace client to interact with Databricks. + debug_env (dict): Test environment variables. + """ + if "DATABRICKS_SERVERLESS_COMPUTE_ID" in debug_env: + return # serverless clusters have the required geo functions + + # standard clusters require runtime 17.1 or above + cluster_id = debug_env.get("DATABRICKS_CLUSTER_ID") + if not cluster_id: + raise ValueError("DATABRICKS_CLUSTER_ID is not set in debug_env") + + # Fetch cluster details + cluster_info = ws.clusters.get(cluster_id) + runtime_version = cluster_info.spark_version + + if not runtime_version: + raise ValueError(f"Unable to retrieve runtime version for cluster {cluster_id}") + + # Extract major and minor version numbers + match = re.match(r"(\d+)\.(\d+)", runtime_version) + if not match: + raise ValueError(f"Invalid runtime version format: {runtime_version}") + + major, minor = [int(x) for x in match.groups()] + valid = major > 17 or (major == 17 and minor >= 1) + + if not valid: + pytest.skip("This test requires a cluster with runtime 17.1 or above") diff --git a/tests/integration/test_row_checks_geo.py b/tests/integration/test_row_checks_geo.py index dd2bf686..a12ebceb 100644 --- a/tests/integration/test_row_checks_geo.py +++ b/tests/integration/test_row_checks_geo.py @@ -19,7 +19,7 @@ ) -def test_is_geometry(spark): +def test_is_geometry(skip_if_runtime_not_geo_compatible, spark): input_schema = "geom_string: string, geom_binary: binary, geom_int: int" test_df = spark.createDataFrame( [ @@ -49,7 +49,7 @@ def test_is_geometry(spark): assert_df_equality(actual, expected, ignore_nullable=True) -def test_is_geography(spark): +def test_is_geography(skip_if_runtime_not_geo_compatible, spark): input_schema = "geography_string: string, geography_binary: binary, geography_int: int" test_df = spark.createDataFrame( [ @@ -81,7 +81,7 @@ def test_is_geography(spark): assert_df_equality(actual, expected, ignore_nullable=True) -def test_is_point(spark): +def test_is_point(skip_if_runtime_not_geo_compatible, spark): input_schema = "geom: string" test_df = spark.createDataFrame( [["POINT(1 1)"], ["nonsense"], ["POLYGON((1 1, 2 2, 
3 3, 1 1))"], [None]], @@ -104,7 +104,7 @@ def test_is_point(spark): assert_df_equality(actual, expected, ignore_nullable=True) -def test_is_linestring(spark): +def test_is_linestring(skip_if_runtime_not_geo_compatible, spark): input_schema = "geom: string" test_df = spark.createDataFrame( [["LINESTRING(1 1, 2 2)"], ["nonsense"], ["POLYGON((1 1, 2 2, 3 3, 1 1))"], [None]], @@ -126,7 +126,7 @@ def test_is_linestring(spark): assert_df_equality(actual, expected, ignore_nullable=True) -def test_is_polygon(spark): +def test_is_polygon(skip_if_runtime_not_geo_compatible, spark): input_schema = "geom: string" test_df = spark.createDataFrame( [["POLYGON((1 1, 2 2, 3 3, 1 1))"], ["nonsense"], ["LINESTRING(1 1, 2 2)"], [None]], @@ -148,7 +148,7 @@ def test_is_polygon(spark): assert_df_equality(actual, expected, ignore_nullable=True) -def test_is_multipoint(spark): +def test_is_multipoint(skip_if_runtime_not_geo_compatible, spark): input_schema = "geom: string" test_df = spark.createDataFrame( [["MULTIPOINT(1 1, 2 2)"], ["nonsense"], ["LINESTRING(1 1, 2 2)"], [None]], @@ -170,7 +170,7 @@ def test_is_multipoint(spark): assert_df_equality(actual, expected, ignore_nullable=True) -def test_is_multilinestring(spark): +def test_is_multilinestring(skip_if_runtime_not_geo_compatible, spark): input_schema = "geom: string" test_df = spark.createDataFrame( [["MULTILINESTRING((1 1, 2 2), (3 3, 4 4))"], ["nonsense"], ["POLYGON((1 1, 2 2, 3 3, 1 1))"], [None]], @@ -192,7 +192,7 @@ def test_is_multilinestring(spark): assert_df_equality(actual, expected, ignore_nullable=True) -def test_is_multipolygon(spark): +def test_is_multipolygon(skip_if_runtime_not_geo_compatible, spark): input_schema = "geom: string" test_df = spark.createDataFrame( [["MULTIPOLYGON(((1 1, 2 2, 3 3, 1 1)))"], ["nonsense"], ["LINESTRING(1 1, 2 2)"], [None]], @@ -214,7 +214,7 @@ def test_is_multipolygon(spark): assert_df_equality(actual, expected, ignore_nullable=True) -def test_is_geometrycollection(spark): +def test_is_geometrycollection(skip_if_runtime_not_geo_compatible, spark): input_schema = "geom: string" test_df = spark.createDataFrame( [ @@ -241,7 +241,7 @@ def test_is_geometrycollection(spark): assert_df_equality(actual, expected, ignore_nullable=True) -def test_is_ogc_valid(spark): +def test_is_ogc_valid(skip_if_runtime_not_geo_compatible, spark): input_schema = "geom: string" test_df = spark.createDataFrame( [["POLYGON((0 0,10 0,0 10,0 0))"], ["nonsense"], ["POLYGON((0 0,10 10,10 0,0 10,0 0))"], [None]], @@ -311,7 +311,7 @@ def test_is_latitude(spark): assert_df_equality(actual, expected, ignore_nullable=True) -def test_is_non_empty_geometry(spark): +def test_is_non_empty_geometry(skip_if_runtime_not_geo_compatible, spark): input_schema = "geom: string" test_df = spark.createDataFrame( [["POINT(1 1)"], ["nonsense"], ["POLYGON EMPTY"], [None]], @@ -333,7 +333,7 @@ def test_is_non_empty_geometry(spark): assert_df_equality(actual, expected, ignore_nullable=True) -def test_has_dimension(spark): +def test_has_dimension(skip_if_runtime_not_geo_compatible, spark): input_schema = "geom: string" test_df = spark.createDataFrame( [["POINT(1 1)"], ["nonsense"], ["POLYGON((0 0, 2 0, 0 2, 0 0))"], [None]], @@ -354,7 +354,7 @@ def test_has_dimension(spark): assert_df_equality(actual, expected, ignore_nullable=True) -def test_has_x_coordinate_between(spark): +def test_has_x_coordinate_between(skip_if_runtime_not_geo_compatible, spark): input_schema = "geom: string" test_df = spark.createDataFrame( [["POINT(1 1)"], ["nonsense"], ["POLYGON((0 0, 2 0, 
0 2, 0 0))"], [None]], @@ -377,7 +377,7 @@ def test_has_x_coordinate_between(spark): assert_df_equality(actual, expected, ignore_nullable=True) -def test_has_y_coordinate_between(spark): +def test_has_y_coordinate_between(skip_if_runtime_not_geo_compatible, spark): input_schema = "geom: string" test_df = spark.createDataFrame( [["POINT(1 1)"], ["nonsense"], ["POLYGON((0 0, 2 0, 0 2, 0 0))"], [None]], From 96f08233ce299c8e95051d986806eabdcc13b1ce Mon Sep 17 00:00:00 2001 From: Tim Dikland Date: Thu, 2 Oct 2025 15:13:16 +0200 Subject: [PATCH 41/47] expanded integration tests --- docs/dqx/docs/reference/quality_checks.mdx | 248 ++++++++++++++++ src/databricks/labs/dqx/checks_resolver.py | 3 + tests/integration/test_apply_checks.py | 324 ++++++++++++++++++++- tests/perf/test_apply_checks.py | 25 ++ tests/resources/all_row_checks.yaml | 152 ++++++++-- 5 files changed, 730 insertions(+), 22 deletions(-) diff --git a/docs/dqx/docs/reference/quality_checks.mdx b/docs/dqx/docs/reference/quality_checks.mdx index abcf4d0b..71da7b94 100644 --- a/docs/dqx/docs/reference/quality_checks.mdx +++ b/docs/dqx/docs/reference/quality_checks.mdx @@ -48,6 +48,22 @@ You can also define your own custom checks (see [Creating custom checks](#creati | `sql_expression` | Checks whether the values meet the condition provided as an SQL expression, e.g. `a = 'str1' and a > b`. SQL expressions are evaluated at runtime, so ensure that the expression is safe and that functions used within it (e.g. h3_ischildof, division) do not throw exceptions. You can achieve this by validating input arguments or columns beforehand using guards such as CASE WHEN, IS NOT NULL, RLIKE, or type try casts. | `expression`: sql expression to check on a DataFrame (fail the check if expression evaluates to True, pass if it evaluates to False); `msg`: optional message to output; `name`: optional name of the resulting column (it can be overwritten by `name` specified at the check level); `negate`: if the condition should be negated; `columns`: optional list of columns to be used for reporting and as name prefix if name not provided, unused in the actual logic | | `is_data_fresh` | Checks whether the values in the input timestamp column are not older than the specified number of minutes from the base timestamp column. This is useful for identifying stale data due to delayed pipelines and helps catch upstream issues early. | `column`: column of type timestamp/date to check (can be a string column name or a column expression); `max_age_minutes`: maximum age in minutes before data is considered stale; `base_timestamp`: optional base timestamp column from which the stale check is calculated. This can be a string, column expression, datetime value or literal value ex:F.lit(datetime(2024,1,1)). If not provided current_timestamp() function is used | | `does_not_contain_pii` | Checks whether the values in the input column contain Personally Identifiable Information (PII). Uses Microsoft Presidio to detect various named entities (e.g. PERSON, ADDRESS, EMAIL_ADDRESS). Requires installation of PII detection extras: `pip install 'databricks-labs-dqx[pii-detection]'`. See more details [here](#detecting-personally-identifiable-information-pii). 
| `column`: column to check (can be a string column name or a column expression); `threshold`: confidence threshold for PII detection (0.0 to 1.0, default: 0.7); `language`: optional language of the text (default: 'en'); `entities`: optional list of entities to detect; `nlp_engine_config`: optional dictionary configuring the NLP engine used for PII detection, see the [Presidio documentation](https://microsoft.github.io/presidio/analyzer/customizing_nlp_models/) for more information | +| `is_latitude` | Checks whether the values in the input column are valid latitude values (i.e. between -90 and 90 degrees) | `column`: column to check (can be a string column name or a column expression) | +| `is_longitude` | Checks whether the values in the input column are valid longitude values (i.e. between -180 and 180 degrees) | `column`: column to check (can be a string column name or a column expression) | +| `is_geometry` | Checks whether the values in the input column are valid geometries. | `column`: column to check (can be a string column name or a column expression) | +| `is_geography` | Checks whether the values in the input column are valid geographies. | `column`: column to check (can be a string column name or a column expression) | +| `is_point` | Checks whether the values in the input column are point geometries/geographies. | `column`: column to check (can be a string column name or a column expression) | +| `is_linestring` | Checks whether the values in the input column are linestring geometries/geographies. | `column`: column to check (can be a string column name or a column expression) | +| `is_polygon` | Checks whether the values in the input column are polygon geometries/geographies. | `column`: column to check (can be a string column name or a column expression) | +| `is_multipoint` | Checks whether the values in the input column are multipoint geometries/geographies. | `column`: column to check (can be a string column name or a column expression) | +| `is_multilinestring` | Checks whether the values in the input column are multilinestring geometries/geographies. | `column`: column to check (can be a string column name or a column expression) | +| `is_multipolygon` | Checks whether the values in the input column are multipolygon geometries/geographies. | `column`: column to check (can be a string column name or a column expression) | +| `is_geometrycollection` | Checks whether the values in the input column are geometrycollection geometries/geographies. | `column`: column to check (can be a string column name or a column expression) | +| `is_ogc_valid` | Checks whether the values in the input column are valid geometries in the OGC sense. I.e a bowtie polygon is invalid because it has a self intersection. | `column`: column to check (can be a string column name or a column expression) | +| `is_non_empty_geometry` | Checks whether the values in the input column are non-empty geometries. | `column`: column to check (can be a string column name or a column expression) | +| `has_dimension` | Checks whether the values in the input column are geometries of the specified dimension (2D projected dimension). | `column`: column to check (can be a string column name or a column expression); `dimension`: dimension to check | +| `has_x_coordinate_between` | Checks whether the values in the input column are geometries with x coordinate between the provided boundaries. 
| `column`: column to check (can be a string column name or a column expression); `min_value`: minimum value; `max_value`: maximum value | +| `has_y_coordinate_between` | Checks whether the values in the input column are geometries with y coordinate between the provided boundaries. | `column`: column to check (can be a string column name or a column expression); `min_value`: minimum value; `max_value`: maximum value | | `column`: column to check (can be a string column name or a column expression); `min_value`: minimum value; `max_value`: maximum value | @@ -471,6 +487,123 @@ For brevity, the `name` field in the examples is omitted and it will be auto-gen for_each_column: # apply the check for each column in the list - col3 - col5 + +# is_latitude check +- criticality: error + check: + function: is_latitude + arguments: + column: col2 + +# is_longitude check +- criticality: error + check: + function: is_longitude + arguments: + column: col2 + +# is_geometry check +- criticality: error + check: + function: is_geometry + arguments: + column: point_geom + +# is_geography check +- criticality: error + check: + function: is_geography + arguments: + column: point_geom + +# is_point check +- criticality: error + check: + function: is_point + arguments: + column: point_geom + +# is_linestring check +- criticality: error + check: + function: is_linestring + arguments: + column: linestring_geom + +# is_polygon check +- criticality: error + check: + function: is_polygon + arguments: + column: polygon_geom + +# is_multipoint check +- criticality: error + check: + function: is_multipoint + arguments: + column: multipoint_geom + +# is_multilinestring check +- criticality: error + check: + function: is_multilinestring + arguments: + column: multilinestring_geom + +# is_multipolygon check +- criticality: error + check: + function: is_multipolygon + arguments: + column: multipolygon_geom + +# is_geometrycollection check +- criticality: error + check: + function: is_geometrycollection + arguments: + column: geometrycollection_geom + +# is_ogc_valid check +- criticality: error + check: + function: is_ogc_valid + arguments: + column: point_geom + +# is_non_empty_geometry check +- criticality: error + check: + function: is_non_empty_geometry + arguments: + column: point_geom + +# has_dimension check +- criticality: error + check: + function: has_dimension + arguments: + column: polygon_geom + dimension: 2 + +# has_x_coordinate_between check +- criticality: error + check: + function: has_x_coordinate_between + arguments: + column: polygon_geom + min_value: 0.0 + max_value: 10.0 + +# has_y_coordinate_between check +- criticality: error + check: + function: has_y_coordinate_between + arguments: + column: polygon_geom + min_value: 0.0 + max_value: 10.0 ``` @@ -817,6 +950,121 @@ checks = [ } ), + # is_latitude check + DQRowRule( + criticality="error", + check_func=check_funcs.is_latitude, + column="col2" + ), + + # is_longitude check + DQRowRule( + criticality="error", + check_func=check_funcs.is_longitude, + column="col2" + ), + + # is_geometry check + DQRowRule( + criticality="error", + check_func=check_funcs.is_geometry, + column="point_geom" + ), + + # is_geography check + DQRowRule( + criticality="error", + check_func=check_funcs.is_geography, + column="point_geom" + ), + + # is_point check + DQRowRule( + criticality="error", + check_func=check_funcs.is_point, + column="point_geom" + ), + + # is_linestring check + DQRowRule( + criticality="error", + check_func=check_funcs.is_linestring, + 
column="linestring_geom" + ), + + # is_polygon check + DQRowRule( + criticality="error", + check_func=check_funcs.is_polygon, + column="polygon_geom" + ), + + # is_multipoint check + DQRowRule( + criticality="error", + check_func=check_funcs.is_multipoint, + column="multipoint_geom" + ), + + # is_multilinestring check + DQRowRule( + criticality="error", + check_func=check_funcs.is_multilinestring, + column="multilinestring_geom" + ), + + # is_multipolygon check + DQRowRule( + criticality="error", + check_func=check_funcs.is_multipolygon, + column="multipolygon_geom" + ), + + # is_geometrycollection check + DQRowRule( + criticality="error", + check_func=check_funcs.is_geometrycollection, + column="geometrycollection_geom" + ), + + # is_ogc_valid check + DQRowRule( + criticality="error", + check_func=check_funcs.is_ogc_valid, + column="point_geom" + ), + + # is_non_empty_geometry check + DQRowRule( + criticality="error", + check_func=check_funcs.is_non_empty_geometry, + column="point_geom" + ), + + # has_dimension check + DQRowRule( + criticality="error", + check_func=check_funcs.has_dimension, + column="polygon_geom" + check_func_kwargs={"dimension": 2} + ), + + # has_x_coordinate_between check + DQRowRule( + criticality="error", + check_func=check_funcs.has_x_coordinate_between, + column="polygon_geom" + check_func_kwargs={"min_value": 0.0, "max_value": 10.0} + ), + + # has_y_coordinate_between check + DQRowRule( + criticality="error", + check_func=check_funcs.has_y_coordinate_between, + column="polygon_geom" + check_func_kwargs={"min_value": 0.0, "max_value": 10.0} + ), + # sql_expression check DQRowRule( criticality="error", diff --git a/src/databricks/labs/dqx/checks_resolver.py b/src/databricks/labs/dqx/checks_resolver.py index d340a59f..1e08ca54 100644 --- a/src/databricks/labs/dqx/checks_resolver.py +++ b/src/databricks/labs/dqx/checks_resolver.py @@ -6,6 +6,7 @@ from contextlib import contextmanager from databricks.labs.dqx import check_funcs +from databricks.labs.dqx.geo import check_funcs as geo_check_funcs from databricks.labs.dqx.errors import InvalidCheckError logger = logging.getLogger(__name__) @@ -30,6 +31,8 @@ def resolve_check_function( """ logger.debug(f"Resolving function: {function_name}") func = getattr(check_funcs, function_name, None) # resolve using predefined checks first + if not func: + func = getattr(geo_check_funcs, function_name, None) # resolve using prefedined geo checks if not func and custom_check_functions: func = custom_check_functions.get(function_name) # returns None if not found if fail_on_missing and not func: diff --git a/tests/integration/test_apply_checks.py b/tests/integration/test_apply_checks.py index 8964df83..fc46a4eb 100644 --- a/tests/integration/test_apply_checks.py +++ b/tests/integration/test_apply_checks.py @@ -22,6 +22,7 @@ ) from databricks.labs.dqx.schema import dq_result_schema from databricks.labs.dqx import check_funcs +import databricks.labs.dqx.geo.check_funcs as geo_check_funcs from tests.integration.conftest import REPORTING_COLUMNS, RUN_TIME, EXTRA_PARAMS @@ -4469,7 +4470,9 @@ def test_apply_checks_all_row_checks_as_yaml_with_streaming(ws, make_schema, mak schema = ( "col1: string, col2: int, col3: int, col4 array, col5: date, col6: timestamp, " "col7: map, col8: struct, col10: int, col11: string, " - "col_ipv4: string, col_ipv6: string" + "col_ipv4: string, col_ipv6: string, point_geom: string, linestring_geom: string, " + "polygon_geom: string, multipoint_geom: string, multilinestring_geom: string, " + "multipolygon_geom: 
string, geometrycollection_geom: string" ) test_df = spark.createDataFrame( [ @@ -4486,6 +4489,13 @@ def test_apply_checks_all_row_checks_as_yaml_with_streaming(ws, make_schema, mak "val2", "192.168.1.1", "2001:0db8:85a3:08d3:1319:8a2e:0370:7344", + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESSTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", ], [ "val2", @@ -4500,6 +4510,13 @@ def test_apply_checks_all_row_checks_as_yaml_with_streaming(ws, make_schema, mak "val2", "192.168.1.2", "2001:0db8:85a3:08d3:ffff:ffff:ffff:ffff", + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESSTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", ], [ "val3", @@ -4514,6 +4531,13 @@ def test_apply_checks_all_row_checks_as_yaml_with_streaming(ws, make_schema, mak "val2", "192.168.1.3", "2001:db8:85a3:8d3:1319:8a2e:3.112.115.68", + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESSTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", ], ], schema, @@ -4552,8 +4576,15 @@ def test_apply_checks_all_row_checks_as_yaml_with_streaming(ws, make_schema, mak "val2", "192.168.1.1", "2001:0db8:85a3:08d3:1319:8a2e:0370:7344", + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESSTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", None, - None, + None ], [ "val2", @@ -4568,6 +4599,13 @@ def test_apply_checks_all_row_checks_as_yaml_with_streaming(ws, make_schema, mak "val2", "192.168.1.2", "2001:0db8:85a3:08d3:ffff:ffff:ffff:ffff", + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESSTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", None, None, ], @@ -4584,6 +4622,13 @@ def test_apply_checks_all_row_checks_as_yaml_with_streaming(ws, make_schema, mak "val2", "192.168.1.3", "2001:db8:85a3:8d3:1319:8a2e:3.112.115.68", + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESSTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", None, None, ], @@ -4615,7 +4660,9 @@ def test_apply_checks_all_checks_as_yaml(ws, spark): schema = ( "col1: string, col2: int, col3: int, col4 array, col5: date, col6: timestamp, " "col7: map, col8: struct, col10: int, col11: string, " - "col_ipv4: string, col_ipv6: string" + "col_ipv4: string, col_ipv6: string, point_geom: string, linestring_geom: string, " + "polygon_geom: string, multipoint_geom: string, multilinestring_geom: string, " + "multipolygon_geom: string, geometrycollection_geom: string" ) test_df = spark.createDataFrame( [ @@ -4632,6 +4679,13 @@ def test_apply_checks_all_checks_as_yaml(ws, spark): "val2", "192.168.1.0", 
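# A minimal sketch of how the checks_resolver change above behaves, assuming
# resolve_check_function can be called with only a function name (custom check
# functions and fail_on_missing left at their defaults):
from databricks.labs.dqx.checks_resolver import resolve_check_function

point_check = resolve_check_function("is_point")    # resolved from databricks.labs.dqx.geo.check_funcs
null_check = resolve_check_function("is_not_null")  # still resolved from databricks.labs.dqx.check_funcs first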
"2001:0db8:85a3:08d3:0000:0000:0000:0001", + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESSTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", ], [ "val2", @@ -4646,6 +4700,13 @@ def test_apply_checks_all_checks_as_yaml(ws, spark): "val2", "192.168.1.1", "2001:0db8:85a3:08d3:0000:0000:0000:1", + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESSTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", ], [ "val3", @@ -4660,6 +4721,13 @@ def test_apply_checks_all_checks_as_yaml(ws, spark): "val2", "192.168.1.2", "2001:0db8:85a3:08d3:0000::2", + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESSTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", ], ], schema, @@ -4686,6 +4754,13 @@ def test_apply_checks_all_checks_as_yaml(ws, spark): "val2", "192.168.1.0", "2001:0db8:85a3:08d3:0000:0000:0000:0001", + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESSTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", None, None, ], @@ -4702,6 +4777,13 @@ def test_apply_checks_all_checks_as_yaml(ws, spark): "val2", "192.168.1.1", "2001:0db8:85a3:08d3:0000:0000:0000:1", + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESSTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", None, None, ], @@ -4718,6 +4800,13 @@ def test_apply_checks_all_checks_as_yaml(ws, spark): "val2", "192.168.1.2", "2001:0db8:85a3:08d3:0000::2", + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESSTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", None, None, ], @@ -5315,6 +5404,188 @@ def test_apply_checks_all_checks_using_classes(ws, spark): user_metadata={"tag1": "value9", "tag2": "036"}, check_func_kwargs={"cidr_block": "2001:0db8:85a3:08d3:0000:0000:0000:0000/64"}, ), + # is_latitude check + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_latitude, + column="col2", + ), + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_latitude, + column=F.col("col2"), + ), + # is_longitude check + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_longitude, + column="col2", + ), + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_longitude, + column=F.col("col2"), + ), + # is_geometry check + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_geometry, + column="point_geom", + ), + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_geometry, + column=F.col("point_geom"), + ), + # is_geography check + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_geography, + column="point_geom", + ), + 
DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_geography, + column=F.col("point_geom"), + ), + # is_point check + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_point, + column="point_geom", + ), + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_point, + column=F.col("point_geom"), + ), + # is_linestring check + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_linestring, + column="linestring_geom", + ), + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_linestring, + column=F.col("linestring_geom"), + ), + # is_polygon check + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_polygon, + column="polygon_geom", + ), + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_polygon, + column=F.col("polygon_geom"), + ), + # is_multipoint check + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_multipoint, + column="multipoint_geom", + ), + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_multipoint, + column=F.col("multipoint_geom"), + ), + # is_multilinestring check + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_multilinestring, + column="multilinestring_geom", + ), + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_multilinestring, + column=F.col("multilinestring_geom"), + ), + # is_multipolygon check + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_multipolygon, + column="multipolygon_geom", + ), + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_multipolygon, + column=F.col("multipolygon_geom"), + ), + # is_geometrycollection check + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_geometrycollection, + column="geometrycollection_geom", + ), + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_geometrycollection, + column=F.col("geometrycollection_geom"), + ), + # is_ogc_valid check + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_ogc_valid, + column="point_geom", + ), + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_ogc_valid, + column=F.col("point_geom"), + ), + # is_non_empty_geometry check + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_non_empty_geometry, + column="point_geom", + ), + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_non_empty_geometry, + column=F.col("point_geom"), + ), + # has_dimension check + DQRowRule( + criticality="error", + check_func=geo_check_funcs.has_dimension, + column="polygon_geom", + check_func_kwargs={"dimension": 2}, + ), + DQRowRule( + criticality="error", + check_func=geo_check_funcs.has_dimension, + column=F.col("polygon_geom"), + check_func_kwargs={"dimension": 2}, + ), + # has_x_coordinate_between check + DQRowRule( + criticality="error", + check_func=geo_check_funcs.has_x_coordinate_between, + column="polygon_geom", + check_func_kwargs={"min_value": 0, "max_value": 10}, + ), + DQRowRule( + criticality="error", + check_func=geo_check_funcs.has_x_coordinate_between, + column=F.col("polygon_geom"), + check_func_kwargs={"min_value": 0, "max_value": 10}, + ), + # has_y_coordinate_between check + DQRowRule( + criticality="error", + check_func=geo_check_funcs.has_y_coordinate_between, + column="polygon_geom", + check_func_kwargs={"min_value": 0, "max_value": 10}, + ), + DQRowRule( + criticality="error", + check_func=geo_check_funcs.has_y_coordinate_between, + column=F.col("polygon_geom"), + check_func_kwargs={"min_value": 0, 
"max_value": 10}, + ), # is_data_fresh check DQRowRule( criticality="error", @@ -5336,7 +5607,9 @@ def test_apply_checks_all_checks_using_classes(ws, spark): schema = ( "col1: string, col2: int, col3: int, col4 array, col5: date, col6: timestamp, " "col7: map, col8: struct, col10: int, col11: string, " - "col_ipv4: string, col_ipv6: string" + "col_ipv4: string, col_ipv6: string, point_geom: string, linestring_geom: string, " + "polygon_geom: string, multipoint_geom: string, multilinestring_geom: string, " + "multipolygon_geom: string, geometrycollection_geom: string" ) test_df = spark.createDataFrame( [ @@ -5353,6 +5626,13 @@ def test_apply_checks_all_checks_using_classes(ws, spark): "val2", "255.255.255.255", "2001:0db8:85a3:08d3:1319:8a2e:0370:7344", + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESSTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", ], [ "val2", @@ -5367,6 +5647,14 @@ def test_apply_checks_all_checks_using_classes(ws, spark): "val2", "255.255.255.1", "2001:0db8:85a3:08d3:ffff:ffff:ffff:ffff", + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESSTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", + ], [ "val3", @@ -5381,6 +5669,13 @@ def test_apply_checks_all_checks_using_classes(ws, spark): "val2", "255.255.255.2", "2001:db8:85a3:8d3:1319:8a2e:3.112.115.68", + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESSTRING((1 1, 2 2))", + "MULTIPOLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", ], ], schema, @@ -5404,6 +5699,13 @@ def test_apply_checks_all_checks_using_classes(ws, spark): "val2", "255.255.255.255", "2001:0db8:85a3:08d3:1319:8a2e:0370:7344", + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESSTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", None, None, ], @@ -5420,6 +5722,13 @@ def test_apply_checks_all_checks_using_classes(ws, spark): "val2", "255.255.255.1", "2001:0db8:85a3:08d3:ffff:ffff:ffff:ffff", + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESSTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", None, None, ], @@ -5436,6 +5745,13 @@ def test_apply_checks_all_checks_using_classes(ws, spark): "val2", "255.255.255.2", "2001:db8:85a3:8d3:1319:8a2e:3.112.115.68", + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESSTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", None, None, ], diff --git a/tests/perf/test_apply_checks.py b/tests/perf/test_apply_checks.py index 6282b5c5..ee18e0f6 100644 --- a/tests/perf/test_apply_checks.py +++ b/tests/perf/test_apply_checks.py @@ -1326,6 +1326,31 @@ def 
test_benchmark_is_ipv6_address_in_cidr(benchmark, ws, generated_ipv6_df, col actual_count = benchmark(lambda: checked.count()) assert actual_count == EXPECTED_ROWS +def test_benchmark_is_geometry(benchmark, ws, generated_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=check_funcs.is_geometry, + column="col1", + ) + ] + checked = dq_engine.apply_checks(generated_df, checks) + actual_count = benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS + +def test_benchmark_is_geography(benchmark, ws, generated_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=check_funcs.is_geography, + column="col1", + ) + ] + checked = dq_engine.apply_checks(generated_df, checks) + actual_count = benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS def test_benchmark_has_valid_schema(benchmark, ws, generated_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) diff --git a/tests/resources/all_row_checks.yaml b/tests/resources/all_row_checks.yaml index 25a12cfb..831f23d6 100644 --- a/tests/resources/all_row_checks.yaml +++ b/tests/resources/all_row_checks.yaml @@ -30,9 +30,9 @@ arguments: column: col2 allowed: - - 1 - - 2 - - 3 + - 1 + - 2 + - 3 # is_not_null_and_is_in_list check - criticality: error @@ -41,9 +41,9 @@ arguments: column: col2 allowed: - - 1 - - 2 - - 3 + - 1 + - 2 + - 3 # is_not_null_and_not_empty_array check - criticality: error @@ -279,7 +279,7 @@ function: regex_match arguments: column: col2 - regex: '[0-9]+' + regex: "[0-9]+" negate: false # sql_expression check @@ -300,16 +300,16 @@ expression: col3 >= col2 and col3 <= 10 msg: col3 is less than col2 and col3 is greater than 10 columns: # optional for reporting - - col2 - - col3 + - col2 + - col3 # apply check to multiple columns - criticality: error check: function: is_not_null # 'column' as first argument for_each_column: # apply the check for each column in the list - - col3 - - col5 + - col3 + - col5 # is_not_null check applied to a struct column element (dot notation) - criticality: error @@ -389,10 +389,10 @@ check: function: is_not_null for_each_column: - - col1 # col - - col8.field1 # struct col - - try_element_at(col7, 'key1') # map col - - try_element_at(col4, 1) # array col + - col1 # col + - col8.field1 # struct col + - try_element_at(col7, 'key1') # map col + - try_element_at(col4, 1) # array col # is_valid_ipv4_address check - criticality: error @@ -407,7 +407,7 @@ function: is_ipv4_address_in_cidr arguments: column: col_ipv4 - cidr_block: '192.168.1.0/24' + cidr_block: "192.168.1.0/24" # is_valid_ipv6_address check - criticality: error @@ -422,7 +422,124 @@ function: is_ipv6_address_in_cidr arguments: column: col_ipv6 - cidr_block: '2001:0db8:85a3:08d3:0000:0000:0000:0000/64' + cidr_block: "2001:0db8:85a3:08d3:0000:0000:0000:0000/64" + +# is_latitude check +- criticality: error + check: + function: is_latitude + arguments: + column: col2 + +# is_longitude check +- criticality: error + check: + function: is_longitude + arguments: + column: col2 + +# is_geometry check +- criticality: error + check: + function: is_geometry + arguments: + column: point_geom + +# is_geography check +- criticality: error + check: + function: is_geography + arguments: + column: point_geom + +# is_point check +- criticality: error + check: + function: is_point + arguments: + column: point_geom + +# is_linestring check 
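# NOTE: the geospatial entries in this file assume the input DataFrame exposes WKT
# string columns named point_geom, linestring_geom, polygon_geom, multipoint_geom,
# multilinestring_geom, multipolygon_geom and geometrycollection_geom, matching the
# schema used by the integration tests in tests/integration/test_apply_checks.py.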
+- criticality: error + check: + function: is_linestring + arguments: + column: linestring_geom + +# is_polygon check +- criticality: error + check: + function: is_polygon + arguments: + column: polygon_geom + +# is_multipoint check +- criticality: error + check: + function: is_multipoint + arguments: + column: multipoint_geom + +# is_multilinestring check +- criticality: error + check: + function: is_multilinestring + arguments: + column: multilinestring_geom + +# is_multipolygon check +- criticality: error + check: + function: is_multipolygon + arguments: + column: multipolygon_geom + +# is_geometrycollection check +- criticality: error + check: + function: is_geometrycollection + arguments: + column: geometrycollection_geom + +# is_ogc_valid check +- criticality: error + check: + function: is_ogc_valid + arguments: + column: point_geom + +# is_non_empty_geometry check +- criticality: error + check: + function: is_non_empty_geometry + arguments: + column: point_geom + +# has_dimension check +- criticality: error + check: + function: has_dimension + arguments: + column: point_geom + dimension: 2 + +# has_x_coordinate_between check +- criticality: error + check: + function: has_x_coordinate_between + arguments: + column: polygon_geom + min_value: 0.0 + max_value: 10.0 + +# has_y_coordinate_between check +- criticality: error + check: + function: has_y_coordinate_between + arguments: + column: polygon_geom + min_value: 0.0 + max_value: 10.0 # is_data_fresh check with base_timestamp column as string - criticality: error @@ -432,4 +549,3 @@ column: col5 max_age_minutes: 18000 base_timestamp: col6 - From 6525f4eea9e0e4110d151f2777008b16fc4317b4 Mon Sep 17 00:00:00 2001 From: Tim Dikland Date: Thu, 2 Oct 2025 15:31:00 +0200 Subject: [PATCH 42/47] add benchmarks for geo check functions --- tests/perf/conftest.py | 25 +++++ tests/perf/test_apply_checks.py | 191 ++++++++++++++++++++++++++++++-- 2 files changed, 208 insertions(+), 8 deletions(-) diff --git a/tests/perf/conftest.py b/tests/perf/conftest.py index 59c160c9..642ed0dc 100644 --- a/tests/perf/conftest.py +++ b/tests/perf/conftest.py @@ -160,6 +160,31 @@ def generated_ipv6_df(spark): return gen.build() +@pytest.fixture +def generated_geo_df(spark): + geo_schema_str = ( + "point_geom: string, linestring_geom: string, polygon_geom: string, multipoint_geom: string, " + "multilinestring_geom: string, multipolygon_geom: string, geometrycollection_geom: string" + ) + schema = _parse_datatype_string(geo_schema_str) + + geo_templates = { + "point_geom": "POINT(x x)", + "linestring_geom": "LINESTRING(x x, x x)", + "polygon_geom": "POLYGON((x x, x x, x x, x x))", + "multipoint_geom": "MULTIPOINT(x x, x x)", + "multilinestring_geom": "MULTILINESSTRING((x x, x x))", + "multipolygon_geom": "MULTIPOLYGON(((x x, x x, x x, x x))", + "geometrycollection_geom": "GEOMETRYCOLLECTION(POINT(x x), LINESTRING(x x, x x), POLYGON((x x, x x, x x, x x)))", + } + + _, gen = make_data_gen(spark, n_rows=DEFAULT_ROWS, n_columns=len(geo_schema_str), partitions=DEFAULT_PARTITIONS) + gen = gen.withSchema(schema) + for col, template in geo_templates.items(): + gen = gen.withColumnSpec(col, template=template) + return gen.build() + + @pytest.fixture def make_ref_df(spark, n_rows=DEFAULT_ROWS): schema = _parse_datatype_string(REF_SCHEMA_STR) diff --git a/tests/perf/test_apply_checks.py b/tests/perf/test_apply_checks.py index ee18e0f6..75c64aad 100644 --- a/tests/perf/test_apply_checks.py +++ b/tests/perf/test_apply_checks.py @@ -4,6 +4,7 @@ from databricks.labs.dqx.config 
import ExtraParams import pytest from databricks.labs.dqx import check_funcs +from databricks.labs.dqx.geo import check_funcs as geo_check_funcs from tests.perf.conftest import DEFAULT_ROWS RUN_TIME = datetime(2025, 1, 1, 0, 0, 0, 0, tzinfo=timezone.utc) @@ -1326,32 +1327,206 @@ def test_benchmark_is_ipv6_address_in_cidr(benchmark, ws, generated_ipv6_df, col actual_count = benchmark(lambda: checked.count()) assert actual_count == EXPECTED_ROWS -def test_benchmark_is_geometry(benchmark, ws, generated_df): +@pytest.mark.benchmark(group="test_benchmark_is_geometry") +def test_benchmark_is_geometry(benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) checks = [ DQRowRule( criticality="error", - check_func=check_funcs.is_geometry, - column="col1", + check_func=geo_check_funcs.is_geometry, + column="point_geom", ) ] - checked = dq_engine.apply_checks(generated_df, checks) + checked = dq_engine.apply_checks(generated_geo_df, checks) actual_count = benchmark(lambda: checked.count()) assert actual_count == EXPECTED_ROWS -def test_benchmark_is_geography(benchmark, ws, generated_df): +@pytest.mark.benchmark(group="test_benchmark_is_geography") +def test_benchmark_is_geography(benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) checks = [ DQRowRule( criticality="error", - check_func=check_funcs.is_geography, - column="col1", + check_func=geo_check_funcs.is_geography, + column="point_geom", ) ] - checked = dq_engine.apply_checks(generated_df, checks) + checked = dq_engine.apply_checks(generated_geo_df, checks) + actual_count = benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS + +@pytest.mark.benchmark(group="test_benchmark_is_point") +def test_benchmark_is_point(benchmark, ws, generated_geo_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_point, + column="point_geom", + ) + ] + checked = dq_engine.apply_checks(generated_geo_df, checks) + actual_count = benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS + +@pytest.mark.benchmark(group="test_benchmark_is_linestring") +def test_benchmark_is_linestring(benchmark, ws, generated_geo_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_linestring, + column="linestring_geom", + ) + ] + checked = dq_engine.apply_checks(generated_geo_df, checks) + actual_count = benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS + +@pytest.mark.benchmark(group="test_benchmark_is_polygon") +def test_benchmark_is_polygon(benchmark, ws, generated_geo_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_polygon, + column="polygon_geom", + ) + ] + checked = dq_engine.apply_checks(generated_geo_df, checks) + actual_count = benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS + +@pytest.mark.benchmark(group="test_benchmark_is_multipoint") +def test_benchmark_is_multipoint(benchmark, ws, generated_geo_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_multipoint, + column="multipoint_geom", + ) + ] + checked = dq_engine.apply_checks(generated_geo_df, checks) + actual_count = 
benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS + +@pytest.mark.benchmark(group="test_benchmark_is_multilinestring") +def test_benchmark_is_multilinestring(benchmark, ws, generated_geo_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_multilinestring, + column="multilinestring_geom", + ) + ] + checked = dq_engine.apply_checks(generated_geo_df, checks) + actual_count = benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS + +@pytest.mark.benchmark(group="test_benchmark_is_multipolygon") +def test_benchmark_is_multipolygon(benchmark, ws, generated_geo_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_multipolygon, + column="multipolygon_geom", + ) + ] + checked = dq_engine.apply_checks(generated_geo_df, checks) + actual_count = benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS + +@pytest.mark.benchmark(group="test_benchmark_is_geometrycollection") +def test_benchmark_is_geometrycollection(benchmark, ws, generated_geo_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_geometrycollection, + column="geometrycollection_geom", + ) + ] + checked = dq_engine.apply_checks(generated_geo_df, checks) + actual_count = benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS + +@pytest.mark.benchmark(group="test_benchmark_is_ogc_valid") +def test_benchmark_is_ogc_valid(benchmark, ws, generated_geo_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_ogc_valid, + column="point_geom", + ) + ] + checked = dq_engine.apply_checks(generated_geo_df, checks) + actual_count = benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS + +@pytest.mark.benchmark(group="test_benchmark_is_non_empty_geometry") +def test_benchmark_is_non_empty_geometry(benchmark, ws, generated_geo_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_non_empty_geometry, + column="point_geom", + ) + ] + checked = dq_engine.apply_checks(generated_geo_df, checks) + actual_count = benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS + +@pytest.mark.benchmark(group="test_benchmark_has_dimension") +def test_benchmark_has_dimension(benchmark, ws, generated_geo_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=geo_check_funcs.has_dimension, + column="polygon_geom", + check_func_kwargs={"dimension": 2}, + ) + ] + checked = dq_engine.apply_checks(generated_geo_df, checks) + actual_count = benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS + +@pytest.mark.benchmark(group="test_benchmark_has_x_coordinate_between") +def test_benchmark_has_x_coordinate_between(benchmark, ws, generated_geo_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=geo_check_funcs.has_x_coordinate_between, + column="polygon_geom", + check_func_kwargs={"min_value": 0.0, "max_value": 10.0}, + ) + ] + checked = 
dq_engine.apply_checks(generated_geo_df, checks) + actual_count = benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS + +@pytest.mark.benchmark(group="test_benchmark_has_y_coordinate_between") +def test_benchmark_has_y_coordinate_between(benchmark, ws, generated_geo_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=geo_check_funcs.has_y_coordinate_between, + column="polygon_geom", + check_func_kwargs={"min_value": 0.0, "max_value": 10.0}, + ) + ] + checked = dq_engine.apply_checks(generated_geo_df, checks) actual_count = benchmark(lambda: checked.count()) assert actual_count == EXPECTED_ROWS +@pytest.mark.benchmark(group="test_benchmark_has_valid_schema") def test_benchmark_has_valid_schema(benchmark, ws, generated_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) checks = [ From 1844672e3a5a824e3313337923498a98407d0b2e Mon Sep 17 00:00:00 2001 From: Tim Dikland Date: Thu, 2 Oct 2025 20:59:03 +0200 Subject: [PATCH 43/47] integation test fixes pt1 --- tests/integration/test_apply_checks.py | 25 ++++++++++++------------- tests/perf/test_apply_checks.py | 15 +++++++++++++++ tests/resources/all_row_checks.yaml | 4 ++-- 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/tests/integration/test_apply_checks.py b/tests/integration/test_apply_checks.py index fc46a4eb..26594fd7 100644 --- a/tests/integration/test_apply_checks.py +++ b/tests/integration/test_apply_checks.py @@ -4470,8 +4470,8 @@ def test_apply_checks_all_row_checks_as_yaml_with_streaming(ws, make_schema, mak schema = ( "col1: string, col2: int, col3: int, col4 array, col5: date, col6: timestamp, " "col7: map, col8: struct, col10: int, col11: string, " - "col_ipv4: string, col_ipv6: string, point_geom: string, linestring_geom: string, " - "polygon_geom: string, multipoint_geom: string, multilinestring_geom: string, " + "col_ipv4: string, col_ipv6: string, point_geom: string, linestring_geom: string, " + "polygon_geom: string, multipoint_geom: string, multilinestring_geom: string, " "multipolygon_geom: string, geometrycollection_geom: string" ) test_df = spark.createDataFrame( @@ -4584,7 +4584,7 @@ def test_apply_checks_all_row_checks_as_yaml_with_streaming(ws, make_schema, mak "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", None, - None + None, ], [ "val2", @@ -4660,8 +4660,8 @@ def test_apply_checks_all_checks_as_yaml(ws, spark): schema = ( "col1: string, col2: int, col3: int, col4 array, col5: date, col6: timestamp, " "col7: map, col8: struct, col10: int, col11: string, " - "col_ipv4: string, col_ipv6: string, point_geom: string, linestring_geom: string, " - "polygon_geom: string, multipoint_geom: string, multilinestring_geom: string, " + "col_ipv4: string, col_ipv6: string, point_geom: string, linestring_geom: string, " + "polygon_geom: string, multipoint_geom: string, multilinestring_geom: string, " "multipolygon_geom: string, geometrycollection_geom: string" ) test_df = spark.createDataFrame( @@ -5565,26 +5565,26 @@ def test_apply_checks_all_checks_using_classes(ws, spark): criticality="error", check_func=geo_check_funcs.has_x_coordinate_between, column="polygon_geom", - check_func_kwargs={"min_value": 0, "max_value": 10}, + check_func_kwargs={"min_value": 0.0, "max_value": 10.0}, ), DQRowRule( criticality="error", check_func=geo_check_funcs.has_x_coordinate_between, column=F.col("polygon_geom"), - 
check_func_kwargs={"min_value": 0, "max_value": 10}, + check_func_kwargs={"min_value": 0.0, "max_value": 10.0}, ), # has_y_coordinate_between check DQRowRule( criticality="error", check_func=geo_check_funcs.has_y_coordinate_between, column="polygon_geom", - check_func_kwargs={"min_value": 0, "max_value": 10}, + check_func_kwargs={"min_value": 0.0, "max_value": 10.0}, ), DQRowRule( criticality="error", check_func=geo_check_funcs.has_y_coordinate_between, column=F.col("polygon_geom"), - check_func_kwargs={"min_value": 0, "max_value": 10}, + check_func_kwargs={"min_value": 0.0, "max_value": 10.0}, ), # is_data_fresh check DQRowRule( @@ -5607,8 +5607,8 @@ def test_apply_checks_all_checks_using_classes(ws, spark): schema = ( "col1: string, col2: int, col3: int, col4 array, col5: date, col6: timestamp, " "col7: map, col8: struct, col10: int, col11: string, " - "col_ipv4: string, col_ipv6: string, point_geom: string, linestring_geom: string, " - "polygon_geom: string, multipoint_geom: string, multilinestring_geom: string, " + "col_ipv4: string, col_ipv6: string, point_geom: string, linestring_geom: string, " + "polygon_geom: string, multipoint_geom: string, multilinestring_geom: string, " "multipolygon_geom: string, geometrycollection_geom: string" ) test_df = spark.createDataFrame( @@ -5654,7 +5654,6 @@ def test_apply_checks_all_checks_using_classes(ws, spark): "MULTILINESSTRING((1 1, 2 2))", "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", - ], [ "val3", @@ -5674,7 +5673,7 @@ def test_apply_checks_all_checks_using_classes(ws, spark): "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", "MULTIPOINT(1 1, 2 2)", "MULTILINESSTRING((1 1, 2 2))", - "MULTIPOLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", ], ], diff --git a/tests/perf/test_apply_checks.py b/tests/perf/test_apply_checks.py index 75c64aad..5a3aaed0 100644 --- a/tests/perf/test_apply_checks.py +++ b/tests/perf/test_apply_checks.py @@ -1327,6 +1327,7 @@ def test_benchmark_is_ipv6_address_in_cidr(benchmark, ws, generated_ipv6_df, col actual_count = benchmark(lambda: checked.count()) assert actual_count == EXPECTED_ROWS + @pytest.mark.benchmark(group="test_benchmark_is_geometry") def test_benchmark_is_geometry(benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) @@ -1341,6 +1342,7 @@ def test_benchmark_is_geometry(benchmark, ws, generated_geo_df): actual_count = benchmark(lambda: checked.count()) assert actual_count == EXPECTED_ROWS + @pytest.mark.benchmark(group="test_benchmark_is_geography") def test_benchmark_is_geography(benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) @@ -1355,6 +1357,7 @@ def test_benchmark_is_geography(benchmark, ws, generated_geo_df): actual_count = benchmark(lambda: checked.count()) assert actual_count == EXPECTED_ROWS + @pytest.mark.benchmark(group="test_benchmark_is_point") def test_benchmark_is_point(benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) @@ -1369,6 +1372,7 @@ def test_benchmark_is_point(benchmark, ws, generated_geo_df): actual_count = benchmark(lambda: checked.count()) assert actual_count == EXPECTED_ROWS + @pytest.mark.benchmark(group="test_benchmark_is_linestring") def test_benchmark_is_linestring(benchmark, ws, generated_geo_df): dq_engine = 
DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) @@ -1383,6 +1387,7 @@ def test_benchmark_is_linestring(benchmark, ws, generated_geo_df): actual_count = benchmark(lambda: checked.count()) assert actual_count == EXPECTED_ROWS + @pytest.mark.benchmark(group="test_benchmark_is_polygon") def test_benchmark_is_polygon(benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) @@ -1397,6 +1402,7 @@ def test_benchmark_is_polygon(benchmark, ws, generated_geo_df): actual_count = benchmark(lambda: checked.count()) assert actual_count == EXPECTED_ROWS + @pytest.mark.benchmark(group="test_benchmark_is_multipoint") def test_benchmark_is_multipoint(benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) @@ -1411,6 +1417,7 @@ def test_benchmark_is_multipoint(benchmark, ws, generated_geo_df): actual_count = benchmark(lambda: checked.count()) assert actual_count == EXPECTED_ROWS + @pytest.mark.benchmark(group="test_benchmark_is_multilinestring") def test_benchmark_is_multilinestring(benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) @@ -1425,6 +1432,7 @@ def test_benchmark_is_multilinestring(benchmark, ws, generated_geo_df): actual_count = benchmark(lambda: checked.count()) assert actual_count == EXPECTED_ROWS + @pytest.mark.benchmark(group="test_benchmark_is_multipolygon") def test_benchmark_is_multipolygon(benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) @@ -1439,6 +1447,7 @@ def test_benchmark_is_multipolygon(benchmark, ws, generated_geo_df): actual_count = benchmark(lambda: checked.count()) assert actual_count == EXPECTED_ROWS + @pytest.mark.benchmark(group="test_benchmark_is_geometrycollection") def test_benchmark_is_geometrycollection(benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) @@ -1453,6 +1462,7 @@ def test_benchmark_is_geometrycollection(benchmark, ws, generated_geo_df): actual_count = benchmark(lambda: checked.count()) assert actual_count == EXPECTED_ROWS + @pytest.mark.benchmark(group="test_benchmark_is_ogc_valid") def test_benchmark_is_ogc_valid(benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) @@ -1467,6 +1477,7 @@ def test_benchmark_is_ogc_valid(benchmark, ws, generated_geo_df): actual_count = benchmark(lambda: checked.count()) assert actual_count == EXPECTED_ROWS + @pytest.mark.benchmark(group="test_benchmark_is_non_empty_geometry") def test_benchmark_is_non_empty_geometry(benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) @@ -1481,6 +1492,7 @@ def test_benchmark_is_non_empty_geometry(benchmark, ws, generated_geo_df): actual_count = benchmark(lambda: checked.count()) assert actual_count == EXPECTED_ROWS + @pytest.mark.benchmark(group="test_benchmark_has_dimension") def test_benchmark_has_dimension(benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) @@ -1496,6 +1508,7 @@ def test_benchmark_has_dimension(benchmark, ws, generated_geo_df): actual_count = benchmark(lambda: checked.count()) assert actual_count == EXPECTED_ROWS + @pytest.mark.benchmark(group="test_benchmark_has_x_coordinate_between") def test_benchmark_has_x_coordinate_between(benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) @@ -1511,6 +1524,7 @@ def 
test_benchmark_has_x_coordinate_between(benchmark, ws, generated_geo_df): actual_count = benchmark(lambda: checked.count()) assert actual_count == EXPECTED_ROWS + @pytest.mark.benchmark(group="test_benchmark_has_y_coordinate_between") def test_benchmark_has_y_coordinate_between(benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) @@ -1526,6 +1540,7 @@ def test_benchmark_has_y_coordinate_between(benchmark, ws, generated_geo_df): actual_count = benchmark(lambda: checked.count()) assert actual_count == EXPECTED_ROWS + @pytest.mark.benchmark(group="test_benchmark_has_valid_schema") def test_benchmark_has_valid_schema(benchmark, ws, generated_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) diff --git a/tests/resources/all_row_checks.yaml b/tests/resources/all_row_checks.yaml index 831f23d6..7551c715 100644 --- a/tests/resources/all_row_checks.yaml +++ b/tests/resources/all_row_checks.yaml @@ -429,14 +429,14 @@ check: function: is_latitude arguments: - column: col2 + column: col3 # is_longitude check - criticality: error check: function: is_longitude arguments: - column: col2 + column: col3 # is_geometry check - criticality: error From 5381a2bcdb0f217e01810157aae64405113fbd6d Mon Sep 17 00:00:00 2001 From: Tim Dikland Date: Thu, 2 Oct 2025 22:01:22 +0200 Subject: [PATCH 44/47] integation test fixes pt2 --- tests/integration/test_apply_checks.py | 36 +++++++++++++------------- tests/resources/all_row_checks.yaml | 2 +- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/tests/integration/test_apply_checks.py b/tests/integration/test_apply_checks.py index 26594fd7..4e597de2 100644 --- a/tests/integration/test_apply_checks.py +++ b/tests/integration/test_apply_checks.py @@ -4493,7 +4493,7 @@ def test_apply_checks_all_row_checks_as_yaml_with_streaming(ws, make_schema, mak "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", "MULTIPOINT(1 1, 2 2)", - "MULTILINESSTRING((1 1, 2 2))", + "MULTILINESTRING((1 1, 2 2))", "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", ], @@ -4514,7 +4514,7 @@ def test_apply_checks_all_row_checks_as_yaml_with_streaming(ws, make_schema, mak "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", "MULTIPOINT(1 1, 2 2)", - "MULTILINESSTRING((1 1, 2 2))", + "MULTILINESTRING((1 1, 2 2))", "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", ], @@ -4535,7 +4535,7 @@ def test_apply_checks_all_row_checks_as_yaml_with_streaming(ws, make_schema, mak "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", "MULTIPOINT(1 1, 2 2)", - "MULTILINESSTRING((1 1, 2 2))", + "MULTILINESTRING((1 1, 2 2))", "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", ], @@ -4580,7 +4580,7 @@ def test_apply_checks_all_row_checks_as_yaml_with_streaming(ws, make_schema, mak "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", "MULTIPOINT(1 1, 2 2)", - "MULTILINESSTRING((1 1, 2 2))", + "MULTILINESTRING((1 1, 2 2))", "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", None, @@ -4603,7 +4603,7 @@ def test_apply_checks_all_row_checks_as_yaml_with_streaming(ws, make_schema, mak "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", "MULTIPOINT(1 1, 2 2)", 
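# Note: valid WKT spells the multi-linestring type MULTILINESTRING; the extra-"S"
# MULTILINESSTRING variant used previously is not a recognized geometry keyword.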
- "MULTILINESSTRING((1 1, 2 2))", + "MULTILINESTRING((1 1, 2 2))", "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", None, @@ -4626,7 +4626,7 @@ def test_apply_checks_all_row_checks_as_yaml_with_streaming(ws, make_schema, mak "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", "MULTIPOINT(1 1, 2 2)", - "MULTILINESSTRING((1 1, 2 2))", + "MULTILINESTRING((1 1, 2 2))", "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", None, @@ -4683,7 +4683,7 @@ def test_apply_checks_all_checks_as_yaml(ws, spark): "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", "MULTIPOINT(1 1, 2 2)", - "MULTILINESSTRING((1 1, 2 2))", + "MULTILINESTRING((1 1, 2 2))", "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", ], @@ -4704,7 +4704,7 @@ def test_apply_checks_all_checks_as_yaml(ws, spark): "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", "MULTIPOINT(1 1, 2 2)", - "MULTILINESSTRING((1 1, 2 2))", + "MULTILINESTRING((1 1, 2 2))", "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", ], @@ -4725,7 +4725,7 @@ def test_apply_checks_all_checks_as_yaml(ws, spark): "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", "MULTIPOINT(1 1, 2 2)", - "MULTILINESSTRING((1 1, 2 2))", + "MULTILINESTRING((1 1, 2 2))", "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", ], @@ -4758,7 +4758,7 @@ def test_apply_checks_all_checks_as_yaml(ws, spark): "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", "MULTIPOINT(1 1, 2 2)", - "MULTILINESSTRING((1 1, 2 2))", + "MULTILINESTRING((1 1, 2 2))", "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", None, @@ -4781,7 +4781,7 @@ def test_apply_checks_all_checks_as_yaml(ws, spark): "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", "MULTIPOINT(1 1, 2 2)", - "MULTILINESSTRING((1 1, 2 2))", + "MULTILINESTRING((1 1, 2 2))", "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", None, @@ -4804,7 +4804,7 @@ def test_apply_checks_all_checks_as_yaml(ws, spark): "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", "MULTIPOINT(1 1, 2 2)", - "MULTILINESSTRING((1 1, 2 2))", + "MULTILINESTRING((1 1, 2 2))", "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", None, @@ -5630,7 +5630,7 @@ def test_apply_checks_all_checks_using_classes(ws, spark): "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", "MULTIPOINT(1 1, 2 2)", - "MULTILINESSTRING((1 1, 2 2))", + "MULTILINESTRING((1 1, 2 2))", "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", ], @@ -5651,7 +5651,7 @@ def test_apply_checks_all_checks_using_classes(ws, spark): "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", "MULTIPOINT(1 1, 2 2)", - "MULTILINESSTRING((1 1, 2 2))", + "MULTILINESTRING((1 1, 2 2))", "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 
3, 1 3, 1 1)))", ], @@ -5672,7 +5672,7 @@ def test_apply_checks_all_checks_using_classes(ws, spark): "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", "MULTIPOINT(1 1, 2 2)", - "MULTILINESSTRING((1 1, 2 2))", + "MULTILINESTRING((1 1, 2 2))", "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", ], @@ -5702,7 +5702,7 @@ def test_apply_checks_all_checks_using_classes(ws, spark): "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", "MULTIPOINT(1 1, 2 2)", - "MULTILINESSTRING((1 1, 2 2))", + "MULTILINESTRING((1 1, 2 2))", "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", None, @@ -5725,7 +5725,7 @@ def test_apply_checks_all_checks_using_classes(ws, spark): "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", "MULTIPOINT(1 1, 2 2)", - "MULTILINESSTRING((1 1, 2 2))", + "MULTILINESTRING((1 1, 2 2))", "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", None, @@ -5748,7 +5748,7 @@ def test_apply_checks_all_checks_using_classes(ws, spark): "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", "MULTIPOINT(1 1, 2 2)", - "MULTILINESSTRING((1 1, 2 2))", + "MULTILINESTRING((1 1, 2 2))", "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", None, diff --git a/tests/resources/all_row_checks.yaml b/tests/resources/all_row_checks.yaml index 7551c715..89757192 100644 --- a/tests/resources/all_row_checks.yaml +++ b/tests/resources/all_row_checks.yaml @@ -520,7 +520,7 @@ check: function: has_dimension arguments: - column: point_geom + column: polygon_geom dimension: 2 # has_x_coordinate_between check From 22aae45b43a5e70a67a20667f1f7e18bc1915030 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Thu, 2 Oct 2025 22:46:32 +0200 Subject: [PATCH 45/47] updated docs --- docs/dqx/docs/reference/quality_checks.mdx | 33 +++---- .../llm/resources/yaml_checks_examples.yml | 85 +++++++++++++++++++ 2 files changed, 102 insertions(+), 16 deletions(-) diff --git a/docs/dqx/docs/reference/quality_checks.mdx b/docs/dqx/docs/reference/quality_checks.mdx index 71da7b94..f5b1a702 100644 --- a/docs/dqx/docs/reference/quality_checks.mdx +++ b/docs/dqx/docs/reference/quality_checks.mdx @@ -612,6 +612,7 @@ For brevity, the `name` field in the examples is omitted and it will be auto-gen ```python from databricks.labs.dqx.rule import DQRowRule, DQForEachColRule from databricks.labs.dqx import check_funcs +from databricks.labs.dqx.geo import check_funcs as geo_check_funcs from databricks.labs.dqx.pii import pii_detection_funcs from datetime import datetime @@ -953,98 +954,98 @@ checks = [ # is_latitude check DQRowRule( criticality="error", - check_func=check_funcs.is_latitude, + check_func=geo_check_funcs.is_latitude, column="col2" ), # is_longitude check DQRowRule( criticality="error", - check_func=check_funcs.is_longitude, + check_func=geo_check_funcs.is_longitude, column="col2" ), # is_geometry check DQRowRule( criticality="error", - check_func=check_funcs.is_geometry, + check_func=geo_check_funcs.is_geometry, column="point_geom" ), # is_geography check DQRowRule( criticality="error", - check_func=check_funcs.is_geography, + check_func=geo_check_funcs.is_geography, column="point_geom" ), # is_point check DQRowRule( criticality="error", - 
check_func=check_funcs.is_point, + check_func=geo_check_funcs.is_point, column="point_geom" ), # is_linestring check DQRowRule( criticality="error", - check_func=check_funcs.is_linestring, + check_func=geo_check_funcs.is_linestring, column="linestring_geom" ), # is_polygon check DQRowRule( criticality="error", - check_func=check_funcs.is_polygon, + check_func=geo_check_funcs.is_polygon, column="polygon_geom" ), # is_multipoint check DQRowRule( criticality="error", - check_func=check_funcs.is_multipoint, + check_func=geo_check_funcs.is_multipoint, column="multipoint_geom" ), # is_multilinestring check DQRowRule( criticality="error", - check_func=check_funcs.is_multilinestring, + check_func=geo_check_funcs.is_multilinestring, column="multilinestring_geom" ), # is_multipolygon check DQRowRule( criticality="error", - check_func=check_funcs.is_multipolygon, + check_func=geo_check_funcs.is_multipolygon, column="multipolygon_geom" ), # is_geometrycollection check DQRowRule( criticality="error", - check_func=check_funcs.is_geometrycollection, + check_func=geo_check_funcs.is_geometrycollection, column="geometrycollection_geom" ), # is_ogc_valid check DQRowRule( criticality="error", - check_func=check_funcs.is_ogc_valid, + check_func=geo_check_funcs.is_ogc_valid, column="point_geom" ), # is_non_empty_geometry check DQRowRule( criticality="error", - check_func=check_funcs.is_non_empty_geometry, + check_func=geo_check_funcs.is_non_empty_geometry, column="point_geom" ), # has_dimension check DQRowRule( criticality="error", - check_func=check_funcs.has_dimension, + check_func=geo_check_funcs.has_dimension, column="polygon_geom" check_func_kwargs={"dimension": 2} ), @@ -1052,7 +1053,7 @@ checks = [ # has_x_coordinate_between check DQRowRule( criticality="error", - check_func=check_funcs.has_x_coordinate_between, + check_func=geo_check_funcs.has_x_coordinate_between, column="polygon_geom" check_func_kwargs={"min_value": 0.0, "max_value": 10.0} ), @@ -1060,7 +1061,7 @@ checks = [ # has_y_coordinate_between check DQRowRule( criticality="error", - check_func=check_funcs.has_y_coordinate_between, + check_func=geo_check_funcs.has_y_coordinate_between, column="polygon_geom" check_func_kwargs={"min_value": 0.0, "max_value": 10.0} ), diff --git a/src/databricks/labs/dqx/llm/resources/yaml_checks_examples.yml b/src/databricks/labs/dqx/llm/resources/yaml_checks_examples.yml index 38abcabd..f127b75f 100644 --- a/src/databricks/labs/dqx/llm/resources/yaml_checks_examples.yml +++ b/src/databricks/labs/dqx/llm/resources/yaml_checks_examples.yml @@ -320,6 +320,91 @@ for_each_column: - col3 - col5 +- criticality: error + check: + function: is_latitude + arguments: + column: col2 +- criticality: error + check: + function: is_longitude + arguments: + column: col2 +- criticality: error + check: + function: is_geometry + arguments: + column: point_geom +- criticality: error + check: + function: is_geography + arguments: + column: point_geom +- criticality: error + check: + function: is_point + arguments: + column: point_geom +- criticality: error + check: + function: is_linestring + arguments: + column: linestring_geom +- criticality: error + check: + function: is_polygon + arguments: + column: polygon_geom +- criticality: error + check: + function: is_multipoint + arguments: + column: multipoint_geom +- criticality: error + check: + function: is_multilinestring + arguments: + column: multilinestring_geom +- criticality: error + check: + function: is_multipolygon + arguments: + column: multipolygon_geom +- criticality: error 
+ check: + function: is_geometrycollection + arguments: + column: geometrycollection_geom +- criticality: error + check: + function: is_ogc_valid + arguments: + column: point_geom +- criticality: error + check: + function: is_non_empty_geometry + arguments: + column: point_geom +- criticality: error + check: + function: has_dimension + arguments: + column: polygon_geom + dimension: 2 +- criticality: error + check: + function: has_x_coordinate_between + arguments: + column: polygon_geom + min_value: 0.0 + max_value: 10.0 +- criticality: error + check: + function: has_y_coordinate_between + arguments: + column: polygon_geom + min_value: 0.0 + max_value: 10.0 - criticality: error check: function: is_not_null From 11731587b7487c9a95885cdf34b386010c40a376 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Thu, 2 Oct 2025 23:38:01 +0200 Subject: [PATCH 46/47] updated tests and docs --- docs/dqx/docs/reference/quality_checks.mdx | 32 +-- src/databricks/labs/dqx/geo/check_funcs.py | 28 +- tests/conftest.py | 39 +++ tests/integration/conftest.py | 39 --- tests/integration/test_apply_checks.py | 287 +++++++++++++++------ tests/perf/conftest.py | 3 +- tests/perf/test_apply_checks.py | 70 +++-- tests/resources/all_row_checks.yaml | 117 --------- tests/resources/all_row_geo_checks.yaml | 119 +++++++++ 9 files changed, 443 insertions(+), 291 deletions(-) create mode 100644 tests/resources/all_row_geo_checks.yaml diff --git a/docs/dqx/docs/reference/quality_checks.mdx b/docs/dqx/docs/reference/quality_checks.mdx index f5b1a702..e2233c14 100644 --- a/docs/dqx/docs/reference/quality_checks.mdx +++ b/docs/dqx/docs/reference/quality_checks.mdx @@ -48,22 +48,22 @@ You can also define your own custom checks (see [Creating custom checks](#creati | `sql_expression` | Checks whether the values meet the condition provided as an SQL expression, e.g. `a = 'str1' and a > b`. SQL expressions are evaluated at runtime, so ensure that the expression is safe and that functions used within it (e.g. h3_ischildof, division) do not throw exceptions. You can achieve this by validating input arguments or columns beforehand using guards such as CASE WHEN, IS NOT NULL, RLIKE, or type try casts. | `expression`: sql expression to check on a DataFrame (fail the check if expression evaluates to True, pass if it evaluates to False); `msg`: optional message to output; `name`: optional name of the resulting column (it can be overwritten by `name` specified at the check level); `negate`: if the condition should be negated; `columns`: optional list of columns to be used for reporting and as name prefix if name not provided, unused in the actual logic | | `is_data_fresh` | Checks whether the values in the input timestamp column are not older than the specified number of minutes from the base timestamp column. This is useful for identifying stale data due to delayed pipelines and helps catch upstream issues early. | `column`: column of type timestamp/date to check (can be a string column name or a column expression); `max_age_minutes`: maximum age in minutes before data is considered stale; `base_timestamp`: optional base timestamp column from which the stale check is calculated. This can be a string, column expression, datetime value or literal value ex:F.lit(datetime(2024,1,1)). If not provided current_timestamp() function is used | | `does_not_contain_pii` | Checks whether the values in the input column contain Personally Identifiable Information (PII). Uses Microsoft Presidio to detect various named entities (e.g. 
PERSON, ADDRESS, EMAIL_ADDRESS). Requires installation of PII detection extras: `pip install 'databricks-labs-dqx[pii-detection]'`. See more details [here](#detecting-personally-identifiable-information-pii). | `column`: column to check (can be a string column name or a column expression); `threshold`: confidence threshold for PII detection (0.0 to 1.0, default: 0.7); `language`: optional language of the text (default: 'en'); `entities`: optional list of entities to detect; `nlp_engine_config`: optional dictionary configuring the NLP engine used for PII detection, see the [Presidio documentation](https://microsoft.github.io/presidio/analyzer/customizing_nlp_models/) for more information | -| `is_latitude` | Checks whether the values in the input column are valid latitude values (i.e. between -90 and 90 degrees) | `column`: column to check (can be a string column name or a column expression) | -| `is_longitude` | Checks whether the values in the input column are valid longitude values (i.e. between -180 and 180 degrees) | `column`: column to check (can be a string column name or a column expression) | -| `is_geometry` | Checks whether the values in the input column are valid geometries. | `column`: column to check (can be a string column name or a column expression) | -| `is_geography` | Checks whether the values in the input column are valid geographies. | `column`: column to check (can be a string column name or a column expression) | -| `is_point` | Checks whether the values in the input column are point geometries/geographies. | `column`: column to check (can be a string column name or a column expression) | -| `is_linestring` | Checks whether the values in the input column are linestring geometries/geographies. | `column`: column to check (can be a string column name or a column expression) | -| `is_polygon` | Checks whether the values in the input column are polygon geometries/geographies. | `column`: column to check (can be a string column name or a column expression) | -| `is_multipoint` | Checks whether the values in the input column are multipoint geometries/geographies. | `column`: column to check (can be a string column name or a column expression) | -| `is_multilinestring` | Checks whether the values in the input column are multilinestring geometries/geographies. | `column`: column to check (can be a string column name or a column expression) | -| `is_multipolygon` | Checks whether the values in the input column are multipolygon geometries/geographies. | `column`: column to check (can be a string column name or a column expression) | -| `is_geometrycollection` | Checks whether the values in the input column are geometrycollection geometries/geographies. | `column`: column to check (can be a string column name or a column expression) | -| `is_ogc_valid` | Checks whether the values in the input column are valid geometries in the OGC sense. I.e a bowtie polygon is invalid because it has a self intersection. | `column`: column to check (can be a string column name or a column expression) | -| `is_non_empty_geometry` | Checks whether the values in the input column are non-empty geometries. | `column`: column to check (can be a string column name or a column expression) | -| `has_dimension` | Checks whether the values in the input column are geometries of the specified dimension (2D projected dimension). 
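These geospatial checks use the same metadata shape as the other row checks, so a YAML definition can be validated and applied directly. A minimal sketch, assuming serverless compute or DBR 17.1+, an active `spark` session, an existing `WorkspaceClient`, an input DataFrame with a WKT string column named `point_geom`, and import paths as used elsewhere in the DQX docs:

```python
import yaml
from databricks.sdk import WorkspaceClient
from databricks.labs.dqx.engine import DQEngine

# Geo checks declared as metadata, using the same structure as the other row checks.
checks = yaml.safe_load("""
- criticality: error
  check:
    function: is_geometry
    arguments:
      column: point_geom
- criticality: error
  check:
    function: has_x_coordinate_between
    arguments:
      column: point_geom
      min_value: -180.0
      max_value: 180.0
""")

dq_engine = DQEngine(WorkspaceClient())
assert not dq_engine.validate_checks(checks).has_errors

df = spark.createDataFrame([("POINT(1 1)",), ("POINT(200 10)",)], "point_geom: string")

# The second row's x coordinate (200) falls outside [-180, 180] and is reported as an error.
checked_df = dq_engine.apply_checks_by_metadata(df, checks)
checked_df.show(truncate=False)
```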
| `column`: column to check (can be a string column name or a column expression); `dimension`: dimension to check | -| `has_x_coordinate_between` | Checks whether the values in the input column are geometries with x coordinate between the provided boundaries. | `column`: column to check (can be a string column name or a column expression); `min_value`: minimum value; `max_value`: maximum value | -| `has_y_coordinate_between` | Checks whether the values in the input column are geometries with y coordinate between the provided boundaries. | `column`: column to check (can be a string column name or a column expression); `min_value`: minimum value; `max_value`: maximum value | | `column`: column to check (can be a string column name or a column expression); `min_value`: minimum value; `max_value`: maximum value | +| `is_latitude` | Checks whether the values in the input column are valid latitude values (i.e. between -90 and 90 degrees). | `column`: column to check (can be a string column name or a column expression) | +| `is_longitude` | Checks whether the values in the input column are valid longitude values (i.e. between -180 and 180 degrees). | `column`: column to check (can be a string column name or a column expression) | +| `is_geometry` | Checks whether the values in the input column are valid geometries. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression) | +| `is_geography` | Checks whether the values in the input column are valid geographies. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression) | +| `is_point` | Checks whether the values in the input column are point geometries/geographies. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression) | +| `is_linestring` | Checks whether the values in the input column are linestring geometries/geographies. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression) | +| `is_polygon` | Checks whether the values in the input column are polygon geometries/geographies. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression) | +| `is_multipoint` | Checks whether the values in the input column are multipoint geometries/geographies. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression) | +| `is_multilinestring` | Checks whether the values in the input column are multilinestring geometries/geographies. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression) | +| `is_multipolygon` | Checks whether the values in the input column are multipolygon geometries/geographies. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression) | +| `is_geometrycollection` | Checks whether the values in the input column are geometrycollection geometries/geographies. This function requires Databricks serverless compute or runtime >= 17.1. 
| `column`: column to check (can be a string column name or a column expression) | +| `is_ogc_valid` | Checks whether the values in the input column are valid geometries in the OGC sense, e.g. a bowtie polygon is invalid because it has a self-intersection. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression) | +| `is_non_empty_geometry` | Checks whether the values in the input column are non-empty geometries. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression) | +| `has_dimension` | Checks whether the values in the input column are geometries of the specified dimension (2D projected dimension). This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression); `dimension`: dimension to check | +| `has_x_coordinate_between` | Checks whether the values in the input column are geometries with x coordinate between the provided boundaries. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression); `min_value`: minimum value; `max_value`: maximum value | +| `has_y_coordinate_between` | Checks whether the values in the input column are geometries with y coordinate between the provided boundaries. This function requires Databricks serverless compute or runtime >= 17.1. | `column`: column to check (can be a string column name or a column expression); `min_value`: minimum value; `max_value`: maximum value | diff --git a/src/databricks/labs/dqx/geo/check_funcs.py b/src/databricks/labs/dqx/geo/check_funcs.py index 81c3ffe8..612f7a71 100644 --- a/src/databricks/labs/dqx/geo/check_funcs.py +++ b/src/databricks/labs/dqx/geo/check_funcs.py @@ -69,7 +69,7 @@ def is_geometry(column: str | Column) -> Column: Column object indicating whether the values in the input column are valid geometries Note: - This function requires Databricks runtime 17.1 or above. + This function requires Databricks serverless compute or runtime 17.1 or above. """ col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in @@ -96,7 +96,7 @@ def is_geography(column: str | Column) -> Column: Column object indicating whether the values in the input column are valid geographies Note: - This function requires Databricks runtime 17.1 or above. + This function requires Databricks serverless compute or runtime 17.1 or above. """ col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in @@ -123,7 +123,7 @@ def is_point(column: str | Column) -> Column: Column object indicating whether the values in the input column are point geometries Note: - This function requires Databricks runtime 17.1 or above. + This function requires Databricks serverless compute or runtime 17.1 or above.
""" col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in @@ -150,7 +150,7 @@ def is_linestring(column: str | Column) -> Column: Column object indicating whether the values in the input column are linestring geometries Note: - This function requires Databricks runtime 17.1 or above. + This function requires Databricks serverless compute or runtime 17.1 or above. """ col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in @@ -177,7 +177,7 @@ def is_polygon(column: str | Column) -> Column: Column object indicating whether the values in the input column are polygon geometries Note: - This function requires Databricks runtime 17.1 or above. + This function requires Databricks serverless compute or runtime 17.1 or above. """ col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in @@ -204,7 +204,7 @@ def is_multipoint(column: str | Column) -> Column: Column object indicating whether the values in the input column are multipoint geometries Note: - This function requires Databricks runtime 17.1 or above. + This function requires Databricks serverless compute or runtime 17.1 or above. """ col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in @@ -231,7 +231,7 @@ def is_multilinestring(column: str | Column) -> Column: Column object indicating whether the values in the input column are multilinestring geometries Note: - This function requires Databricks runtime 17.1 or above. + This function requires Databricks serverless compute or runtime 17.1 or above. """ col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in @@ -258,7 +258,7 @@ def is_multipolygon(column: str | Column) -> Column: Column object indicating whether the values in the input column are multipolygon geometries Note: - This function requires Databricks runtime 17.1 or above. + This function requires Databricks serverless compute or runtime 17.1 or above. """ col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in @@ -285,7 +285,7 @@ def is_geometrycollection(column: str | Column) -> Column: Column object indicating whether the values in the input column are geometrycollection geometries Note: - This function requires Databricks runtime 17.1 or above. + This function requires Databricks serverless compute or runtime 17.1 or above. """ col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in @@ -312,7 +312,7 @@ def is_ogc_valid(column: str | Column) -> Column: Column object indicating whether the values in the input column are valid geometries Note: - This function requires Databricks runtime 17.1 or above. + This function requires Databricks serverless compute or runtime 17.1 or above. 
""" col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in @@ -340,7 +340,7 @@ def is_non_empty_geometry(column: str | Column) -> Column: Column object indicating whether the values in the input column are empty geometries Note: - This function requires Databricks runtime 17.1 or above. + This function requires Databricks serverless compute or runtime 17.1 or above. """ col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in @@ -369,7 +369,7 @@ def has_dimension(column: str | Column, dimension: int) -> Column: Column object indicating whether the geometries/geographies in the input column have a given dimension Note: - This function requires Databricks runtime 17.1 or above. + This function requires Databricks serverless compute or runtime 17.1 or above. """ col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in @@ -399,7 +399,7 @@ def has_x_coordinate_between(column: str | Column, min_value: float, max_value: Column object indicating whether the x coordinates of the geometries in the input column are between a given range Note: - This function requires Databricks runtime 17.1 or above. + This function requires Databricks serverless compute or runtime 17.1 or above. """ col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in @@ -431,7 +431,7 @@ def has_y_coordinate_between(column: str | Column, min_value: float, max_value: Column object indicating whether the y coordinates of the geometries in the input column are between a given range Note: - This function requires Databricks runtime 17.1 or above. + This function requires Databricks serverless compute or runtime 17.1 or above. """ col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column) # NOTE: This function is currently only available in Databricks runtime 17.1 or above or in diff --git a/tests/conftest.py b/tests/conftest.py index 98d2ba6f..422c326e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,5 @@ import os +import re from collections.abc import Callable, Generator from dataclasses import replace from functools import cached_property @@ -40,6 +41,44 @@ def set_utc_timezone(): os.environ.pop("TZ") +@pytest.fixture +def skip_if_runtime_not_geo_compatible(ws, debug_env): + """ + Skip the test if the cluster runtime does not support the required geo functions, i.e. + * serverless clusters have the required geo functions + * standard clusters require runtime 17.1 or above + + Args: + ws (WorkspaceClient): Workspace client to interact with Databricks. + debug_env (dict): Test environment variables. 
+ """ + if "DATABRICKS_SERVERLESS_COMPUTE_ID" in debug_env: + return # serverless clusters have the required geo functions + + # standard clusters require runtime 17.1 or above + cluster_id = debug_env.get("DATABRICKS_CLUSTER_ID") + if not cluster_id: + raise ValueError("DATABRICKS_CLUSTER_ID is not set in debug_env") + + # Fetch cluster details + cluster_info = ws.clusters.get(cluster_id) + runtime_version = cluster_info.spark_version + + if not runtime_version: + raise ValueError(f"Unable to retrieve runtime version for cluster {cluster_id}") + + # Extract major and minor version numbers + match = re.match(r"(\d+)\.(\d+)", runtime_version) + if not match: + raise ValueError(f"Invalid runtime version format: {runtime_version}") + + major, minor = [int(x) for x in match.groups()] + valid = major > 17 or (major == 17 and minor >= 1) + + if not valid: + pytest.skip("This test requires a cluster with runtime 17.1 or above") + + class CommonUtils: def __init__(self, env_or_skip_fixture: Callable[[str], str], ws: WorkspaceClient): self._env_or_skip = env_or_skip_fixture diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 75b8f63d..4e36ec60 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -1,5 +1,4 @@ import logging -import re from datetime import datetime, timezone from unittest.mock import patch from pyspark.sql import DataFrame @@ -273,41 +272,3 @@ def contains_expected_workflows(workflows, state): if all(item in workflow.items() for item in state.items()): return True return False - - -@pytest.fixture -def skip_if_runtime_not_geo_compatible(ws, debug_env): - """ - Skip the test if the cluster runtime does not support the required geo functions, i.e. - * serverless clusters have the required geo functions - * standard clusters require runtime 17.1 or above - - Args: - ws (WorkspaceClient): Workspace client to interact with Databricks. - debug_env (dict): Test environment variables. 
- """ - if "DATABRICKS_SERVERLESS_COMPUTE_ID" in debug_env: - return # serverless clusters have the required geo functions - - # standard clusters require runtime 17.1 or above - cluster_id = debug_env.get("DATABRICKS_CLUSTER_ID") - if not cluster_id: - raise ValueError("DATABRICKS_CLUSTER_ID is not set in debug_env") - - # Fetch cluster details - cluster_info = ws.clusters.get(cluster_id) - runtime_version = cluster_info.spark_version - - if not runtime_version: - raise ValueError(f"Unable to retrieve runtime version for cluster {cluster_id}") - - # Extract major and minor version numbers - match = re.match(r"(\d+)\.(\d+)", runtime_version) - if not match: - raise ValueError(f"Invalid runtime version format: {runtime_version}") - - major, minor = [int(x) for x in match.groups()] - valid = major > 17 or (major == 17 and minor >= 1) - - if not valid: - pytest.skip("This test requires a cluster with runtime 17.1 or above") diff --git a/tests/integration/test_apply_checks.py b/tests/integration/test_apply_checks.py index 4e597de2..259c6320 100644 --- a/tests/integration/test_apply_checks.py +++ b/tests/integration/test_apply_checks.py @@ -4470,9 +4470,7 @@ def test_apply_checks_all_row_checks_as_yaml_with_streaming(ws, make_schema, mak schema = ( "col1: string, col2: int, col3: int, col4 array, col5: date, col6: timestamp, " "col7: map, col8: struct, col10: int, col11: string, " - "col_ipv4: string, col_ipv6: string, point_geom: string, linestring_geom: string, " - "polygon_geom: string, multipoint_geom: string, multilinestring_geom: string, " - "multipolygon_geom: string, geometrycollection_geom: string" + "col_ipv4: string, col_ipv6: string" ) test_df = spark.createDataFrame( [ @@ -4489,13 +4487,6 @@ def test_apply_checks_all_row_checks_as_yaml_with_streaming(ws, make_schema, mak "val2", "192.168.1.1", "2001:0db8:85a3:08d3:1319:8a2e:0370:7344", - "POINT(1 1)", - "LINESTRING(1 1, 2 2)", - "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", - "MULTIPOINT(1 1, 2 2)", - "MULTILINESTRING((1 1, 2 2))", - "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", - "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", ], [ "val2", @@ -4510,13 +4501,6 @@ def test_apply_checks_all_row_checks_as_yaml_with_streaming(ws, make_schema, mak "val2", "192.168.1.2", "2001:0db8:85a3:08d3:ffff:ffff:ffff:ffff", - "POINT(1 1)", - "LINESTRING(1 1, 2 2)", - "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", - "MULTIPOINT(1 1, 2 2)", - "MULTILINESTRING((1 1, 2 2))", - "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", - "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", ], [ "val3", @@ -4531,13 +4515,6 @@ def test_apply_checks_all_row_checks_as_yaml_with_streaming(ws, make_schema, mak "val2", "192.168.1.3", "2001:db8:85a3:8d3:1319:8a2e:3.112.115.68", - "POINT(1 1)", - "LINESTRING(1 1, 2 2)", - "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", - "MULTIPOINT(1 1, 2 2)", - "MULTILINESTRING((1 1, 2 2))", - "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", - "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", ], ], schema, @@ -4576,13 +4553,6 @@ def test_apply_checks_all_row_checks_as_yaml_with_streaming(ws, make_schema, mak "val2", "192.168.1.1", "2001:0db8:85a3:08d3:1319:8a2e:0370:7344", - "POINT(1 1)", - "LINESTRING(1 1, 2 2)", - "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", - "MULTIPOINT(1 1, 2 2)", - "MULTILINESTRING((1 1, 2 2))", - "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", - "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", 
None, None, ], @@ -4599,13 +4569,6 @@ def test_apply_checks_all_row_checks_as_yaml_with_streaming(ws, make_schema, mak "val2", "192.168.1.2", "2001:0db8:85a3:08d3:ffff:ffff:ffff:ffff", - "POINT(1 1)", - "LINESTRING(1 1, 2 2)", - "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", - "MULTIPOINT(1 1, 2 2)", - "MULTILINESTRING((1 1, 2 2))", - "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", - "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", None, None, ], @@ -4622,6 +4585,119 @@ def test_apply_checks_all_row_checks_as_yaml_with_streaming(ws, make_schema, mak "val2", "192.168.1.3", "2001:db8:85a3:8d3:1319:8a2e:3.112.115.68", + None, + None, + ], + ], + expected_schema, + ) + + assert_df_equality(checked_df, expected, ignore_nullable=True) + + +def test_apply_checks_all_row_geo_checks_as_yaml_with_streaming( + skip_if_runtime_not_geo_compatible, ws, make_schema, make_random, make_volume, spark +): + catalog_name = "main" + schema_name = make_schema(catalog_name=catalog_name).name + input_table_name = f"{catalog_name}.{schema_name}.{make_random(6).lower()}" + output_table_name = f"{catalog_name}.{schema_name}.{make_random(6).lower()}" + volume = make_volume(catalog_name=catalog_name, schema_name=schema_name) + + file_path = Path(__file__).parent.parent / "resources" / "all_row_geo_checks.yaml" + with open(file_path, "r", encoding="utf-8") as f: + checks = yaml.safe_load(f) + + dq_engine = DQEngine(ws) + assert not dq_engine.validate_checks(checks).has_errors + + schema = ( + "col3: int, point_geom: string, linestring_geom: string, " + "polygon_geom: string, multipoint_geom: string, multilinestring_geom: string, " + "multipolygon_geom: string, geometrycollection_geom: string" + ) + test_df = spark.createDataFrame( + [ + [ + 1, + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", + ], + [ + 2, + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", + ], + [ + 3, + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", + ], + ], + schema, + ) + test_df.write.saveAsTable(input_table_name) + streaming_test_df = spark.readStream.table(input_table_name) + + streaming_checked_df = dq_engine.apply_checks_by_metadata(streaming_test_df, checks) + dq_engine.save_results_in_table( + output_df=streaming_checked_df, + output_config=OutputConfig( + location=output_table_name, + mode="append", + trigger={"availableNow": True}, + options={ + "checkpointLocation": f"/Volumes/{volume.catalog_name}/{volume.schema_name}/{volume.name}/{make_random(6).lower()}" + }, + ), + ) + + checked_df = spark.table(output_table_name) + + expected_schema = schema + REPORTING_COLUMNS + expected = spark.createDataFrame( + [ + [ + 1, + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + 
"GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", + None, + None, + ], + [ + 2, + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", + None, + None, + ], + [ + 3, "POINT(1 1)", "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", @@ -4653,6 +4729,8 @@ def test_apply_checks_all_checks_as_yaml(ws, spark): with open(file_path, "r", encoding="utf-8") as f: checks.extend(yaml.safe_load(f)) + # Geo checks are executed in a separate test as they require specific DBR + dq_engine = DQEngine(ws) status = dq_engine.validate_checks(checks) assert not status.has_errors @@ -4660,9 +4738,7 @@ def test_apply_checks_all_checks_as_yaml(ws, spark): schema = ( "col1: string, col2: int, col3: int, col4 array, col5: date, col6: timestamp, " "col7: map, col8: struct, col10: int, col11: string, " - "col_ipv4: string, col_ipv6: string, point_geom: string, linestring_geom: string, " - "polygon_geom: string, multipoint_geom: string, multilinestring_geom: string, " - "multipolygon_geom: string, geometrycollection_geom: string" + "col_ipv4: string, col_ipv6: string" ) test_df = spark.createDataFrame( [ @@ -4679,13 +4755,6 @@ def test_apply_checks_all_checks_as_yaml(ws, spark): "val2", "192.168.1.0", "2001:0db8:85a3:08d3:0000:0000:0000:0001", - "POINT(1 1)", - "LINESTRING(1 1, 2 2)", - "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", - "MULTIPOINT(1 1, 2 2)", - "MULTILINESTRING((1 1, 2 2))", - "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", - "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", ], [ "val2", @@ -4700,13 +4769,6 @@ def test_apply_checks_all_checks_as_yaml(ws, spark): "val2", "192.168.1.1", "2001:0db8:85a3:08d3:0000:0000:0000:1", - "POINT(1 1)", - "LINESTRING(1 1, 2 2)", - "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", - "MULTIPOINT(1 1, 2 2)", - "MULTILINESTRING((1 1, 2 2))", - "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", - "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", ], [ "val3", @@ -4721,13 +4783,6 @@ def test_apply_checks_all_checks_as_yaml(ws, spark): "val2", "192.168.1.2", "2001:0db8:85a3:08d3:0000::2", - "POINT(1 1)", - "LINESTRING(1 1, 2 2)", - "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", - "MULTIPOINT(1 1, 2 2)", - "MULTILINESTRING((1 1, 2 2))", - "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", - "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", ], ], schema, @@ -4754,13 +4809,6 @@ def test_apply_checks_all_checks_as_yaml(ws, spark): "val2", "192.168.1.0", "2001:0db8:85a3:08d3:0000:0000:0000:0001", - "POINT(1 1)", - "LINESTRING(1 1, 2 2)", - "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", - "MULTIPOINT(1 1, 2 2)", - "MULTILINESTRING((1 1, 2 2))", - "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", - "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", None, None, ], @@ -4777,13 +4825,6 @@ def test_apply_checks_all_checks_as_yaml(ws, spark): "val2", "192.168.1.1", "2001:0db8:85a3:08d3:0000:0000:0000:1", - "POINT(1 1)", - "LINESTRING(1 1, 2 2)", - "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", - "MULTIPOINT(1 1, 2 2)", - "MULTILINESTRING((1 1, 2 2))", - "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", - "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", None, None, ], @@ -4800,6 
+4841,100 @@ def test_apply_checks_all_checks_as_yaml(ws, spark): "val2", "192.168.1.2", "2001:0db8:85a3:08d3:0000::2", + None, + None, + ], + ], + expected_schema, + ) + assert_df_equality(checked, expected, ignore_nullable=True) + + +def test_apply_checks_all_geo_checks_as_yaml(skip_if_runtime_not_geo_compatible, ws, spark): + """Test applying all geo checks from a yaml file.""" + file_path = Path(__file__).parent.parent / "resources" / "all_row_geo_checks.yaml" + with open(file_path, "r", encoding="utf-8") as f: + checks = yaml.safe_load(f) + + dq_engine = DQEngine(ws) + status = dq_engine.validate_checks(checks) + assert not status.has_errors + + schema = ( + "col3: int, point_geom: string, linestring_geom: string, " + "polygon_geom: string, multipoint_geom: string, multilinestring_geom: string, " + "multipolygon_geom: string, geometrycollection_geom: string" + ) + test_df = spark.createDataFrame( + [ + [ + 1, + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", + ], + [ + 2, + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", + ], + [ + 3, + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", + ], + ], + schema, + ) + + ref_df = test_df.withColumnRenamed("col1", "ref_col1").withColumnRenamed("col2", "ref_col2") + ref_dfs = {"ref_df_key": ref_df} + + checked = dq_engine.apply_checks_by_metadata(test_df, checks, ref_dfs=ref_dfs) + + expected_schema = schema + REPORTING_COLUMNS + expected = spark.createDataFrame( + [ + [ + 1, + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", + None, + None, + ], + [ + 2, + "POINT(1 1)", + "LINESTRING(1 1, 2 2)", + "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", + "MULTIPOINT(1 1, 2 2)", + "MULTILINESTRING((1 1, 2 2))", + "MULTIPOLYGON(((1 1, 3 1, 3 3, 1 3, 1 1)))", + "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", + None, + None, + ], + [ + 3, "POINT(1 1)", "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", diff --git a/tests/perf/conftest.py b/tests/perf/conftest.py index 642ed0dc..7a1adff7 100644 --- a/tests/perf/conftest.py +++ b/tests/perf/conftest.py @@ -163,12 +163,13 @@ def generated_ipv6_df(spark): @pytest.fixture def generated_geo_df(spark): geo_schema_str = ( - "point_geom: string, linestring_geom: string, polygon_geom: string, multipoint_geom: string, " + "num_col: int, point_geom: string, linestring_geom: string, polygon_geom: string, multipoint_geom: string, " "multilinestring_geom: string, multipolygon_geom: string, geometrycollection_geom: string" ) schema = _parse_datatype_string(geo_schema_str) geo_templates = { + "num_col": "int", "point_geom": "POINT(x x)", "linestring_geom": "LINESTRING(x 
x, x x)", "polygon_geom": "POLYGON((x x, x x, x x, x x))", diff --git a/tests/perf/test_apply_checks.py b/tests/perf/test_apply_checks.py index 5a3aaed0..1f223d47 100644 --- a/tests/perf/test_apply_checks.py +++ b/tests/perf/test_apply_checks.py @@ -1328,8 +1328,35 @@ def test_benchmark_is_ipv6_address_in_cidr(benchmark, ws, generated_ipv6_df, col assert actual_count == EXPECTED_ROWS -@pytest.mark.benchmark(group="test_benchmark_is_geometry") -def test_benchmark_is_geometry(benchmark, ws, generated_geo_df): +def test_benchmark_is_latitude(benchmark, ws, generated_geo_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_latitude, + column="point_geom", + ) + ] + checked = dq_engine.apply_checks(generated_geo_df, checks) + actual_count = benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS + + +def test_benchmark_is_longitude(benchmark, ws, generated_geo_df): + dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) + checks = [ + DQRowRule( + criticality="error", + check_func=geo_check_funcs.is_longitude, + column="point_geom", + ) + ] + checked = dq_engine.apply_checks(generated_geo_df, checks) + actual_count = benchmark(lambda: checked.count()) + assert actual_count == EXPECTED_ROWS + + +def test_benchmark_is_geometry(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) checks = [ DQRowRule( @@ -1343,8 +1370,7 @@ def test_benchmark_is_geometry(benchmark, ws, generated_geo_df): assert actual_count == EXPECTED_ROWS -@pytest.mark.benchmark(group="test_benchmark_is_geography") -def test_benchmark_is_geography(benchmark, ws, generated_geo_df): +def test_benchmark_is_geography(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) checks = [ DQRowRule( @@ -1358,8 +1384,7 @@ def test_benchmark_is_geography(benchmark, ws, generated_geo_df): assert actual_count == EXPECTED_ROWS -@pytest.mark.benchmark(group="test_benchmark_is_point") -def test_benchmark_is_point(benchmark, ws, generated_geo_df): +def test_benchmark_is_point(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) checks = [ DQRowRule( @@ -1373,8 +1398,7 @@ def test_benchmark_is_point(benchmark, ws, generated_geo_df): assert actual_count == EXPECTED_ROWS -@pytest.mark.benchmark(group="test_benchmark_is_linestring") -def test_benchmark_is_linestring(benchmark, ws, generated_geo_df): +def test_benchmark_is_linestring(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) checks = [ DQRowRule( @@ -1388,8 +1412,7 @@ def test_benchmark_is_linestring(benchmark, ws, generated_geo_df): assert actual_count == EXPECTED_ROWS -@pytest.mark.benchmark(group="test_benchmark_is_polygon") -def test_benchmark_is_polygon(benchmark, ws, generated_geo_df): +def test_benchmark_is_polygon(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) checks = [ DQRowRule( @@ -1403,8 +1426,7 @@ def test_benchmark_is_polygon(benchmark, ws, generated_geo_df): assert actual_count == EXPECTED_ROWS -@pytest.mark.benchmark(group="test_benchmark_is_multipoint") -def test_benchmark_is_multipoint(benchmark, ws, 
generated_geo_df): +def test_benchmark_is_multipoint(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) checks = [ DQRowRule( @@ -1418,8 +1440,7 @@ def test_benchmark_is_multipoint(benchmark, ws, generated_geo_df): assert actual_count == EXPECTED_ROWS -@pytest.mark.benchmark(group="test_benchmark_is_multilinestring") -def test_benchmark_is_multilinestring(benchmark, ws, generated_geo_df): +def test_benchmark_is_multilinestring(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) checks = [ DQRowRule( @@ -1433,8 +1454,7 @@ def test_benchmark_is_multilinestring(benchmark, ws, generated_geo_df): assert actual_count == EXPECTED_ROWS -@pytest.mark.benchmark(group="test_benchmark_is_multipolygon") -def test_benchmark_is_multipolygon(benchmark, ws, generated_geo_df): +def test_benchmark_is_multipolygon(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) checks = [ DQRowRule( @@ -1448,8 +1468,7 @@ def test_benchmark_is_multipolygon(benchmark, ws, generated_geo_df): assert actual_count == EXPECTED_ROWS -@pytest.mark.benchmark(group="test_benchmark_is_geometrycollection") -def test_benchmark_is_geometrycollection(benchmark, ws, generated_geo_df): +def test_benchmark_is_geometrycollection(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) checks = [ DQRowRule( @@ -1463,8 +1482,7 @@ def test_benchmark_is_geometrycollection(benchmark, ws, generated_geo_df): assert actual_count == EXPECTED_ROWS -@pytest.mark.benchmark(group="test_benchmark_is_ogc_valid") -def test_benchmark_is_ogc_valid(benchmark, ws, generated_geo_df): +def test_benchmark_is_ogc_valid(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) checks = [ DQRowRule( @@ -1478,8 +1496,7 @@ def test_benchmark_is_ogc_valid(benchmark, ws, generated_geo_df): assert actual_count == EXPECTED_ROWS -@pytest.mark.benchmark(group="test_benchmark_is_non_empty_geometry") -def test_benchmark_is_non_empty_geometry(benchmark, ws, generated_geo_df): +def test_benchmark_is_non_empty_geometry(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) checks = [ DQRowRule( @@ -1493,8 +1510,7 @@ def test_benchmark_is_non_empty_geometry(benchmark, ws, generated_geo_df): assert actual_count == EXPECTED_ROWS -@pytest.mark.benchmark(group="test_benchmark_has_dimension") -def test_benchmark_has_dimension(benchmark, ws, generated_geo_df): +def test_benchmark_has_dimension(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) checks = [ DQRowRule( @@ -1509,8 +1525,7 @@ def test_benchmark_has_dimension(benchmark, ws, generated_geo_df): assert actual_count == EXPECTED_ROWS -@pytest.mark.benchmark(group="test_benchmark_has_x_coordinate_between") -def test_benchmark_has_x_coordinate_between(benchmark, ws, generated_geo_df): +def test_benchmark_has_x_coordinate_between(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) checks = [ DQRowRule( @@ -1525,8 +1540,7 @@ def 
test_benchmark_has_x_coordinate_between(benchmark, ws, generated_geo_df): assert actual_count == EXPECTED_ROWS -@pytest.mark.benchmark(group="test_benchmark_has_y_coordinate_between") -def test_benchmark_has_y_coordinate_between(benchmark, ws, generated_geo_df): +def test_benchmark_has_y_coordinate_between(skip_if_runtime_not_geo_compatible, benchmark, ws, generated_geo_df): dq_engine = DQEngine(workspace_client=ws, extra_params=EXTRA_PARAMS) checks = [ DQRowRule( diff --git a/tests/resources/all_row_checks.yaml b/tests/resources/all_row_checks.yaml index 89757192..75de531e 100644 --- a/tests/resources/all_row_checks.yaml +++ b/tests/resources/all_row_checks.yaml @@ -424,123 +424,6 @@ column: col_ipv6 cidr_block: "2001:0db8:85a3:08d3:0000:0000:0000:0000/64" -# is_latitude check -- criticality: error - check: - function: is_latitude - arguments: - column: col3 - -# is_longitude check -- criticality: error - check: - function: is_longitude - arguments: - column: col3 - -# is_geometry check -- criticality: error - check: - function: is_geometry - arguments: - column: point_geom - -# is_geography check -- criticality: error - check: - function: is_geography - arguments: - column: point_geom - -# is_point check -- criticality: error - check: - function: is_point - arguments: - column: point_geom - -# is_linestring check -- criticality: error - check: - function: is_linestring - arguments: - column: linestring_geom - -# is_polygon check -- criticality: error - check: - function: is_polygon - arguments: - column: polygon_geom - -# is_multipoint check -- criticality: error - check: - function: is_multipoint - arguments: - column: multipoint_geom - -# is_multilinestring check -- criticality: error - check: - function: is_multilinestring - arguments: - column: multilinestring_geom - -# is_multipolygon check -- criticality: error - check: - function: is_multipolygon - arguments: - column: multipolygon_geom - -# is_geometrycollection check -- criticality: error - check: - function: is_geometrycollection - arguments: - column: geometrycollection_geom - -# is_ogc_valid check -- criticality: error - check: - function: is_ogc_valid - arguments: - column: point_geom - -# is_non_empty_geometry check -- criticality: error - check: - function: is_non_empty_geometry - arguments: - column: point_geom - -# has_dimension check -- criticality: error - check: - function: has_dimension - arguments: - column: polygon_geom - dimension: 2 - -# has_x_coordinate_between check -- criticality: error - check: - function: has_x_coordinate_between - arguments: - column: polygon_geom - min_value: 0.0 - max_value: 10.0 - -# has_y_coordinate_between check -- criticality: error - check: - function: has_y_coordinate_between - arguments: - column: polygon_geom - min_value: 0.0 - max_value: 10.0 - # is_data_fresh check with base_timestamp column as string - criticality: error check: diff --git a/tests/resources/all_row_geo_checks.yaml b/tests/resources/all_row_geo_checks.yaml new file mode 100644 index 00000000..8cc66ed7 --- /dev/null +++ b/tests/resources/all_row_geo_checks.yaml @@ -0,0 +1,119 @@ +# The checks used in the test are also showcased in the docs under /docs/reference/quality_checks.mdx +# The checks should be kept up to date with the docs to make sure the documentation examples are validated. 
+ +# is_latitude check +- criticality: error + check: + function: is_latitude + arguments: + column: col3 + +# is_longitude check +- criticality: error + check: + function: is_longitude + arguments: + column: col3 + +# is_geometry check +- criticality: error + check: + function: is_geometry + arguments: + column: point_geom + +# is_geography check +- criticality: error + check: + function: is_geography + arguments: + column: point_geom + +# is_point check +- criticality: error + check: + function: is_point + arguments: + column: point_geom + +# is_linestring check +- criticality: error + check: + function: is_linestring + arguments: + column: linestring_geom + +# is_polygon check +- criticality: error + check: + function: is_polygon + arguments: + column: polygon_geom + +# is_multipoint check +- criticality: error + check: + function: is_multipoint + arguments: + column: multipoint_geom + +# is_multilinestring check +- criticality: error + check: + function: is_multilinestring + arguments: + column: multilinestring_geom + +# is_multipolygon check +- criticality: error + check: + function: is_multipolygon + arguments: + column: multipolygon_geom + +# is_geometrycollection check +- criticality: error + check: + function: is_geometrycollection + arguments: + column: geometrycollection_geom + +# is_ogc_valid check +- criticality: error + check: + function: is_ogc_valid + arguments: + column: point_geom + +# is_non_empty_geometry check +- criticality: error + check: + function: is_non_empty_geometry + arguments: + column: point_geom + +# has_dimension check +- criticality: error + check: + function: has_dimension + arguments: + column: polygon_geom + dimension: 2 + +# has_x_coordinate_between check +- criticality: error + check: + function: has_x_coordinate_between + arguments: + column: polygon_geom + min_value: 0.0 + max_value: 10.0 + +# has_y_coordinate_between check +- criticality: error + check: + function: has_y_coordinate_between + arguments: + column: polygon_geom + min_value: 0.0 + max_value: 10.0 From 77f0e48da79f80a654a8f867890364feabc2b0d4 Mon Sep 17 00:00:00 2001 From: Marcin Wojtyczka Date: Thu, 2 Oct 2025 23:46:53 +0200 Subject: [PATCH 47/47] updated tests --- tests/integration/test_apply_checks.py | 221 +++++++++++++++---------- tests/resources/all_row_checks.yaml | 34 ++-- 2 files changed, 155 insertions(+), 100 deletions(-) diff --git a/tests/integration/test_apply_checks.py b/tests/integration/test_apply_checks.py index 259c6320..bec66c90 100644 --- a/tests/integration/test_apply_checks.py +++ b/tests/integration/test_apply_checks.py @@ -5539,6 +5539,143 @@ def test_apply_checks_all_checks_using_classes(ws, spark): user_metadata={"tag1": "value9", "tag2": "036"}, check_func_kwargs={"cidr_block": "2001:0db8:85a3:08d3:0000:0000:0000:0000/64"}, ), + # is_data_fresh check + DQRowRule( + criticality="error", + check_func=check_funcs.is_data_fresh, + column="col5", + check_func_kwargs={"max_age_minutes": 18000, "base_timestamp": "col6"}, + ), + # is_data_fresh_per_time_window check + DQDatasetRule( + criticality="error", + check_func=check_funcs.is_data_fresh_per_time_window, + column="col6", + check_func_kwargs={"window_minutes": 1, "min_records_per_window": 1, "lookback_windows": 3}, + ), + ] + + dq_engine = DQEngine(ws) + + schema = ( + "col1: string, col2: int, col3: int, col4 array, col5: date, col6: timestamp, " + "col7: map, col8: struct, col10: int, col11: string, " + "col_ipv4: string, col_ipv6: string" + ) + test_df = spark.createDataFrame( + [ + [ + "val1", + 1, + 1, + 
+                [1],
+                datetime(2025, 1, 2).date(),
+                datetime(2025, 1, 12, 1, 0, 0),
+                {"key1": 1},
+                {"field1": 1},
+                2,
+                "val2",
+                "255.255.255.255",
+                "2001:0db8:85a3:08d3:1319:8a2e:0370:7344",
+            ],
+            [
+                "val2",
+                2,
+                2,
+                [2],
+                datetime(2025, 1, 2).date(),
+                datetime(2025, 1, 12, 2, 0, 0),
+                {"key1": 1},
+                {"field1": 1},
+                2,
+                "val2",
+                "255.255.255.1",
+                "2001:0db8:85a3:08d3:ffff:ffff:ffff:ffff",
+            ],
+            [
+                "val3",
+                3,
+                3,
+                [3],
+                datetime(2025, 1, 2).date(),
+                datetime(2025, 1, 12, 3, 0, 0),
+                {"key1": 1},
+                {"field1": 1},
+                2,
+                "val2",
+                "255.255.255.2",
+                "2001:db8:85a3:8d3:1319:8a2e:3.112.115.68",
+            ],
+        ],
+        schema,
+    )
+
+    checked = dq_engine.apply_checks(test_df, checks)
+
+    expected_schema = schema + REPORTING_COLUMNS
+    expected = spark.createDataFrame(
+        [
+            [
+                "val1",
+                1,
+                1,
+                [1],
+                datetime(2025, 1, 2).date(),
+                datetime(2025, 1, 12, 1, 0, 0),
+                {"key1": 1},
+                {"field1": 1},
+                2,
+                "val2",
+                "255.255.255.255",
+                "2001:0db8:85a3:08d3:1319:8a2e:0370:7344",
+                None,
+                None,
+            ],
+            [
+                "val2",
+                2,
+                2,
+                [2],
+                datetime(2025, 1, 2).date(),
+                datetime(2025, 1, 12, 2, 0, 0),
+                {"key1": 1},
+                {"field1": 1},
+                2,
+                "val2",
+                "255.255.255.1",
+                "2001:0db8:85a3:08d3:ffff:ffff:ffff:ffff",
+                None,
+                None,
+            ],
+            [
+                "val3",
+                3,
+                3,
+                [3],
+                datetime(2025, 1, 2).date(),
+                datetime(2025, 1, 12, 3, 0, 0),
+                {"key1": 1},
+                {"field1": 1},
+                2,
+                "val2",
+                "255.255.255.2",
+                "2001:db8:85a3:8d3:1319:8a2e:3.112.115.68",
+                None,
+                None,
+            ],
+        ],
+        expected_schema,
+    )
+    assert_df_equality(checked, expected, ignore_nullable=True)
+
+
+def test_apply_checks_all_geo_checks_using_classes(skip_if_runtime_not_geo_compatible, ws, spark):
+    """Test applying all geo checks using DQX classes.
+
+    The checks used in the test are also showcased in the docs under /docs/reference/quality_checks.mdx
+    The checks should be kept up to date with the docs to make sure the documentation examples are validated.
+ """ + checks = [ # is_latitude check DQRowRule( criticality="error", @@ -5721,46 +5858,19 @@ def test_apply_checks_all_checks_using_classes(ws, spark): column=F.col("polygon_geom"), check_func_kwargs={"min_value": 0.0, "max_value": 10.0}, ), - # is_data_fresh check - DQRowRule( - criticality="error", - check_func=check_funcs.is_data_fresh, - column="col5", - check_func_kwargs={"max_age_minutes": 18000, "base_timestamp": "col6"}, - ), - # is_data_fresh_per_time_window check - DQDatasetRule( - criticality="error", - check_func=check_funcs.is_data_fresh_per_time_window, - column="col6", - check_func_kwargs={"window_minutes": 1, "min_records_per_window": 1, "lookback_windows": 3}, - ), ] dq_engine = DQEngine(ws) schema = ( - "col1: string, col2: int, col3: int, col4 array, col5: date, col6: timestamp, " - "col7: map, col8: struct, col10: int, col11: string, " - "col_ipv4: string, col_ipv6: string, point_geom: string, linestring_geom: string, " + "col2: int, point_geom: string, linestring_geom: string, " "polygon_geom: string, multipoint_geom: string, multilinestring_geom: string, " "multipolygon_geom: string, geometrycollection_geom: string" ) test_df = spark.createDataFrame( [ [ - "val1", 1, - 1, - [1], - datetime(2025, 1, 2).date(), - datetime(2025, 1, 12, 1, 0, 0), - {"key1": 1}, - {"field1": 1}, - 2, - "val2", - "255.255.255.255", - "2001:0db8:85a3:08d3:1319:8a2e:0370:7344", "POINT(1 1)", "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", @@ -5770,18 +5880,7 @@ def test_apply_checks_all_checks_using_classes(ws, spark): "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", ], [ - "val2", 2, - 2, - [2], - datetime(2025, 1, 2).date(), - datetime(2025, 1, 12, 2, 0, 0), - {"key1": 1}, - {"field1": 1}, - 2, - "val2", - "255.255.255.1", - "2001:0db8:85a3:08d3:ffff:ffff:ffff:ffff", "POINT(1 1)", "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", @@ -5791,18 +5890,7 @@ def test_apply_checks_all_checks_using_classes(ws, spark): "GEOMETRYCOLLECTION(POINT(1 1), LINESTRING(1 1, 2 2), POLYGON((1 1, 3 1, 3 3, 1 3, 1 1)))", ], [ - "val3", 3, - 3, - [3], - datetime(2025, 1, 2).date(), - datetime(2025, 1, 12, 3, 0, 0), - {"key1": 1}, - {"field1": 1}, - 2, - "val2", - "255.255.255.2", - "2001:db8:85a3:8d3:1319:8a2e:3.112.115.68", "POINT(1 1)", "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", @@ -5821,18 +5909,7 @@ def test_apply_checks_all_checks_using_classes(ws, spark): expected = spark.createDataFrame( [ [ - "val1", - 1, 1, - [1], - datetime(2025, 1, 2).date(), - datetime(2025, 1, 12, 1, 0, 0), - {"key1": 1}, - {"field1": 1}, - 2, - "val2", - "255.255.255.255", - "2001:0db8:85a3:08d3:1319:8a2e:0370:7344", "POINT(1 1)", "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", @@ -5844,18 +5921,7 @@ def test_apply_checks_all_checks_using_classes(ws, spark): None, ], [ - "val2", - 2, 2, - [2], - datetime(2025, 1, 2).date(), - datetime(2025, 1, 12, 2, 0, 0), - {"key1": 1}, - {"field1": 1}, - 2, - "val2", - "255.255.255.1", - "2001:0db8:85a3:08d3:ffff:ffff:ffff:ffff", "POINT(1 1)", "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", @@ -5867,18 +5933,7 @@ def test_apply_checks_all_checks_using_classes(ws, spark): None, ], [ - "val3", - 3, 3, - [3], - datetime(2025, 1, 2).date(), - datetime(2025, 1, 12, 3, 0, 0), - {"key1": 1}, - {"field1": 1}, - 2, - "val2", - "255.255.255.2", - "2001:db8:85a3:8d3:1319:8a2e:3.112.115.68", "POINT(1 1)", "LINESTRING(1 1, 2 2)", "POLYGON((1 1, 3 1, 3 3, 1 3, 1 1))", diff --git 
a/tests/resources/all_row_checks.yaml b/tests/resources/all_row_checks.yaml index 75de531e..a7e8f47d 100644 --- a/tests/resources/all_row_checks.yaml +++ b/tests/resources/all_row_checks.yaml @@ -30,9 +30,9 @@ arguments: column: col2 allowed: - - 1 - - 2 - - 3 + - 1 + - 2 + - 3 # is_not_null_and_is_in_list check - criticality: error @@ -41,9 +41,9 @@ arguments: column: col2 allowed: - - 1 - - 2 - - 3 + - 1 + - 2 + - 3 # is_not_null_and_not_empty_array check - criticality: error @@ -279,7 +279,7 @@ function: regex_match arguments: column: col2 - regex: "[0-9]+" + regex: '[0-9]+' negate: false # sql_expression check @@ -300,16 +300,16 @@ expression: col3 >= col2 and col3 <= 10 msg: col3 is less than col2 and col3 is greater than 10 columns: # optional for reporting - - col2 - - col3 + - col2 + - col3 # apply check to multiple columns - criticality: error check: function: is_not_null # 'column' as first argument for_each_column: # apply the check for each column in the list - - col3 - - col5 + - col3 + - col5 # is_not_null check applied to a struct column element (dot notation) - criticality: error @@ -389,10 +389,10 @@ check: function: is_not_null for_each_column: - - col1 # col - - col8.field1 # struct col - - try_element_at(col7, 'key1') # map col - - try_element_at(col4, 1) # array col + - col1 # col + - col8.field1 # struct col + - try_element_at(col7, 'key1') # map col + - try_element_at(col4, 1) # array col # is_valid_ipv4_address check - criticality: error @@ -407,7 +407,7 @@ function: is_ipv4_address_in_cidr arguments: column: col_ipv4 - cidr_block: "192.168.1.0/24" + cidr_block: '192.168.1.0/24' # is_valid_ipv6_address check - criticality: error @@ -422,7 +422,7 @@ function: is_ipv6_address_in_cidr arguments: column: col_ipv6 - cidr_block: "2001:0db8:85a3:08d3:0000:0000:0000:0000/64" + cidr_block: '2001:0db8:85a3:08d3:0000:0000:0000:0000/64' # is_data_fresh check with base_timestamp column as string - criticality: error