From 90247309e4247be877559c88b072a17f958ffed9 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Thu, 2 Oct 2025 14:56:45 +0200
Subject: [PATCH 01/24] Fix accidental boolean value where a string is
 intended.

---
 labs.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/labs.yml b/labs.yml
index 3fa55c156..a12f65222 100644
--- a/labs.yml
+++ b/labs.yml
@@ -37,7 +37,7 @@ commands:
         default: null
       - name: skip-validation
         description: Validate Transpiled Code, default True validation skipped, False validate
-        default: true
+        default: "true"
       - name: catalog-name
         description: Catalog Name Applicable only when Validation Mode is DATABRICKS
         default: null

From 722597165fff00f8f05aec49c8e35118b1e3ba20 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Thu, 2 Oct 2025 14:58:26 +0200
Subject: [PATCH 02/24] Implement support for --overrides-file and
 --target-technology arguments during `transpile`.

---
 labs.yml                              |   6 ++
 src/databricks/labs/lakebridge/cli.py |  51 +++++++++-
 tests/unit/test_cli_transpile.py      | 131 ++++++++++++++++++++++++--
 3 files changed, 177 insertions(+), 11 deletions(-)

diff --git a/labs.yml b/labs.yml
index a12f65222..9b56d9ad0 100644
--- a/labs.yml
+++ b/labs.yml
@@ -26,6 +26,12 @@ commands:
       - name: source-dialect
         description: Dialect name as selected during `install-transpile` or refer to documentation
         default: null
+      - name: overrides-file
+        description: Path to a file containing transpiler overrides, if supported by the transpiler in use.
+        default: null
+      - name: target-technology
+        description: Target technology to use for code generation, if supported by the transpiler in use.
+        default: null
       - name: input-source
         description: Input Script Folder or File
         default: null
diff --git a/src/databricks/labs/lakebridge/cli.py b/src/databricks/labs/lakebridge/cli.py
index 4df7d4c5d..3b306ebe8 100644
--- a/src/databricks/labs/lakebridge/cli.py
+++ b/src/databricks/labs/lakebridge/cli.py
@@ -16,7 +16,7 @@
 from databricks.labs.blueprint.cli import App
 from databricks.labs.blueprint.entrypoint import get_logger, is_in_debug
-from databricks.labs.blueprint.installation import RootJsonValue
+from databricks.labs.blueprint.installation import RootJsonValue, JsonObject
 from databricks.labs.blueprint.tui import Prompts
 
@@ -91,6 +91,8 @@ def transpile(
     w: WorkspaceClient,
     transpiler_config_path: str | None = None,
     source_dialect: str | None = None,
+    overrides_file: str | None = None,
+    target_technology: str | None = None,
     input_source: str | None = None,
     output_folder: str | None = None,
     error_file_path: str | None = None,
@@ -106,6 +108,8 @@ def transpile(
     checker = _TranspileConfigChecker(ctx.transpile_config, ctx.prompts, transpiler_repository)
     checker.use_transpiler_config_path(transpiler_config_path)
     checker.use_source_dialect(source_dialect)
+    checker.use_overrides_file(overrides_file)
+    checker.use_target_technology(target_technology)
     checker.use_input_source(input_source)
     checker.use_output_folder(output_folder)
     checker.use_error_file_path(error_file_path)
@@ -217,6 +221,36 @@ def use_source_dialect(self, source_dialect: str | None) -> None:
             logger.debug(f"Pending source_dialect override: {source_dialect!r}")
         self._source_dialect_override = source_dialect
 
+    @staticmethod
+    def _validate_overrides_file(overrides_file: str, msg: str) -> None:
+        """Validate the overrides file: it must be a valid path that exists."""
+        # Note: in addition to this check, later we verify the transpiler supports it.
+        if not Path(overrides_file).exists():
+            raise_validation_exception(msg)
+
+    def use_overrides_file(self, overrides_file: str | None) -> None:
+        if overrides_file is not None:
+            logger.debug(f"Setting overrides_file to: {overrides_file!r}")
+            msg = f"Invalid path for '--overrides-file', does not exist: {overrides_file}"
+            self._validate_overrides_file(overrides_file, msg)
+            try:
+                self._set_config_transpiler_option("overrides-file", overrides_file)
+            except ValueError:
+                # TODO: Update the `config.yml` format to disallow incompatible `transpiler_options`.
+                msg = "Cannot use --overrides-file; workspace config.yml has incompatible transpiler_options."
+                raise_validation_exception(msg)
+
+    def use_target_technology(self, target_technology: str | None) -> None:
+        if target_technology is not None:
+            logger.debug(f"Setting target_technology to: {target_technology!r}")
+            # Cannot validate this here: depends on the transpiler engine, and will be checked later.
+            try:
+                self._set_config_transpiler_option("target-tech", target_technology)
+            except ValueError:
+                # TODO: Update the `config.yml` format to disallow incompatible `transpiler_options`.
+                msg = "Cannot use --target-technology; workspace config.yml has incompatible transpiler_options."
+                raise_validation_exception(msg)
+
     @staticmethod
     def _validate_input_source(input_source: str, msg: str) -> None:
         """Validate the input source: it must be a path that exists."""
@@ -321,6 +355,19 @@ def use_schema_name(self, schema_name: str | None) -> None:
             logger.debug(f"Setting schema_name to: {schema_name!r}")
             self._config = dataclasses.replace(self._config, schema_name=schema_name)
 
+    def _set_config_transpiler_option(self, flag: str, value: str) -> None:
+        transpiler_options: JsonObject
+        match self._config.transpiler_options:
+            case None:
+                transpiler_options = {flag: value}
+            case Mapping() as found_options:
+                transpiler_options = {**found_options, flag: value}
+            case found_options:
+                # TODO: Update `config.yml` to constrain `transpiler_options` to be a dict[str, str].
+                msg = f"Incompatible transpiler options configured, must be a mapping: {found_options!r}"
+                raise ValueError(msg)
+        self._config = dataclasses.replace(self._config, transpiler_options=transpiler_options)
+
     def _configure_transpiler_config_path(self, source_dialect: str) -> TranspileEngine | None:
         """Configure the transpiler config path based on the requested source dialect."""
         # Names of compatible transpiler engines for the given dialect.
@@ -450,6 +497,8 @@ def _check_transpiler_options(self, engine: TranspileEngine) -> None:
         transpiler_options = self._config.transpiler_options
         if not isinstance(transpiler_options, Mapping):
             return
+        # Only checks if the option is present, does not validate the value.
+        # TODO: Validate the value for CHOICE/FORCE/CONFIRM options.
         checked_options = {
             option.flag: (
                 transpiler_options[option.flag]
diff --git a/tests/unit/test_cli_transpile.py b/tests/unit/test_cli_transpile.py
index 42c7bfc4c..ab39a40b4 100644
--- a/tests/unit/test_cli_transpile.py
+++ b/tests/unit/test_cli_transpile.py
@@ -41,11 +41,20 @@ def stubbed_transpiler_config_path(tmp_path: Path) -> Path:
         "options": {
             "all": [
                 {
-                    "flag": "-experimental",
-                    "method": "CONFIRM",
-                    "prompt": "Do you want to use the experimental Databricks generator ?",
+                    "flag": "overrides-file",
+                    "method": "QUESTION",
+                    "prompt": "Specify the config file to override",
+                    "default": "",
                 }
-            ]
+            ],
+            "informatica pc": [
+                {
+                    "flag": "target-tech",
+                    "method": "CHOICE",
+                    "prompt": "Specify which technology should be generated",
+                    "choices": ["SPARKSQL", "PYSPARK"],
+                },
+            ],
         },
     }
 
@@ -288,6 +297,101 @@ def test_transpile_with_invalid_input_source(
         cli.transpile(w=ws, input_source="invalid_path", transpiler_repository=transpiler_repository)
 
 
+def test_transpile_overrides_file_specified(
+    mock_cli_for_transpile,
+    transpiler_repository: TranspilerRepository,
+    tmp_path: Path,
+) -> None:
+    """Verify that the overrides file can be manually specified and is passed to the transpiler."""
+    ws, cfg, _, do_transpile = mock_cli_for_transpile
+    overrides_path = tmp_path / "overrides.json"
+    overrides_path.write_text("{}")
+
+    cli.transpile(
+        w=ws,
+        transpiler_config_path=cfg.transpiler_config_path,
+        source_dialect=cfg.source_dialect,
+        overrides_file=str(overrides_path),
+        input_source=cfg.input_source,
+        output_folder=cfg.output_folder,
+        error_file_path=cfg.error_file_path,
+        skip_validation=str(cfg.skip_validation),
+        catalog_name=cfg.catalog_name,
+        schema_name=cfg.schema_name,
+        transpiler_repository=transpiler_repository,
+    )
+    do_transpile.assert_called_once_with(
+        ws,
+        ANY,
+        TranspileConfig(
+            transpiler_config_path=cfg.transpiler_config_path,
+            source_dialect=cfg.source_dialect,
+            input_source=cfg.input_source,
+            output_folder=cfg.output_folder,
+            error_file_path=cfg.error_file_path,
+            sdk_config=cfg.sdk_config,
+            skip_validation=cfg.skip_validation,
+            catalog_name=cfg.catalog_name,
+            schema_name=cfg.schema_name,
+            transpiler_options={"overrides-file": str(overrides_path)},
+        ),
+    )
+
+
+def test_transpile_invalid_overrides_file_specified(
+    mock_cli_for_transpile,
+    transpiler_repository: TranspilerRepository,
+    tmp_path: Path,
+) -> None:
+    """Verify that the overrides file argument is checked for whether it is a valid path."""
+    ws, _, _, _ = mock_cli_for_transpile
+    with pytest.raises(
+        ValueError, match=re.escape("Invalid path for '--overrides-file', does not exist: does_not_exist.json")
+    ):
+        cli.transpile(w=ws, overrides_file="does_not_exist.json", transpiler_repository=transpiler_repository)
+
+
+def test_transpile_target_technology_specified(
+    mock_cli_for_transpile,
+    transpiler_repository: TranspilerRepository,
+) -> None:
+    """Verify that the target technology can be manually specified and is passed to the transpiler."""
+    ws, cfg, set_cfg, do_transpile = mock_cli_for_transpile
+    cfg.source_dialect = "informatica pc"
+    cfg.transpiler_options = {"overrides-file": "a_file.json"}
+    set_cfg(cfg)
+
+    cli.transpile(
+        w=ws,
+        transpiler_config_path=cfg.transpiler_config_path,
+        source_dialect=cfg.source_dialect,
+        target_technology="PYSPARK",
+        input_source=cfg.input_source,
+        output_folder=cfg.output_folder,
+        error_file_path=cfg.error_file_path,
+        skip_validation=str(cfg.skip_validation),
+        catalog_name=cfg.catalog_name,
+        schema_name=cfg.schema_name,
+        transpiler_repository=transpiler_repository,
+    )
+    do_transpile.assert_called_once_with(
+        ws,
+        ANY,
+        TranspileConfig(
+            transpiler_config_path=cfg.transpiler_config_path,
+            source_dialect=cfg.source_dialect,
+            input_source=cfg.input_source,
+            output_folder=cfg.output_folder,
+            error_file_path=cfg.error_file_path,
+            sdk_config=cfg.sdk_config,
+            skip_validation=cfg.skip_validation,
+            catalog_name=cfg.catalog_name,
+            schema_name=cfg.schema_name,
+            transpiler_options={**cfg.transpiler_options, "target-tech": "PYSPARK"},
+        ),
+    )
+
+
 def test_transpile_with_valid_inputs(
     mock_cli_for_transpile, transpiler_config_path: Path, transpiler_repository: TranspilerRepository
 ) -> None:
@@ -397,10 +501,17 @@ def test_describe_transpile(mock_cli_transpile_no_config, transpiler_repository:
     (out, _) = capsys.readouterr()
     json_description = json.loads(out)
 
-    experimental_option = {
-        "flag": "-experimental",
-        "method": "CONFIRM",
-        "prompt": "Do you want to use the experimental Databricks generator ?",
+    overrides_file_option = {
+        "flag": "overrides-file",
+        "method": "QUESTION",
+        "prompt": "Specify the config file to override",
+        "default": "",
+    }
+    target_tech_option = {
+        "flag": "target-tech",
+        "method": "CHOICE",
+        "prompt": "Specify which technology should be generated",
+        "choices": ["SPARKSQL", "PYSPARK"],
     }
 
     assert json_description == {
@@ -414,8 +525,8 @@ def test_describe_transpile(mock_cli_transpile_no_config, transpiler_repository:
             "config-path": str(transpiler_repository.transpilers_path() / "stub-transpiler" / "lib" / "config.yml"),
             "versions": {"installed": None, "latest": None},
             "supported-dialects": {
-                "informatica pc": {"options": [experimental_option]},
-                "snowflake": {"options": [experimental_option]},
+                "informatica pc": {"options": [overrides_file_option, target_tech_option]},
+                "snowflake": {"options": [overrides_file_option]},
             },
         }
     ],

From aeb6c3f52a4c2ed57383c823bee0c2b8e04c90e6 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Thu, 2 Oct 2025 19:50:18 +0200
Subject: [PATCH 03/24] Mark static fixtures as session-scope.

---
 tests/conftest.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index c4cc06953..8e35f0d99 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -348,7 +348,7 @@ def mock_data_source():
     return MockDataSource({}, {})
 
 
-@pytest.fixture
+@pytest.fixture(scope="session")
 def bladebridge_artifact() -> Path:
     artifact = (
         Path(__file__).parent
@@ -362,7 +362,7 @@
 
 
-@pytest.fixture
+@pytest.fixture(scope="session")
 def morpheus_artifact() -> Path:
     artifact = (
         Path(__file__).parent

From cbdec925bcbc5cf629d60f3d6d603f174750dbe Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Thu, 2 Oct 2025 19:54:00 +0200
Subject: [PATCH 04/24] Refactor Bladebridge integration tests to use the CLI
 entrypoint.

Other changes include:
- Always test against the current (published) version of the BB plugin.
- Only install and set up the plugin once.
- Include the config loading (from the workspace) and verification.
- Check more of the details from running the transpiler, including the JSON
  output summary.
- Remove the async bits.
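
As an illustration of the new shape of these tests: they now drive the same
code path a user does. A simplified sketch (the fixture names are the ones
introduced by this patch; `transpile_config` stands for whatever
TranspileConfig the test builds):

    # Before: drive the LSP engine directly, with asyncio plumbing.
    #   config_path, lsp_engine = _install_bladebridge(transpiler_repository, artifact)
    #   await transpile(ws, lsp_engine, transpile_config)
    # After: save the config to the (mock) workspace installation and run the
    # CLI entrypoint end to end.
    application_ctx.installation.save(transpile_config)
    cli.transpile(
        w=application_ctx.workspace_client,
        ctx=application_ctx,
        transpiler_repository=repository_with_bladebridge,
    )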
---
 src/databricks/labs/lakebridge/cli.py         |   7 +-
 .../integration/transpile/test_bladebridge.py | 176 ++++++++++++------
 2 files changed, 123 insertions(+), 60 deletions(-)

diff --git a/src/databricks/labs/lakebridge/cli.py b/src/databricks/labs/lakebridge/cli.py
index 3b306ebe8..5109711e2 100644
--- a/src/databricks/labs/lakebridge/cli.py
+++ b/src/databricks/labs/lakebridge/cli.py
@@ -86,7 +86,7 @@ def _remove_warehouse(ws: WorkspaceClient, warehouse_id: str):
 
 
 @lakebridge.command
-def transpile(
+def transpile(  # pylint: disable=too-many-arguments
     *,
     w: WorkspaceClient,
     transpiler_config_path: str | None = None,
@@ -99,10 +99,13 @@ def transpile(
     skip_validation: str | None = None,
     catalog_name: str | None = None,
     schema_name: str | None = None,
+    ctx: ApplicationContext | None = None,
     transpiler_repository: TranspilerRepository = TranspilerRepository.user_home(),
 ):
     """Transpiles source dialect to databricks dialect"""
-    ctx = ApplicationContext(w)
+    if ctx is None:
+        ctx = ApplicationContext(w)
+    del w
     logger.debug(f"Preconfigured transpiler config: {ctx.transpile_config!r}")
     ctx.add_user_agent_extra("cmd", "execute-transpile")
     checker = _TranspileConfigChecker(ctx.transpile_config, ctx.prompts, transpiler_repository)
diff --git a/tests/integration/transpile/test_bladebridge.py b/tests/integration/transpile/test_bladebridge.py
index adf4210d8..c2557d7f0 100644
--- a/tests/integration/transpile/test_bladebridge.py
+++ b/tests/integration/transpile/test_bladebridge.py
@@ -1,81 +1,143 @@
+import json
 import logging
+from collections.abc import Generator
+from functools import cached_property
 from pathlib import Path
 
+import pytest
+from databricks.labs.blueprint.wheels import ProductInfo
 from databricks.sdk import WorkspaceClient
 
+from databricks.labs.lakebridge import cli
 from databricks.labs.lakebridge.config import TranspileConfig
-from databricks.labs.lakebridge.transpiler.execute import transpile
+from databricks.labs.lakebridge.contexts.application import ApplicationContext
 from databricks.labs.lakebridge.transpiler.installers import WheelInstaller
-from databricks.labs.lakebridge.transpiler.lsp.lsp_engine import LSPEngine
 from databricks.labs.lakebridge.transpiler.repository import TranspilerRepository
 
-from .common_utils import run_transpile_and_assert
+from .common_utils import assert_sql_outputs
 
 logger = logging.getLogger(__name__)
 
 
-def _install_bladebridge(transpiler_repository: TranspilerRepository, bladebridge_artifact: Path | None) -> tuple:
-    WheelInstaller(transpiler_repository, "bladebridge", "databricks-bb-plugin", bladebridge_artifact).install()
-    config_path = transpiler_repository.transpiler_config_path("Bladebridge")
-    return config_path, LSPEngine.from_config_path(config_path)
+@pytest.fixture(scope="module")
+def repository_with_bladebridge(tmp_path_factory) -> TranspilerRepository:
+    """A module-scoped repository with the latest published version of Bladebridge installed, for re-use across tests."""
+    labs_path = tmp_path_factory.mktemp("labs")
+    transpiler_repository = TranspilerRepository(labs_path)
+    path = WheelInstaller(transpiler_repository, "bladebridge", "databricks-bb-plugin").install()
+    assert path is not None and path.exists()
+    return transpiler_repository
 
 
-async def test_transpiles_informatica_with_sparksql(
-    ws: WorkspaceClient,
-    bladebridge_artifact: Path,
-    tmp_path: Path,
-) -> None:
-    labs_path = tmp_path / "labs"
-    output_folder = tmp_path / "output"
-    transpiler_repository = TranspilerRepository(labs_path)
-    await _transpile_informatica_with_sparksql(ws, transpiler_repository, bladebridge_artifact, output_folder)
+class MockApplicationContext(ApplicationContext):
+    """A mock application context that uses a unique installation path."""
 
+    @cached_property
+    def product_info(self) -> ProductInfo:
+        return ProductInfo.for_testing(ApplicationContext)
 
-async def _transpile_informatica_with_sparksql(
-    ws: WorkspaceClient,
-    transpiler_repository: TranspilerRepository,
-    bladebridge_artifact: Path,
-    output_folder: Path,
-) -> None:
-    config_path, lsp_engine = _install_bladebridge(transpiler_repository, bladebridge_artifact)
+
+@pytest.fixture
+def application_ctx(ws: WorkspaceClient) -> Generator[ApplicationContext, None, None]:
+    """A mock application context with a unique installation path, cleaned up after the test."""
+    ctx = MockApplicationContext(ws)
+    yield ctx
+    ctx.installation.remove()
+
+
+def test_transpiles_informatica_to_sparksql(
+    application_ctx: ApplicationContext, repository_with_bladebridge: TranspilerRepository, tmp_path: Path, capsys
+) -> None:
+    """Check that 'transpile' can convert an Informatica (ETL) mapping to SparkSQL using Bladebridge."""
+    # Prepare the application context with a configuration for converting Informatica (ETL)
+    config_path = repository_with_bladebridge.transpiler_config_path("Bladebridge")
     input_source = Path(__file__).parent.parent.parent / "resources" / "functional" / "informatica"
+    output_folder = tmp_path / "output"
+    output_folder.mkdir(parents=True, exist_ok=True)
+    errors_path = output_folder / "errors.log"
     transpile_config = TranspileConfig(
         transpiler_config_path=str(config_path),
         source_dialect="informatica (desktop edition)",
         input_source=str(input_source),
         output_folder=str(output_folder),
+        error_file_path=str(errors_path),
         skip_validation=True,
-        catalog_name="catalog",
-        schema_name="schema",
-        transpiler_options={"target-tech": "SPARKSQL"},
+        transpiler_options={"overrides-file": None, "target-tech": "SPARKSQL"},
     )
+    application_ctx.installation.save(transpile_config)
+
+    # Run the conversion.
+    cli.transpile(
+        w=application_ctx.workspace_client,
+        ctx=application_ctx,
+        transpiler_repository=repository_with_bladebridge,
+    )
+    (out, _) = capsys.readouterr()
 
-    # TODO: Load the engine here, via the validation path.
-    await transpile(ws, lsp_engine, transpile_config)
+    # Check the conversion summary.
+    summary = json.loads(out)
+    assert summary == [
+        {
+            "total_files_processed": 1,
+            "total_queries_processed": 1,
+            "analysis_error_count": 0,
+            "parsing_error_count": 0,
+            "validation_error_count": 0,
+            "generation_error_count": 0,
+            "error_log_file": None,
+        }
+    ]
 
-    # TODO: This seems to be flaky; debug logging to help diagnose the flakiness.
-    files = [f.name for f in output_folder.iterdir()]
-    logger.debug(f"Transpiled files: {files}")
+    # Check the conversion by merely looking for the files we expect from our reference Informatica mapping.
     assert (output_folder / "m_employees_load.py").exists()
     assert (output_folder / "wf_m_employees_load.json").exists()
     assert (output_folder / "wf_m_employees_load_params.py").exists()
+    # No errors should have been logged, which means the errors file should not exist.
+    assert not errors_path.exists()
 
 
-async def test_transpile_sql_file(ws: WorkspaceClient, tmp_path: Path) -> None:
-    labs_path = tmp_path / "labs"
-    output_folder = tmp_path / "output"
-    transpiler_repository = TranspilerRepository(labs_path)
-    await _transpile_bb_sql_file(ws, transpiler_repository, output_folder)
-
-
-async def _transpile_bb_sql_file(
-    ws: WorkspaceClient,
-    transpiler_repository: TranspilerRepository,
-    bb_output_folder: Path,
+def test_transpile_teradata_sql(
+    application_ctx: ApplicationContext, repository_with_bladebridge: TranspilerRepository, tmp_path: Path, capsys
 ) -> None:
-    # SQL Version installs latest Bladebridge from pypi
-    config_path, lsp_engine = _install_bladebridge(transpiler_repository, None)
-    bb_input_source = Path(__file__).parent.parent.parent / "resources" / "functional" / "teradata" / "integration"
-    # The expected SQL Block is custom formatted to match the output of Bladebridge exactly.
+    """Check that 'transpile' can convert Teradata (SQL) to DBSQL using Bladebridge, and then validate the output."""
+    # Prepare the application context with a configuration for converting Teradata (SQL)
+    config_path = repository_with_bladebridge.transpiler_config_path("Bladebridge")
+    input_source = Path(__file__).parent.parent.parent / "resources" / "functional" / "teradata" / "integration"
+    output_folder = tmp_path / "output"
+    output_folder.mkdir(parents=True, exist_ok=True)
+    errors_path = output_folder / "errors.log"
+    transpile_config = TranspileConfig(
+        transpiler_config_path=str(config_path),
+        source_dialect="teradata",
+        input_source=str(input_source),
+        output_folder=str(output_folder),
+        error_file_path=str(errors_path),
+        skip_validation=False,
+        catalog_name="catalog",
+        schema_name="schema",
+        transpiler_options={"overrides-file": None},
+    )
+    application_ctx.installation.save(transpile_config)
+
+    # Run the conversion.
+    cli.transpile(w=application_ctx.workspace_client, ctx=application_ctx)
+    (out, _) = capsys.readouterr()
+
+    # Check the conversion summary.
+    summary = json.loads(out)
+    assert summary == [
+        {
+            "total_files_processed": 2,
+            "total_queries_processed": 2,
+            "analysis_error_count": 0,
+            "parsing_error_count": 0,
+            "validation_error_count": 1,
+            "generation_error_count": 0,
+            "error_log_file": str(errors_path),
+        }
+    ]
+
+    # Check the output.
+    # Note: these are formatted exactly to match the output of Bladebridge.
     expected_teradata_sql = """CREATE TABLE REF_TABLE
 (
     col1 TINYINT NOT NULL,
     col18 FLOAT NOT NULL,
     PRIMARY KEY (col1,col3)
 ) TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported');"""
-    # The expected SQL Block is custom formatted to match the output of Bladebridge exactly.
     expected_validation_failure_sql = """-------------- Exception Start-------------------
 /*
    [UNRESOLVED_ROUTINE] Cannot resolve routine `cole` on search path [`system`.`builtin`, `system`.`session`, `catalog`.`schema`].
 */
 select cole(hello) world from table;
 ---------------Exception End --------------------"""
-
-    await run_transpile_and_assert(
-        ws,
-        lsp_engine,
-        config_path,
-        bb_input_source,
-        bb_output_folder,
-        "teradata",
-        expected_teradata_sql,
-        expected_validation_failure_sql,
+    assert_sql_outputs(
+        output_folder,
+        expected_sql=expected_teradata_sql,
+        expected_failure_sql=expected_validation_failure_sql,
     )
+
+    # Verify the errors that were reported.
+    reported_errors = list(errors_path.open())
+    [only_error] = reported_errors
+    assert "[UNRESOLVED_ROUTINE] Cannot resolve routine `cole` on search path" in only_error

From aacadf2412308df4090737319bc5a9a96554d534 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Fri, 3 Oct 2025 14:27:07 +0200
Subject: [PATCH 05/24] Formatting.

---
 tests/integration/transpile/test_bladebridge.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tests/integration/transpile/test_bladebridge.py b/tests/integration/transpile/test_bladebridge.py
index c2557d7f0..6629872f4 100644
--- a/tests/integration/transpile/test_bladebridge.py
+++ b/tests/integration/transpile/test_bladebridge.py
@@ -45,7 +45,10 @@
 
 
 def test_transpiles_informatica_to_sparksql(
-    application_ctx: ApplicationContext, repository_with_bladebridge: TranspilerRepository, tmp_path: Path, capsys
+    application_ctx: ApplicationContext,
+    repository_with_bladebridge: TranspilerRepository,
+    tmp_path: Path,
+    capsys,
 ) -> None:
     """Check that 'transpile' can convert an Informatica (ETL) mapping to SparkSQL using Bladebridge."""
     # Prepare the application context with a configuration for converting Informatica (ETL)
@@ -96,7 +99,10 @@
 
 
 def test_transpile_teradata_sql(
-    application_ctx: ApplicationContext, repository_with_bladebridge: TranspilerRepository, tmp_path: Path, capsys
+    application_ctx: ApplicationContext,
+    repository_with_bladebridge: TranspilerRepository,
+    tmp_path: Path,
+    capsys,
 ) -> None:

From af75dd0924762a2866950872c87a0dd15c0e896b Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Mon, 6 Oct 2025 18:03:14 +0200
Subject: [PATCH 06/24] Missed a stray encoding.

---
 tests/unit/test_cli_transpile.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/test_cli_transpile.py b/tests/unit/test_cli_transpile.py
index ab39a40b4..ace74e43c 100644
--- a/tests/unit/test_cli_transpile.py
+++ b/tests/unit/test_cli_transpile.py
@@ -305,7 +305,7 @@ def test_transpile_overrides_file_specified(
     """Verify that the overrides file can be manually specified and is passed to the transpiler."""
     ws, cfg, _, do_transpile = mock_cli_for_transpile
     overrides_path = tmp_path / "overrides.json"
-    overrides_path.write_text("{}")
+    overrides_path.write_text("{}", encoding="utf-8")

From 3dbab6f0151641bac486510baa0fa8d8a7425aca Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Mon, 6 Oct 2025 18:06:32 +0200
Subject: [PATCH 07/24] Ensure existing unit tests properly set and verify the
 transpiler_options attribute of the configuration.
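
For reference, the shape being pinned down looks like this (a sketch with
illustrative values, assuming the remaining TranspileConfig fields default to
None):

    from databricks.labs.lakebridge.config import TranspileConfig

    # 'overrides-file': None records that the user declined to supply a value;
    # 'target-tech' only appears for dialects that declare that option.
    expected_config = TranspileConfig(
        transpiler_config_path="sqlglot",
        source_dialect="informatica pc",
        transpiler_options={"overrides-file": None, "target-tech": "PYSPARK"},
    )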
---
 tests/unit/test_cli_transpile.py | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/tests/unit/test_cli_transpile.py b/tests/unit/test_cli_transpile.py
index ace74e43c..15f7f5314 100644
--- a/tests/unit/test_cli_transpile.py
+++ b/tests/unit/test_cli_transpile.py
@@ -127,6 +127,7 @@ async def do_transpile(*args, **kwargs):
         skip_validation=True,
         catalog_name="my_catalog",
         schema_name="my_schema",
+        transpiler_options={"overrides-file": None},
     )
 
     mock_app_context.return_value.workspace_client = mock_workspace_client
@@ -156,6 +157,7 @@ async def do_transpile(*args, **kwargs):
             "Select the source dialect.*": "0",
             "Enter input SQL path.*": str(empty_input_source),
             "Enter output folder.*": str(output_folder),
+            "Specify which technology should be generated.*": "0",
         }
     )
     mock_app_context = create_autospec(ApplicationContext)
@@ -176,6 +178,10 @@ async def do_transpile(*args, **kwargs):
         skip_validation=False,
         catalog_name="remorph",
         schema_name="transpiler",
+        transpiler_options={
+            "overrides-file": None,
+            "target-tech": "PYSPARK",
+        },
     )
 
     yield mock_workspace_client, expected_config, mock_transpile
@@ -421,6 +427,7 @@ def test_transpile_with_valid_inputs(
             skip_validation=cfg.skip_validation,
             catalog_name=cfg.catalog_name,
             schema_name=cfg.schema_name,
+            transpiler_options=cfg.transpiler_options,
         ),
     )
 
@@ -428,6 +435,8 @@ def test_transpile_prints_errors(
     caplog, tmp_path: Path, mock_workspace_client: WorkspaceClient, transpiler_repository: TranspilerRepository
 ) -> None:
+    prompts = MockPrompts({"Do you want to use the experimental.*": "no"})
+    ctx = ApplicationContext(ws=mock_workspace_client).replace(prompts=prompts)
     input_source = path_to_resource("lsp_transpiler", "unsupported_lca.sql")
     with caplog.at_level("ERROR"):
         cli.transpile(
@@ -439,6 +448,7 @@ def test_transpile_prints_errors(
             skip_validation="true",
             catalog_name="my_catalog",
             schema_name="my_schema",
+            ctx=ctx,
             transpiler_repository=transpiler_repository,
         )
 
@@ -450,7 +460,12 @@ def test_transpile_informatica_transpiler_dialect(
 ) -> None:
     ws, cfg, _, do_transpile = mock_cli_for_transpile
     # Test with Informatica PC dialect ensure user agent handles sources dialect with spaces in them
-    cli.transpile(w=ws, source_dialect="informatica pc", transpiler_repository=transpiler_repository)
+    cli.transpile(
+        w=ws,
+        source_dialect="informatica pc",
+        target_technology="PYSPARK",
+        transpiler_repository=transpiler_repository,
+    )
     do_transpile.assert_called_once_with(
         ws,
         ANY,
@@ -464,6 +479,7 @@ def test_transpile_informatica_transpiler_dialect(
             skip_validation=cfg.skip_validation,
             catalog_name=cfg.catalog_name,
             schema_name=cfg.schema_name,
+            transpiler_options={**cfg.transpiler_options, "target-tech": "PYSPARK"},
         ),
     )
 
@@ -485,7 +501,11 @@ def test_transpile_no_config_with_source_override(
 ) -> None:
     ws, expected_config, do_transpile = mock_cli_transpile_no_config
     cli.transpile(w=ws, transpiler_repository=transpiler_repository, source_dialect="snowflake")
-    expected_config = dataclasses.replace(expected_config, source_dialect="snowflake")
+    expected_config = dataclasses.replace(
+        expected_config,
+        source_dialect="snowflake",
+        transpiler_options={k: v for k, v in expected_config.transpiler_options.items() if k != "target-tech"},
+    )
     do_transpile.assert_called_once_with(
         ws,
         ANY,

From 5a1eb0eadc4a5b5aaf8f182519f9432ec3833bda Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Mon, 6 Oct 2025 18:09:26 +0200
Subject: [PATCH 08/24] Fix some old invalid fixtures for dialect flags that
 include a leading dash ('-').

---
 tests/resources/lsp_transpiler/lsp_config.yml         | 2 +-
 tests/resources/lsp_transpiler/lsp_server.py          | 2 +-
 tests/resources/transpiler_configs/rct/lib/config.yml | 2 +-
 tests/unit/conftest.py                                | 2 +-
 tests/unit/test_install.py                            | 4 ++--
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/resources/lsp_transpiler/lsp_config.yml b/tests/resources/lsp_transpiler/lsp_config.yml
index 2ca7f18c5..3c7c6c8d5 100644
--- a/tests/resources/lsp_transpiler/lsp_config.yml
+++ b/tests/resources/lsp_transpiler/lsp_config.yml
@@ -11,7 +11,7 @@ remorph:
     - --stuff=12
   options:
     all:
-      - flag: "-experimental"
+      - flag: "experimental"
         method: CONFIRM
         prompt: Do you want to use the experimental Databricks generator ?
     custom:
diff --git a/tests/resources/lsp_transpiler/lsp_server.py b/tests/resources/lsp_transpiler/lsp_server.py
index d5e4fbc57..4484f1b95 100644
--- a/tests/resources/lsp_transpiler/lsp_server.py
+++ b/tests/resources/lsp_transpiler/lsp_server.py
@@ -104,7 +104,7 @@ def dialect(self) -> str:
     @property
     def experimental(self) -> str | None:
         options = self.initialization_options.get("options", {}) or {}
-        return options.get("-experimental", None)
+        return options.get("experimental", None)
 
     @property
     def whatever(self) -> str | None:
diff --git a/tests/resources/transpiler_configs/rct/lib/config.yml b/tests/resources/transpiler_configs/rct/lib/config.yml
index 3e01d0a2d..5e0ffae0a 100644
--- a/tests/resources/transpiler_configs/rct/lib/config.yml
+++ b/tests/resources/transpiler_configs/rct/lib/config.yml
@@ -21,6 +21,6 @@ remorph:
       - databricks/labs/remorph/transpiler/server.py
   options:
     all:
-      - flag: "-experimental"
+      - flag: "experimental"
         method: CONFIRM
         prompt: Do you want to use the experimental Databricks generator ?
diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
index 32ed2581a..a4011634e 100644
--- a/tests/unit/conftest.py
+++ b/tests/unit/conftest.py
@@ -39,7 +39,7 @@ def mock_databricks_config() -> Config:
 def transpile_config() -> TranspileConfig:
     return TranspileConfig(
         transpiler_config_path="sqlglot",
-        transpiler_options={"-experimental": True},
+        transpiler_options={"experimental": True},
         source_dialect="snowflake",
         input_source="input_sql",
         output_folder="output_folder",
diff --git a/tests/unit/test_install.py b/tests/unit/test_install.py
index fd9aee169..126dff501 100644
--- a/tests/unit/test_install.py
+++ b/tests/unit/test_install.py
@@ -1062,7 +1062,7 @@ def transpilers_path(self) -> Path:
     expected_config = LakebridgeConfiguration(
         transpile=TranspileConfig(
             transpiler_config_path=PATH_TO_TRANSPILER_CONFIG,
-            transpiler_options={"-experimental": True},
+            transpiler_options={"experimental": True},
             source_dialect="snowflake",
             input_source="/tmp/queries/snow",
             output_folder="/tmp/queries/databricks",
@@ -1077,7 +1077,7 @@
         "config.yml",
         {
             "transpiler_config_path": PATH_TO_TRANSPILER_CONFIG,
-            "transpiler_options": {'-experimental': True},
+            "transpiler_options": {'experimental': True},
             "catalog_name": "remorph_test",
             "input_source": "/tmp/queries/snow",
             "output_folder": "/tmp/queries/databricks",

From 734d46963703ed7d9728d2959ce8815e53cd906a Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Mon, 6 Oct 2025 18:10:53 +0200
Subject: [PATCH 09/24] Internal documentation, to help the next person who
 looks at this.
---
 src/databricks/labs/lakebridge/install.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/databricks/labs/lakebridge/install.py b/src/databricks/labs/lakebridge/install.py
index 7c9edf082..ccd60772a 100644
--- a/src/databricks/labs/lakebridge/install.py
+++ b/src/databricks/labs/lakebridge/install.py
@@ -262,6 +262,11 @@ def _prompt_for_transpiler_options(self, transpiler_name: str, source_dialect: s
         config_options = self._transpiler_repository.transpiler_config_options(transpiler_name, source_dialect)
         if len(config_options) == 0:
             return None
+        # Semantics here are different from the other properties of a TranspileConfig. Specifically:
+        #  - Entries are present for all options.
+        #  - If the value is None, it means the user chose to not provide a value. (This differs from other
+        #    attributes, where None means the user chose to provide a value later.)
+        #  - There is no way to express 'provide a value later'.
         return {option.flag: option.prompt_for_value(self._prompts) for option in config_options}
 
     def _configure_catalog(self) -> str:

From 77e733ff03c0e698e28745c3c92142c57928933f Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Mon, 6 Oct 2025 18:11:55 +0200
Subject: [PATCH 10/24] Split some TODO markers into multiple actions.

---
 src/databricks/labs/lakebridge/cli.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/databricks/labs/lakebridge/cli.py b/src/databricks/labs/lakebridge/cli.py
index 5109711e2..21815e936 100644
--- a/src/databricks/labs/lakebridge/cli.py
+++ b/src/databricks/labs/lakebridge/cli.py
@@ -501,7 +501,8 @@ def _check_transpiler_options(self, engine: TranspileEngine) -> None:
         if not isinstance(transpiler_options, Mapping):
             return
         # Only checks if the option is present, does not validate the value.
-        # TODO: Validate the value for CHOICE/FORCE/CONFIRM options.
+        # TODO: Validate the value for CHOICE/CONFIRM options.
+        # TODO: Handle FORCE options: these are fixed by the transpiler, and cannot be overridden.
         checked_options = {
             option.flag: (
                 transpiler_options[option.flag]

From 070dd38c0a89e86c0fd5765f20b43d33f9a11a Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Mon, 6 Oct 2025 18:13:33 +0200
Subject: [PATCH 11/24] Don't bail out if transpiler_options is not a mapping:
 log a warning and try to continue.

Bailing out will lead to a crash.

---
 src/databricks/labs/lakebridge/cli.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/databricks/labs/lakebridge/cli.py b/src/databricks/labs/lakebridge/cli.py
index 21815e936..8171afcd6 100644
--- a/src/databricks/labs/lakebridge/cli.py
+++ b/src/databricks/labs/lakebridge/cli.py
@@ -498,8 +498,11 @@ def _check_transpiler_options(self, engine: TranspileEngine) -> None:
         assert self._config.source_dialect is not None, "Source dialect must be set before checking transpiler options."
         options_for_dialect = engine.options_for_dialect(self._config.source_dialect)
         transpiler_options = self._config.transpiler_options
-        if not isinstance(transpiler_options, Mapping):
-            return
+        if transpiler_options is None:
+            transpiler_options = {}
+        elif not isinstance(transpiler_options, Mapping):
+            logger.warning(f"Ignoring transpiler_options in config.yml, must be a mapping: {transpiler_options!r}")
+            transpiler_options = {}
         # Only checks if the option is present, does not validate the value.
         # TODO: Validate the value for CHOICE/CONFIRM options.
         # TODO: Handle FORCE options: these are fixed by the transpiler, and cannot be overridden.
From 13cd9ed4576509bcb24424ee5351054e2e7b5c51 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Mon, 6 Oct 2025 18:47:51 +0200
Subject: [PATCH 12/24] Properly detect no answer from the user.

---
 src/databricks/labs/lakebridge/config.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/databricks/labs/lakebridge/config.py b/src/databricks/labs/lakebridge/config.py
index 9b5d0d418..0e2523b61 100644
--- a/src/databricks/labs/lakebridge/config.py
+++ b/src/databricks/labs/lakebridge/config.py
@@ -130,9 +130,10 @@ def prompt_for_value(self, prompts: Prompts) -> JsonValue:
         if self.method == LSPPromptMethod.CONFIRM:
             return prompts.confirm(self.prompt)
         if self.method == LSPPromptMethod.QUESTION:
-            default = self.default if self.default else "None"
+            no_answer = ""
+            default = self.default if self.default else no_answer
             result = prompts.question(self.prompt, default=default)
-            if result == "":
+            if result == no_answer:
                 return None
             return result
         if self.method == LSPPromptMethod.CHOICE:

From ec0f36b9d421ab2e26b093cede5908139066f793 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Mon, 6 Oct 2025 18:49:27 +0200
Subject: [PATCH 13/24] Internal documentation on why `prompts.question()` is
 being used this way.

---
 src/databricks/labs/lakebridge/config.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/databricks/labs/lakebridge/config.py b/src/databricks/labs/lakebridge/config.py
index 0e2523b61..eee22416c 100644
--- a/src/databricks/labs/lakebridge/config.py
+++ b/src/databricks/labs/lakebridge/config.py
@@ -130,6 +130,11 @@ def prompt_for_value(self, prompts: Prompts) -> JsonValue:
         if self.method == LSPPromptMethod.CONFIRM:
             return prompts.confirm(self.prompt)
         if self.method == LSPPromptMethod.QUESTION:
+            # Hack to:
+            #  - trick prompts.question() into indicating that no answer is required;
+            #  - allow no answer to be given.
+            # Normally prompts.question() requires an answer, or returns the default, and the default can't be None.
+            # Note: LSP servers use '' as a default to indicate that no answer is required.
             no_answer = ""
             default = self.default if self.default else no_answer
             result = prompts.question(self.prompt, default=default)

From ae3077cc93bc9ba1bd047bbdde91f0b7b7d7c941 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Mon, 6 Oct 2025 18:51:03 +0200
Subject: [PATCH 14/24] Don't prompt for optional transpiler-specific options:
 they're not needed, and can be provided as a command-line argument.
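
The rule is deliberately small; in effect (a sketch, simplified from the
change below):

    # An option whose declared default is '' is optional: when it is absent
    # from the configuration, record 'no value' (None) instead of prompting.
    def handle_missing_option(option, prompts):
        if option.default == "":
            return None
        return option.prompt_for_value(prompts)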
---
 src/databricks/labs/lakebridge/cli.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/databricks/labs/lakebridge/cli.py b/src/databricks/labs/lakebridge/cli.py
index 8171afcd6..9521e844d 100644
--- a/src/databricks/labs/lakebridge/cli.py
+++ b/src/databricks/labs/lakebridge/cli.py
@@ -16,7 +16,7 @@
 from databricks.labs.blueprint.cli import App
 from databricks.labs.blueprint.entrypoint import get_logger, is_in_debug
-from databricks.labs.blueprint.installation import RootJsonValue, JsonObject
+from databricks.labs.blueprint.installation import RootJsonValue, JsonObject, JsonValue
 from databricks.labs.blueprint.tui import Prompts
 
@@ -25,7 +25,7 @@
     PROFILER_SOURCE_SYSTEM,
 )
 
-from databricks.labs.lakebridge.config import TranspileConfig
+from databricks.labs.lakebridge.config import TranspileConfig, LSPConfigOptionV1
 from databricks.labs.lakebridge.contexts.application import ApplicationContext
 from databricks.labs.lakebridge.helpers.recon_config_utils import ReconConfigPrompts
 from databricks.labs.lakebridge.helpers.telemetry_utils import make_alphanum_or_semver
@@ -510,12 +510,17 @@ def _check_transpiler_options(self, engine: TranspileEngine) -> None:
             option.flag: (
                 transpiler_options[option.flag]
                 if option.flag in transpiler_options
-                else option.prompt_for_value(self._prompts)
+                else self._handle_missing_transpiler_option(option)
             )
             for option in options_for_dialect
         }
         self._config = dataclasses.replace(self._config, transpiler_options=checked_options)
 
+    def _handle_missing_transpiler_option(self, option: LSPConfigOptionV1) -> JsonValue:
+        if option.default == "":
+            return None
+        return option.prompt_for_value(self._prompts)
+
     def check(self) -> tuple[TranspileConfig, TranspileEngine]:
         """Checks that all configuration parameters are present and valid."""
         logger.debug(f"Checking config: {self._config!r}")

From 79e9d4032457275ef1eaa494730735c2773d94e7 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Mon, 6 Oct 2025 18:52:06 +0200
Subject: [PATCH 15/24] Internal documentation for how the transpiler-specific
 options are configured and work.

---
 src/databricks/labs/lakebridge/cli.py | 23 +++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/src/databricks/labs/lakebridge/cli.py b/src/databricks/labs/lakebridge/cli.py
index 9521e844d..49781f66a 100644
--- a/src/databricks/labs/lakebridge/cli.py
+++ b/src/databricks/labs/lakebridge/cli.py
@@ -517,6 +517,29 @@ def _check_transpiler_options(self, engine: TranspileEngine) -> None:
         self._config = dataclasses.replace(self._config, transpiler_options=checked_options)
 
     def _handle_missing_transpiler_option(self, option: LSPConfigOptionV1) -> JsonValue:
+        # Semantics during configuration:
+        #  - Entries are present in the config file for all options the LSP server needs for a dialect.
+        #  - If a value is `None`, it means the user wants the value to be left unset.
+        #  - There is no 'provide it later' option: either it's set, or it's unset.
+        # As a corner case, if there is no entry present it means the user wasn't prompted. Here we have
+        # some complexity. We have two ways of obtaining a value:
+        #  - The user could provide it on the command-line, using --target-technology or --overrides-file.
+        #    Problem: via command-line options there's no way to indicate 'no value'.
+        #  - We could prompt for it, assuming the user is running interactively.
+        # In terms of what is required by the option:
+        #  - If the option has a default of '', it means that no value is required.
+        #  - Everything else requires a value.
+        #
+        # This leads to the following business rules:
+        #  - If the option has a default of '' (meaning that no value is required), no further action is required.
+        #  - Otherwise, a value is required: prompt for it.
+        #
+        # TODO: When adding non-interactive support, the otherwise branch needs to be modified:
+        #   1. If it can be provided by the command-line, fail and ask the user to provide it.
+        #   2. If it cannot be provided by the command-line, prompt for it if we are running interactively.
+        #   3. If we cannot prompt because we are not running interactively, use the default if there is one.
+        #   4. Fail: the only way to provide a value is via the config.yml, which can be set via 'install-transpile'.
         if option.default == "":
             return None
         return option.prompt_for_value(self._prompts)

From 96b1925f704a44203519827a4461b954877946f0 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Mon, 6 Oct 2025 18:56:54 +0200
Subject: [PATCH 16/24] Simplify function.

---
 tests/integration/transpile/common_utils.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/tests/integration/transpile/common_utils.py b/tests/integration/transpile/common_utils.py
index da73128be..5e052bae7 100644
--- a/tests/integration/transpile/common_utils.py
+++ b/tests/integration/transpile/common_utils.py
@@ -4,14 +4,10 @@
 
 
 def assert_sql_outputs(output_folder: Path, expected_sql: str, expected_failure_sql: str) -> None:
-    assert (output_folder / "create_ddl.sql").exists()
-    with open(output_folder / "create_ddl.sql", "r", encoding="utf-8") as f:
-        actual_sql = f.read()
-    assert actual_sql.strip() == expected_sql.strip()
+    actual_sql = (output_folder / "create_ddl.sql").read_text(encoding="utf-8")
+    actual_failure_sql = (output_folder / "dummy_function.sql").read_text(encoding="utf-8")
 
-    assert (output_folder / "dummy_function.sql").exists()
-    with open(output_folder / "dummy_function.sql", "r", encoding="utf-8") as f:
-        actual_failure_sql = f.read()
+    assert actual_sql.strip() == expected_sql.strip()
     assert actual_failure_sql.strip() == expected_failure_sql.strip()

From a95f4c21d8cc87e0b40de9b1a980c29d42aa3c80 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Mon, 6 Oct 2025 18:57:50 +0200
Subject: [PATCH 17/24] Integration tests that cover the overrides-file and
 target-technology command-line options for BB.
---
 .../integration/transpile/test_bladebridge.py | 119 +++++++++++++++++-
 1 file changed, 117 insertions(+), 2 deletions(-)

diff --git a/tests/integration/transpile/test_bladebridge.py b/tests/integration/transpile/test_bladebridge.py
index 6629872f4..67600c089 100644
--- a/tests/integration/transpile/test_bladebridge.py
+++ b/tests/integration/transpile/test_bladebridge.py
@@ -6,6 +6,7 @@
 import pytest
 from databricks.labs.blueprint.wheels import ProductInfo
+from databricks.labs.blueprint.paths import WorkspacePath
 from databricks.sdk import WorkspaceClient
 
 from databricks.labs.lakebridge import cli
@@ -41,7 +42,8 @@ def application_ctx(ws: WorkspaceClient) -> Generator[ApplicationContext, None,
     """A mock application context with a unique installation path, cleaned up after the test."""
     ctx = MockApplicationContext(ws)
     yield ctx
-    ctx.installation.remove()
+    if WorkspacePath(ws, ctx.installation.install_folder()).exists():
+        ctx.installation.remove()
 
 
 def test_transpiles_informatica_to_sparksql(
@@ -98,6 +100,71 @@ def test_transpiles_informatica_to_sparksql(
     assert not errors_path.exists()
 
 
+@pytest.mark.parametrize("provide_overrides", [True, False])
+def test_transpiles_informatica_to_sparksql_non_interactive(
+    provide_overrides: bool,
+    application_ctx: ApplicationContext,
+    repository_with_bladebridge: TranspilerRepository,
+    tmp_path: Path,
+    capsys,
+) -> None:
+    """Check that 'transpile' can non-interactively convert an Informatica (ETL) mapping to SparkSQL using Bladebridge."""
+    # Prepare the application context as if it were non-interactive (no config.yml file).
+    config_path = repository_with_bladebridge.transpiler_config_path("Bladebridge")
+    input_source = Path(__file__).parent.parent.parent / "resources" / "functional" / "informatica"
+    output_folder = tmp_path / "output"
+    output_folder.mkdir(parents=True, exist_ok=True)
+    errors_path = output_folder / "errors.log"
+    kwargs: dict[str, str] = {}
+    if provide_overrides:
+        # This is horrible but we need it for the minimum valid overrides file that will work with Informatica/SparkSQL.
+        transpilers_path = repository_with_bladebridge.transpilers_path()
+        overrides_base = next(transpilers_path.glob("**/base_infapc2databricks_sparksql.json"))
+        overrides_file = tmp_path / "overrides.json"
+        overrides_file.write_text(json.dumps({"inherit_from": [str(overrides_base.absolute())]}), encoding="utf-8")
+        kwargs["overrides_file"] = str(overrides_file)
+
+    # Run the conversion: everything has to be passed as parameters.
+    cli.transpile(
+        w=application_ctx.workspace_client,
+        transpiler_config_path=str(config_path),
+        source_dialect="informatica (desktop edition)",
+        target_technology="SPARKSQL",
+        input_source=str(input_source),
+        output_folder=str(output_folder),
+        error_file_path=str(errors_path),
+        ctx=application_ctx,
+        transpiler_repository=repository_with_bladebridge,
+        **kwargs,
+    )
+    (out, _) = capsys.readouterr()
+
+    _check_transpile_informatica_to_sparksql(out, output_folder, errors_path)
+
+
+def _check_transpile_informatica_to_sparksql(stdout: str, output_folder: Path, errors_path: Path) -> None:
+    # Check the conversion summary.
+    summary = json.loads(stdout)
+    assert summary == [
+        {
+            "total_files_processed": 1,
+            "total_queries_processed": 1,
+            "analysis_error_count": 0,
+            "parsing_error_count": 0,
+            "validation_error_count": 0,
+            "generation_error_count": 0,
+            "error_log_file": None,
+        }
+    ]
+
+    # Check the conversion by merely looking for the files we expect from our reference Informatica mapping.
+    assert (output_folder / "m_employees_load.py").exists()
+    assert (output_folder / "wf_m_employees_load.json").exists()
+    assert (output_folder / "wf_m_employees_load_params.py").exists()
+    # No errors should have been logged, which means the errors file should not exist.
+    assert not errors_path.exists()
+
+
 def test_transpile_teradata_sql(
     application_ctx: ApplicationContext,
     repository_with_bladebridge: TranspilerRepository,
@@ -128,8 +195,56 @@ def test_transpile_teradata_sql(
     cli.transpile(w=application_ctx.workspace_client, ctx=application_ctx)
     (out, _) = capsys.readouterr()
 
+    _check_transpile_teradata_sql(out, output_folder, errors_path)
+
+
+@pytest.mark.parametrize("provide_overrides", [True, False])
+def test_transpile_teradata_sql_non_interactive(
+    provide_overrides: bool,
+    application_ctx: ApplicationContext,
+    repository_with_bladebridge: TranspilerRepository,
+    tmp_path: Path,
+    capsys,
+) -> None:
+    """Check that 'transpile' can non-interactively convert Teradata (SQL) to DBSQL using Bladebridge, and then validate the output."""
+    # Prepare the application context as if it were non-interactive (no config.yml file).
+    config_path = repository_with_bladebridge.transpiler_config_path("Bladebridge")
+    input_source = Path(__file__).parent.parent.parent / "resources" / "functional" / "teradata" / "integration"
+    output_folder = tmp_path / "output"
+    output_folder.mkdir(parents=True, exist_ok=True)
+    errors_path = output_folder / "errors.log"
+    kwargs: dict[str, str] = {}
+    if provide_overrides:
+        # This is horrible but we need it for the minimum valid overrides file that will work with Teradata.
+        transpilers_path = repository_with_bladebridge.transpilers_path()
+        overrides_base = next(transpilers_path.glob("**/base_teradata2databricks_sql.json"))
+        overrides_file = tmp_path / "overrides.json"
+        overrides_file.write_text(json.dumps({"inherit_from": [str(overrides_base.absolute())]}), encoding="utf-8")
+        kwargs["overrides_file"] = str(overrides_file)
+
+    # Run the conversion: everything has to be passed as parameters.
+    cli.transpile(
+        w=application_ctx.workspace_client,
+        transpiler_config_path=str(config_path),
+        source_dialect="teradata",
+        input_source=str(input_source),
+        output_folder=str(output_folder),
+        error_file_path=str(errors_path),
+        skip_validation="false",
+        catalog_name="catalog",
+        schema_name="schema",
+        ctx=application_ctx,
+        transpiler_repository=repository_with_bladebridge,
+        **kwargs,
+    )
+    (out, _) = capsys.readouterr()
+
+    _check_transpile_teradata_sql(out, output_folder, errors_path)
+
+
+def _check_transpile_teradata_sql(stdout: str, output_folder: Path, errors_path: Path) -> None:
     # Check the conversion summary.
-    summary = json.loads(out)
+    summary = json.loads(stdout)
     assert summary == [
         {
             "total_files_processed": 2,

From 10ba183a67fe8a86207f7edc2152c38f74813f50 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Mon, 6 Oct 2025 19:09:59 +0200
Subject: [PATCH 18/24] Update documentation to include the new options.
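
Taken together, the new flags are used like this (an illustrative invocation:
the paths are placeholders, and both flags only take effect when the selected
transpiler supports them):

    databricks labs lakebridge transpile \
      --source-dialect "informatica (desktop edition)" \
      --target-technology SPARKSQL \
      --overrides-file ./overrides.json \
      --input-source ./informatica_mappings \
      --output-folder ./transpiled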
---
 docs/lakebridge/docs/installation.mdx    | 32 +++++++++++++++++++++---
 docs/lakebridge/docs/transpile/index.mdx | 32 +++++++++++++++++++++---
 2 files changed, 57 insertions(+), 7 deletions(-)

diff --git a/docs/lakebridge/docs/installation.mdx b/docs/lakebridge/docs/installation.mdx
index c8a8a2250..9a80f410d 100644
--- a/docs/lakebridge/docs/installation.mdx
+++ b/docs/lakebridge/docs/installation.mdx
@@ -127,11 +127,35 @@ Specify the config file to override the default[Bladebridge] config - press
+databricks labs lakebridge transpile --help
+```
+```console
+Transpile SQL script to Databricks SQL
+
+Usage:
+  databricks labs lakebridge transpile [flags]
+
+Flags:
+      --catalog-name string             Catalog Name Applicable only when Validation Mode is DATABRICKS
+      --error-file-path /errors.log     Output Location For Storing Errors, defaults to (Current Working Directory) /errors.log
+  -h, --help                            help for transpile
+      --input-source string             Input Script Folder or File
+      --output-folder /transpiled       Output Location For Storing Transpiled Code, defaults to (Current Working Directory) /transpiled folder
+      --overrides-file string           Path to a file containing transpiler overrides, if supported by the transpiler in use.
+      --schema-name string              Schema Name Applicable only when Validation Mode is DATABRICKS
+      --skip-validation string          Validate Transpiled Code, default True validation skipped, False validate (default "true")
+      --source-dialect install-transpile   Dialect name as selected during install-transpile or refer to documentation
+      --target-technology string        Target technology to use for code generation, if supported by the transpiler in use.
+      --transpiler-config-path string   Path to the transpiler configuration file [Pluggable Transpilers] eg:- Morpheus/Bladebridge
+
+Global Flags:
+      --debug            enable debug logging
+  -o, --output type      output type: text or json (default text)
+  -p, --profile string   ~/.databrickscfg profile
+  -t, --target string    bundle target to use (if applicable)
+```
 
 [[back to top](#table-of-contents)]
 
diff --git a/docs/lakebridge/docs/transpile/index.mdx b/docs/lakebridge/docs/transpile/index.mdx
index be6f4eb9e..011b8710a 100644
--- a/docs/lakebridge/docs/transpile/index.mdx
+++ b/docs/lakebridge/docs/transpile/index.mdx
@@ -10,9 +10,33 @@ import useBaseUrl from '@docusaurus/useBaseUrl';
 ## Verify Installation
 Verify the successful installation by executing the provided command; confirmation of a successful installation is indicated when the displayed output aligns with the example screenshot provided:
 ```bash
-   databricks labs lakebridge transpile --help
-   ```
-transpile-help
+databricks labs lakebridge transpile --help
+```
+```console
+Transpile SQL script to Databricks SQL
+
+Usage:
+  databricks labs lakebridge transpile [flags]
+
+Flags:
+      --catalog-name string             Catalog Name Applicable only when Validation Mode is DATABRICKS
+      --error-file-path /errors.log     Output Location For Storing Errors, defaults to (Current Working Directory) /errors.log
+  -h, --help                            help for transpile
+      --input-source string             Input Script Folder or File
+      --output-folder /transpiled       Output Location For Storing Transpiled Code, defaults to (Current Working Directory) /transpiled folder
+      --overrides-file string           Path to a file containing transpiler overrides, if supported by the transpiler in use.
+ --schema-name string Schema Name Applicable only when Validation Mode is DATABRICKS + --skip-validation string Validate Transpiled Code, default True validation skipped, False validate (default "true") + --source-dialect install-transpile Dialect name as selected during install-transpile or refer to documentation + --target-technology string Target technology to use for code generation, if supported by the transpiler in use. + --transpiler-config-path string Path to the transpiler configuration file [Pluggable Transpilers] eg:- Morpheus/Bladebridge + +Global Flags: + --debug enable debug logging + -o, --output type output type: text or json (default text) + -p, --profile string ~/.databrickscfg profile + -t, --target string bundle target to use (if applicable) +``` ## Execution Pre-Set Up When you run `install-transpile`, you will be prompted to enter all the required elements to transpile your code. You can @@ -24,6 +48,8 @@ The `transpile` command will trigger the conversion of the specified code, these - `output-folder [Optional]` - The path to the output folder where the transpiled SQL files will be stored. If not specified, the transpiled SQL files will be stored in a folder called `transpiled` in your current working directory. - `source-dialect [Optional]` - Dialect name (ex: snowflake, oracle, datastage, etc). If not specified, refers to the Source Dialect selected at installation time. +- `overrides-file [Optional]` - An optional path to a JSON file containing custom overrides for the transpilation process, if the underlying transpiler supports this. (Refer to [this documentation](pluggable_transpilers/bladebridge_configuration) for more details on custom overrides.) +- `target-technology [Optional]` - The target technology to use for conversion output, if the underlying transpiler supports this. - `error-file-path [Optional]` - The path to the file where the transpile errors will be stored. If not specified, the errors will be stored in a file called `errors.log` in your current working directory. - `skip-validation [Optional]` - The default value is True. If set to False, the transpiler will validate the transpiled SQL scripts against the Databricks catalog and schema provided by user. From 5d50103711143d0b74d5cf90900330b73ac2767a Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 7 Oct 2025 19:39:05 +0200 Subject: [PATCH 19/24] Capture and log the content of errors.log during transpiler tests. --- .../integration/transpile/test_bladebridge.py | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/tests/integration/transpile/test_bladebridge.py b/tests/integration/transpile/test_bladebridge.py index 67600c089..38916c4c5 100644 --- a/tests/integration/transpile/test_bladebridge.py +++ b/tests/integration/transpile/test_bladebridge.py @@ -46,9 +46,25 @@ def application_ctx(ws: WorkspaceClient) -> Generator[ApplicationContext, None, ctx.installation.remove() +@pytest.fixture(name="errors_path") +def capture_errors_log(tmp_path: Path) -> Generator[Path, None, None]: + """The path to an errors log file. 
If it exists after the test, its content will be logged to help with debugging.""" + path = tmp_path / "errors.log" + yield path + try: + with open(path, encoding="utf-8", errors="replace") as f: + errors_logged = list(f) + except OSError: + logger.debug("No errors log found.") + else: + for line in errors_logged: + logger.error(f"Error logged: {line.strip()}") + + def test_transpiles_informatica_to_sparksql( application_ctx: ApplicationContext, repository_with_bladebridge: TranspilerRepository, + errors_path: Path, tmp_path: Path, capsys, ) -> None: @@ -58,7 +74,6 @@ def test_transpiles_informatica_to_sparksql( input_source = Path(__file__).parent.parent.parent / "resources" / "functional" / "informatica" output_folder = tmp_path / "output" output_folder.mkdir(parents=True, exist_ok=True) - errors_path = output_folder / "errors.log" transpile_config = TranspileConfig( transpiler_config_path=str(config_path), source_dialect="informatica (desktop edition)", @@ -105,6 +120,7 @@ def test_transpiles_informatica_to_sparksql_non_interactive( provide_overrides: bool, application_ctx: ApplicationContext, repository_with_bladebridge: TranspilerRepository, + errors_path: Path, tmp_path: Path, capsys, ) -> None: @@ -114,7 +130,6 @@ def test_transpiles_informatica_to_sparksql_non_interactive( input_source = Path(__file__).parent.parent.parent / "resources" / "functional" / "informatica" output_folder = tmp_path / "output" output_folder.mkdir(parents=True, exist_ok=True) - errors_path = output_folder / "errors.log" kwargs: dict[str, str] = {} if provide_overrides: # This is horrible but we need it for the minimum valid overrides file that will work with Informatica/SparkSQL. @@ -168,6 +183,7 @@ def _check_transpile_informatica_to_sparksql(stdout: str, output_folder: Path, e def test_transpile_teradata_sql( application_ctx: ApplicationContext, repository_with_bladebridge: TranspilerRepository, + errors_path: Path, tmp_path: Path, capsys, ) -> None: @@ -177,7 +193,6 @@ def test_transpile_teradata_sql( input_source = Path(__file__).parent.parent.parent / "resources" / "functional" / "teradata" / "integration" output_folder = tmp_path / "output" output_folder.mkdir(parents=True, exist_ok=True) - errors_path = output_folder / "errors.log" transpile_config = TranspileConfig( transpiler_config_path=str(config_path), source_dialect="teradata", @@ -203,6 +218,7 @@ def test_transpile_teradata_sql_non_interactive( provide_overrides: bool, application_ctx: ApplicationContext, repository_with_bladebridge: TranspilerRepository, + errors_path: Path, tmp_path: Path, capsys, ) -> None: @@ -212,7 +228,6 @@ def test_transpile_teradata_sql_non_interactive( input_source = Path(__file__).parent.parent.parent / "resources" / "functional" / "teradata" / "integration" output_folder = tmp_path / "output" output_folder.mkdir(parents=True, exist_ok=True) - errors_path = output_folder / "errors.log" kwargs: dict[str, str] = {} if provide_overrides: # This is horrible but we need it for the minimum valid overrides file that will work with Teradata. From a60088568724c1ed39259f8d737f75aa1f18ac1d Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 7 Oct 2025 19:39:50 +0200 Subject: [PATCH 20/24] Capture the logs from the Bladebridge LSP server during transpile tests. This is intended to help with debugging. 
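The errors-log fixture above and this commit rely on the same pytest idiom: code after `yield` in a yield-fixture runs as teardown once the test body has finished, so anything the test wrote can be replayed into the captured test output. A minimal sketch of that pattern; the `replayed_log` name is illustrative and not part of the patch:

```python
import logging
from collections.abc import Generator
from pathlib import Path

import pytest

logger = logging.getLogger(__name__)


@pytest.fixture
def replayed_log(tmp_path: Path) -> Generator[Path, None, None]:
    """Yield a log path; after the test, replay whatever was written to it."""
    path = tmp_path / "replayed.log"
    yield path  # the test body runs here and may or may not create the file
    if path.exists():
        for line in path.read_text(encoding="utf-8", errors="replace").splitlines():
            logger.error("Replayed: %s", line)
```

Teardown code in a yield-fixture runs whether the test passed or failed, which is what makes the pattern useful for post-mortem debugging.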
--- .../integration/transpile/test_bladebridge.py | 97 +++++++++++++------ 1 file changed, 65 insertions(+), 32 deletions(-) diff --git a/tests/integration/transpile/test_bladebridge.py b/tests/integration/transpile/test_bladebridge.py index 38916c4c5..a06b9e31d 100644 --- a/tests/integration/transpile/test_bladebridge.py +++ b/tests/integration/transpile/test_bladebridge.py @@ -1,3 +1,4 @@ +import contextlib import json import logging from collections.abc import Generator @@ -29,6 +30,34 @@ def repository_with_bladebridge(tmp_path_factory) -> TranspilerRepository: return transpiler_repository +@contextlib.contextmanager +def capture_bladebridge_logs(transpiler_repository: TranspilerRepository, *, level: int = logging.DEBUG) -> Generator[None, None, None]: + """Reset the logs from Bladebridge before yielding, and capture them afterward, to help with test debugging.""" + # TODO: Move this into the core? + # - Extend the LSP config.yml to describe where error logs go. + # - If the LSP server fails, capture the error logs automatically. + + # Step 1: Remove any existing log files, so we know that anything afterward is fresh. + bladebridge_lib_dir = transpiler_repository.transpilers_path() / "bladebridge" / "lib" + for log_file in bladebridge_lib_dir.glob("*.log"): + logger.debug(f"Removing existing log file: {log_file}") + log_file.unlink(missing_ok=True) + + # Step 2: Yield to the caller, who will presumably run some Bladebridge operations. + yield + + # Step 3: Capture any logs that were produced, to help with debugging if the test failed. + produced_log_files = list(bladebridge_lib_dir.glob("*.log")) + logger.debug(f"Captured {len(produced_log_files)} log file(s): {produced_log_files}") + if not logger.isEnabledFor(level): + return + for log_file in produced_log_files: + logger.log(level, f"============ Bladebridge log: {log_file.name} starting... ==================") + for line in log_file.open(encoding="utf-8", errors="replace"): + logger.log(level, f"{log_file.name}: {line.strip()}") + logger.log(level, f"============ Bladebridge log: {log_file.name} finished. ====================") + + class MockApplicationContext(ApplicationContext): """A mock application context that uses a unique installation path.""" @@ -86,11 +115,12 @@ def test_transpiles_informatica_to_sparksql( application_ctx.installation.save(transpile_config) # Run the conversion. - cli.transpile( - w=application_ctx.workspace_client, - ctx=application_ctx, - transpiler_repository=repository_with_bladebridge, - ) + with capture_bladebridge_logs(repository_with_bladebridge): + cli.transpile( + w=application_ctx.workspace_client, + ctx=application_ctx, + transpiler_repository=repository_with_bladebridge, + ) (out, _) = capsys.readouterr() # Check the conversion summary. @@ -140,18 +170,19 @@ def test_transpiles_informatica_to_sparksql_non_interactive( kwargs["overrides_file"] = str(overrides_file) # Run the conversion: everything has to be passed as parameters. 
- cli.transpile( - w=application_ctx.workspace_client, - transpiler_config_path=str(config_path), - source_dialect="informatica (desktop edition)", - target_technology="SPARKSQL", - input_source=str(input_source), - output_folder=str(output_folder), - error_file_path=str(errors_path), - ctx=application_ctx, - transpiler_repository=repository_with_bladebridge, - **kwargs, - ) + with capture_bladebridge_logs(repository_with_bladebridge): + cli.transpile( + w=application_ctx.workspace_client, + transpiler_config_path=str(config_path), + source_dialect="informatica (desktop edition)", + target_technology="SPARKSQL", + input_source=str(input_source), + output_folder=str(output_folder), + error_file_path=str(errors_path), + ctx=application_ctx, + transpiler_repository=repository_with_bladebridge, + **kwargs, + ) (out, _) = capsys.readouterr() _check_transpile_informatica_to_sparksql(out, output_folder, errors_path) @@ -207,7 +238,8 @@ def test_transpile_teradata_sql( application_ctx.installation.save(transpile_config) # Run the conversion. - cli.transpile(w=application_ctx.workspace_client, ctx=application_ctx) + with capture_bladebridge_logs(repository_with_bladebridge): + cli.transpile(w=application_ctx.workspace_client, ctx=application_ctx) (out, _) = capsys.readouterr() _check_transpile_teradata_sql(out, output_folder, errors_path) @@ -238,20 +270,21 @@ def test_transpile_teradata_sql_non_interactive( kwargs["overrides_file"] = str(overrides_file) # Run the conversion: everything has to be passed as parameters. - cli.transpile( - w=application_ctx.workspace_client, - transpiler_config_path=str(config_path), - source_dialect="teradata", - input_source=str(input_source), - output_folder=str(output_folder), - error_file_path=str(errors_path), - skip_validation="false", - catalog_name="catalog", - schema_name="schema", - ctx=application_ctx, - transpiler_repository=repository_with_bladebridge, - **kwargs, - ) + with capture_bladebridge_logs(repository_with_bladebridge): + cli.transpile( + w=application_ctx.workspace_client, + transpiler_config_path=str(config_path), + source_dialect="teradata", + input_source=str(input_source), + output_folder=str(output_folder), + error_file_path=str(errors_path), + skip_validation="false", + catalog_name="catalog", + schema_name="schema", + ctx=application_ctx, + transpiler_repository=repository_with_bladebridge, + **kwargs, + ) (out, _) = capsys.readouterr() _check_transpile_teradata_sql(out, output_folder, errors_path) From b79a94a556981104020284ba93a51ae855df6f31 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 7 Oct 2025 20:03:59 +0200 Subject: [PATCH 21/24] Formatting. 
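One caveat about `capture_bladebridge_logs` as introduced in the previous commit, before the formatting tweak below: the capture step only runs when the wrapped body returns normally, because the code after the bare `yield` is skipped if `cli.transpile(...)` raises. A `try`/`finally` variant would also capture logs on failure; this is a sketch of that idea, not part of the patch, and `capture_logs_even_on_failure` is a hypothetical name:

```python
import contextlib
import logging
from collections.abc import Generator
from pathlib import Path

logger = logging.getLogger(__name__)


@contextlib.contextmanager
def capture_logs_even_on_failure(lib_dir: Path) -> Generator[None, None, None]:
    """Clear stale *.log files, run the body, then replay logs even on error."""
    for stale in lib_dir.glob("*.log"):
        stale.unlink(missing_ok=True)
    try:
        yield
    finally:
        # Runs on success and failure alike; failures are when the logs matter most.
        for produced in sorted(lib_dir.glob("*.log")):
            for line in produced.read_text(encoding="utf-8", errors="replace").splitlines():
                logger.error("%s: %s", produced.name, line)
```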
--- tests/integration/transpile/test_bladebridge.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/integration/transpile/test_bladebridge.py b/tests/integration/transpile/test_bladebridge.py index a06b9e31d..c3ecffdd0 100644 --- a/tests/integration/transpile/test_bladebridge.py +++ b/tests/integration/transpile/test_bladebridge.py @@ -31,7 +31,11 @@ def repository_with_bladebridge(tmp_path_factory) -> TranspilerRepository: @contextlib.contextmanager -def capture_bladebridge_logs(transpiler_repository: TranspilerRepository, *, level: int = logging.DEBUG) -> Generator[None, None, None]: +def capture_bladebridge_logs( + transpiler_repository: TranspilerRepository, + *, + level: int = logging.DEBUG, +) -> Generator[None, None, None]: """Reset the logs from Bladebridge before yielding, and capture them afterward, to help with test debugging.""" # TODO: Move this into the core? # - Extend the LSP config.yml to describe where error logs go. From a65907f55d46d06867d02b964826e61e5b8ca774 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 8 Oct 2025 15:06:34 +0200 Subject: [PATCH 22/24] Encapsulate the idea of an option being optional within the option, rather than having the CLI check directly for the sentinel. --- src/databricks/labs/lakebridge/cli.py | 2 +- src/databricks/labs/lakebridge/config.py | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/lakebridge/cli.py b/src/databricks/labs/lakebridge/cli.py index 0b5b987b4..62290e8cc 100644 --- a/src/databricks/labs/lakebridge/cli.py +++ b/src/databricks/labs/lakebridge/cli.py @@ -536,7 +536,7 @@ def _handle_missing_transpiler_option(self, option: LSPConfigOptionV1) -> JsonVa # 3. If we cannot prompt because we are not running interactively, use the default if there is one. # 4. Fail: the only way to provide a value is via the config.yml, which can be set via 'install-transpile'. - if option.default == "": + if option.is_optional(): return None return option.prompt_for_value(self._prompts) diff --git a/src/databricks/labs/lakebridge/config.py b/src/databricks/labs/lakebridge/config.py index eee22416c..56e502262 100644 --- a/src/databricks/labs/lakebridge/config.py +++ b/src/databricks/labs/lakebridge/config.py @@ -123,6 +123,15 @@ def parse(cls, data: JsonValue) -> "LSPConfigOptionV1": return LSPConfigOptionV1(flag, method, prompt, **optional) + def is_optional(self) -> bool: + # Semantics are currently that a value for an option is always required, except in the specific case of: + # - It being a QUESTION; AND + # - The default is set to the special "" value. + return self.method == LSPPromptMethod.QUESTION and self.default == self._question_optional_sentinel + + # Magic value that indicates no answer is required for a QUESTION prompt. + _question_optional_sentinel = "" + def prompt_for_value(self, prompts: Prompts) -> JsonValue: if self.method == LSPPromptMethod.FORCE: return self.default @@ -135,10 +144,9 @@ def prompt_for_value(self, prompts: Prompts) -> JsonValue: # - allow no answer to be given. # Normally prompts.confirm() requires an answer, or returns the default, and the default can't be None. # Note: LSP servers use '' as a default to indicate that no answer is required. 
-            no_answer = ""
-            default = self.default if self.default else no_answer
+            default = self.default if self.default else self._question_optional_sentinel
             result = prompts.question(self.prompt, default=default)
-            if result == no_answer:
+            if result == self._question_optional_sentinel:
                 return None
             return result
         if self.method == LSPPromptMethod.CHOICE:

From ff56327ecdd4f9ccdcd17010769ee24034d8fa02 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Wed, 8 Oct 2025 17:55:39 +0200
Subject: [PATCH 23/24] Revert a change made earlier in this PR; another PR will fix this anyway.

---
 labs.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/labs.yml b/labs.yml
index 9b56d9ad0..788b3d264 100644
--- a/labs.yml
+++ b/labs.yml
@@ -43,7 +43,7 @@ commands:
         default: null
       - name: skip-validation
         description: Validate Transpiled Code, default True validation skipped, False validate
-        default: "true"
+        default: true
      - name: catalog-name
        description: Catalog Name Applicable only when Validation Mode is DATABRICKS
        default: null

From b6e7793b4b59b1c47272c8b04ca3c0fefb02be0e Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Wed, 8 Oct 2025 18:01:40 +0200
Subject: [PATCH 24/24] Update the `labs.yml` description of the new arguments.

This matches the style from #2089.
---
 labs.yml | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/labs.yml b/labs.yml
index 788b3d264..579ca0971 100644
--- a/labs.yml
+++ b/labs.yml
@@ -27,11 +27,9 @@ commands:
         description: Dialect name as selected during `install-transpile` or refer to documentation
         default: null
       - name: overrides-file
-        description: Path to a file containing transpiler overrides, if supported by the transpiler in use.
-        default: null
+        description: (Optional) Local `path` of a file containing transpiler overrides, if supported by the transpiler in use
       - name: target-technology
-        description: Target technology to use for code generation, if supported by the transpiler in use.
-        default: null
+        description: (Optional) Target technology to use for code generation, if supported by the transpiler in use
       - name: input-source
         description: Input Script Folder or File
         default: null
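Taken together, the series threads `--overrides-file` and `--target-technology` from `labs.yml` through `cli.transpile` and into the transpiler options. For reference, the non-interactive call shape exercised by the integration tests above condenses to roughly the following sketch; `w`, `ctx`, and all paths are caller-supplied placeholders rather than values from the patch:

```python
from pathlib import Path

from databricks.labs.lakebridge import cli


def run_transpile(w, ctx, transpiler_config_path: str, work_dir: Path) -> None:
    """Condensed from the tests: w is a WorkspaceClient, ctx an ApplicationContext."""
    cli.transpile(
        w=w,
        ctx=ctx,
        transpiler_config_path=transpiler_config_path,
        source_dialect="informatica (desktop edition)",
        target_technology="SPARKSQL",
        overrides_file=str(work_dir / "overrides.json"),  # assumed to exist on disk
        input_source=str(work_dir / "informatica"),
        output_folder=str(work_dir / "transpiled"),
        error_file_path=str(work_dir / "errors.log"),
    )
```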