From 23f0fd95a8caa1eb8385f601703340d29f75d0d7 Mon Sep 17 00:00:00 2001 From: Julian Alves <28436330+donotpush@users.noreply.github.com> Date: Tue, 3 Dec 2024 00:45:25 +0100 Subject: [PATCH 01/32] chore: save oauth progress --- notes/oauth.py | 25 +++++++++++++++ .../load/pipeline/test_databricks_pipeline.py | 32 +++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 notes/oauth.py diff --git a/notes/oauth.py b/notes/oauth.py new file mode 100644 index 0000000000..8524a4ac6a --- /dev/null +++ b/notes/oauth.py @@ -0,0 +1,25 @@ +import os +import toml +from databricks import sql +from databricks.sdk.core import Config, oauth_service_principal + +secrets = toml.load("../tests/.dlt/secrets.toml") +dd = secrets["destination"]["databricks"]["credentials"] + +def credential_provider(): + config = Config( + host = f"https://{dd['server_hostname']}", + client_id = dd["client_id"], + client_secret = dd["client_secret"] + ) + return oauth_service_principal(config) + +with sql.connect(server_hostname = dd["server_hostname"], + http_path = dd["http_path"], + credentials_provider = credential_provider, + auth_type = "databricks-oauth") as conn_oauth: + cursor = conn_oauth.cursor() + cursor.execute('select * from samples.nyctaxi.trips limit 5') + result = cursor.fetchall() + for row in result: + print(row) diff --git a/tests/load/pipeline/test_databricks_pipeline.py b/tests/load/pipeline/test_databricks_pipeline.py index e802cde693..a722b8a42a 100644 --- a/tests/load/pipeline/test_databricks_pipeline.py +++ b/tests/load/pipeline/test_databricks_pipeline.py @@ -145,3 +145,35 @@ def test_databricks_gcs_external_location(destination_config: DestinationTestCon assert ( "credential_x" in pipeline.list_failed_jobs_in_package(info.loads_ids[0])[0].failed_message ) + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + default_sql_configs=True, bucket_subset=(AZ_BUCKET,), subset=("databricks",) + ), + ids=lambda x: x.name, +) +def test_databricks_oauth(destination_config: DestinationTestConfiguration) -> None: + from dlt.destinations import databricks, filesystem + from dlt.destinations.impl.databricks.configuration import DatabricksCredentials + from dlt import pipeline + + cred = DatabricksCredentials() + + stage = filesystem(AZ_BUCKET) + + dataset_name = "test_databricks_oauth2" + uniq_id() + bricks = databricks() + + pipeline = destination_config.setup_pipeline( + "test_databricks_oauth2", + dataset_name=dataset_name, + destination=bricks, + staging=stage + ) + + info = pipeline.run([1, 2, 3], table_name="digits", **destination_config.run_kwargs) + + assert info.has_failed_jobs is False + From a32fb5eb24d284019a4347decf7b2629c8793aba Mon Sep 17 00:00:00 2001 From: Julian Alves <28436330+donotpush@users.noreply.github.com> Date: Wed, 4 Dec 2024 13:00:55 +0100 Subject: [PATCH 02/32] make format --- dlt/common/libs/pyarrow.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/dlt/common/libs/pyarrow.py b/dlt/common/libs/pyarrow.py index 37268c0d2f..029cd75399 100644 --- a/dlt/common/libs/pyarrow.py +++ b/dlt/common/libs/pyarrow.py @@ -628,7 +628,14 @@ def row_tuples_to_arrow( " extracting an SQL VIEW that selects with cast." 
) json_str_array = pa.array( - [None if s is None else json.dumps(s) if not issubclass(type(s), set) else json.dumps(list(s)) for s in columnar_known_types[field.name]] + [ + ( + None + if s is None + else json.dumps(s) if not issubclass(type(s), set) else json.dumps(list(s)) + ) + for s in columnar_known_types[field.name] + ] ) columnar_known_types[field.name] = json_str_array From 04d55e33df49d99369684c731bf56dc1d43405f2 Mon Sep 17 00:00:00 2001 From: Julian Alves <28436330+donotpush@users.noreply.github.com> Date: Wed, 4 Dec 2024 13:01:17 +0100 Subject: [PATCH 03/32] add databricks-sdk dependency --- poetry.lock | 22 +++++++++++++++++++++- pyproject.toml | 1 + 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index 1bcff1de4a..0f3fcc9812 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2208,6 +2208,26 @@ nr-date = ">=2.0.0,<3.0.0" typeapi = ">=2.0.1,<3.0.0" typing-extensions = ">=3.10.0" +[[package]] +name = "databricks-sdk" +version = "0.38.0" +description = "Databricks SDK for Python (Beta)" +optional = false +python-versions = ">=3.7" +files = [ + {file = "databricks_sdk-0.38.0-py3-none-any.whl", hash = "sha256:3cc3808e7a294ccf99a3f19f1e86c8e36a5dc0845ac62112dcae2e625ef97c28"}, + {file = "databricks_sdk-0.38.0.tar.gz", hash = "sha256:65e505201b65d8a2b4110d3eabfebce5a25426d3ccdd5f8bc69eb03333ea1f39"}, +] + +[package.dependencies] +google-auth = ">=2.0,<3.0" +requests = ">=2.28.1,<3" + +[package.extras] +dev = ["autoflake", "databricks-connect", "httpx", "ipython", "ipywidgets", "isort", "langchain-openai", "openai", "pycodestyle", "pyfakefs", "pytest", "pytest-cov", "pytest-mock", "pytest-rerunfailures", "pytest-xdist", "requests-mock", "wheel", "yapf"] +notebook = ["ipython (>=8,<9)", "ipywidgets (>=8,<9)"] +openai = ["httpx", "langchain-openai", "openai"] + [[package]] name = "databricks-sql-connector" version = "2.9.6" @@ -10558,4 +10578,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "749c79ead9b1a800cbe5d9c93650e2ede7e9bcb240d07ff2d1787d032a0f2fa6" +content-hash = "4648f8a13a30d1c81031a3e4b4f8cc7dab46b495b30a7c6f54cdf4f415331d96" diff --git a/pyproject.toml b/pyproject.toml index a1a71a1a6a..548f39a3dc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -90,6 +90,7 @@ paramiko = {version = ">=3.3.0", optional = true} sqlglot = {version = ">=20.0.0", optional = true} db-dtypes = { version = ">=1.2.0", optional = true } aiohttp = { version = ">=3.9", optional = true } +databricks-sdk = "^0.38.0" [tool.poetry.extras] gcp = ["grpcio", "google-cloud-bigquery", "db-dtypes", "gcsfs"] From 604b6b2037c6278fca656c769ea3ce5ce863602a Mon Sep 17 00:00:00 2001 From: Julian Alves <28436330+donotpush@users.noreply.github.com> Date: Wed, 4 Dec 2024 13:01:41 +0100 Subject: [PATCH 04/32] delete poc code --- notes/oauth.py | 25 ------------------------- 1 file changed, 25 deletions(-) delete mode 100644 notes/oauth.py diff --git a/notes/oauth.py b/notes/oauth.py deleted file mode 100644 index 8524a4ac6a..0000000000 --- a/notes/oauth.py +++ /dev/null @@ -1,25 +0,0 @@ -import os -import toml -from databricks import sql -from databricks.sdk.core import Config, oauth_service_principal - -secrets = toml.load("../tests/.dlt/secrets.toml") -dd = secrets["destination"]["databricks"]["credentials"] - -def credential_provider(): - config = Config( - host = f"https://{dd['server_hostname']}", - client_id = dd["client_id"], - client_secret = dd["client_secret"] - ) - return oauth_service_principal(config) - 
-with sql.connect(server_hostname = dd["server_hostname"], - http_path = dd["http_path"], - credentials_provider = credential_provider, - auth_type = "databricks-oauth") as conn_oauth: - cursor = conn_oauth.cursor() - cursor.execute('select * from samples.nyctaxi.trips limit 5') - result = cursor.fetchall() - for row in result: - print(row) From fa59930125ed2eb811ab6235e1af8b60673604de Mon Sep 17 00:00:00 2001 From: Julian Alves <28436330+donotpush@users.noreply.github.com> Date: Wed, 4 Dec 2024 14:39:47 +0100 Subject: [PATCH 05/32] feat: databricks oauth impl --- .../impl/databricks/configuration.py | 26 ++++++++++++++++++- .../test_databricks_configuration.py | 12 +++++++++ .../load/pipeline/test_databricks_pipeline.py | 26 +++++-------------- 3 files changed, 44 insertions(+), 20 deletions(-) diff --git a/dlt/destinations/impl/databricks/configuration.py b/dlt/destinations/impl/databricks/configuration.py index c95b6eba4c..03f8dfffef 100644 --- a/dlt/destinations/impl/databricks/configuration.py +++ b/dlt/destinations/impl/databricks/configuration.py @@ -4,7 +4,9 @@ from dlt.common.typing import TSecretStrValue from dlt.common.configuration.specs.base_configuration import CredentialsConfiguration, configspec from dlt.common.destination.reference import DestinationClientDwhWithStagingConfiguration +from dlt.common.configuration.exceptions import ConfigurationValueError +from databricks.sdk.core import Config, oauth_service_principal, CredentialsProvider DATABRICKS_APPLICATION_ID = "dltHub_dlt" @@ -15,6 +17,9 @@ class DatabricksCredentials(CredentialsConfiguration): server_hostname: str = None http_path: str = None access_token: Optional[TSecretStrValue] = None + auth_type: str = None + client_id: Optional[TSecretStrValue] = None + client_secret: Optional[TSecretStrValue] = None http_headers: Optional[Dict[str, str]] = None session_configuration: Optional[Dict[str, Any]] = None """Dict of session parameters that will be passed to `databricks.sql.connect`""" @@ -30,17 +35,36 @@ class DatabricksCredentials(CredentialsConfiguration): "access_token", ] + def on_resolved(self) -> None: + if self.auth_type and self.auth_type != "databricks-oauth": + raise ConfigurationValueError( + "Invalid auth_type detected: only 'databricks-oauth' is supported. " + "If you are using an access token for authentication, omit the 'auth_type' field." 
+ ) + + def _get_oauth_credentials(self) -> Optional[CredentialsProvider]: + config = Config( + host=f"https://{self.server_hostname}", + client_id=self.client_id, + client_secret=self.client_secret, + ) + return oauth_service_principal(config) + def to_connector_params(self) -> Dict[str, Any]: conn_params = dict( catalog=self.catalog, server_hostname=self.server_hostname, http_path=self.http_path, - access_token=self.access_token, session_configuration=self.session_configuration or {}, _socket_timeout=self.socket_timeout, **(self.connection_parameters or {}), ) + if self.auth_type == "databricks-oauth" and self.client_id and self.client_secret: + conn_params["credentials_provider"] = self._get_oauth_credentials + else: + conn_params["access_token"] = self.access_token + if self.user_agent_entry: conn_params["_user_agent_entry"] = ( conn_params.get("_user_agent_entry") or self.user_agent_entry diff --git a/tests/load/databricks/test_databricks_configuration.py b/tests/load/databricks/test_databricks_configuration.py index e27da4db2a..19ec5c5e73 100644 --- a/tests/load/databricks/test_databricks_configuration.py +++ b/tests/load/databricks/test_databricks_configuration.py @@ -62,6 +62,18 @@ def test_databricks_configuration() -> None: assert config.is_staging_external_location is None +def test_databricks_oauth_config() -> None: + from dlt.common.configuration.exceptions import ConfigurationValueError + + os.environ["DESTINATION__DATABRICKS__CREDENTIALS__AUTH_TYPE"] = "oauth" + with pytest.raises(ConfigurationValueError): + bricks = databricks() + bricks.configuration(None, accept_partial=True) + + os.environ["DESTINATION__DATABRICKS__CREDENTIALS__AUTH_TYPE"] = "databricks-oauth" + bricks.configuration(None, accept_partial=True) + + def test_databricks_abfss_converter() -> None: with pytest.raises(TerminalValueError): DatabricksLoadJob.ensure_databricks_abfss_url("az://dlt-ci-test-bucket") diff --git a/tests/load/pipeline/test_databricks_pipeline.py b/tests/load/pipeline/test_databricks_pipeline.py index a722b8a42a..488c7d05a8 100644 --- a/tests/load/pipeline/test_databricks_pipeline.py +++ b/tests/load/pipeline/test_databricks_pipeline.py @@ -10,7 +10,6 @@ ) from tests.pipeline.utils import assert_load_info - # mark all tests as essential, do not remove pytestmark = pytest.mark.essential @@ -149,31 +148,20 @@ def test_databricks_gcs_external_location(destination_config: DestinationTestCon @pytest.mark.parametrize( "destination_config", - destinations_configs( - default_sql_configs=True, bucket_subset=(AZ_BUCKET,), subset=("databricks",) - ), + destinations_configs(default_sql_configs=True, subset=("databricks",)), ids=lambda x: x.name, ) def test_databricks_oauth(destination_config: DestinationTestConfiguration) -> None: - from dlt.destinations import databricks, filesystem - from dlt.destinations.impl.databricks.configuration import DatabricksCredentials - from dlt import pipeline - - cred = DatabricksCredentials() - - stage = filesystem(AZ_BUCKET) - - dataset_name = "test_databricks_oauth2" + uniq_id() - bricks = databricks() + os.environ["DESTINATION__DATABRICKS__CREDENTIALS__AUTH_TYPE"] = "databricks-oauth" + dataset_name = "test_databricks_oauth" + uniq_id() pipeline = destination_config.setup_pipeline( - "test_databricks_oauth2", - dataset_name=dataset_name, - destination=bricks, - staging=stage + "test_databricks_oauth", dataset_name=dataset_name, destination="databricks" ) info = pipeline.run([1, 2, 3], table_name="digits", **destination_config.run_kwargs) - assert 
info.has_failed_jobs is False
+    with pipeline.sql_client() as client:
+        rows = client.execute_sql(f"select * from {dataset_name}.digits")
+        assert len(rows) == 3
From 342b86c8deae5259287fc7ddb6f9855366b41ab9 Mon Sep 17 00:00:00 2001
From: Julian Alves <28436330+donotpush@users.noreply.github.com>
Date: Wed, 4 Dec 2024 16:29:19 +0100
Subject: [PATCH 06/32] docs: databricks oauth M2M

---
 .../dlt-ecosystem/destinations/databricks.md  | 28 ++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/docs/website/docs/dlt-ecosystem/destinations/databricks.md b/docs/website/docs/dlt-ecosystem/destinations/databricks.md
index 513a3b792f..c167b35376 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/databricks.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/databricks.md
@@ -90,6 +90,32 @@ If you already have your Databricks workspace set up, you can skip to the [Loade
 Click your email in the top right corner and go to "User Settings". Go to "Developer" -> "Access Tokens".
 Generate a new token and save it. You will use it in your `dlt` configuration.
 
+## OAuth M2M (Machine-to-Machine) Authentication
+
+You can authenticate to Databricks using a service principal via OAuth M2M. This method allows for secure, programmatic access to Databricks resources without requiring a user-managed personal access token.
+
+### Create a Service Principal in Databricks
+Follow the instructions in the Databricks documentation to create a service principal and retrieve the `client_id` and `client_secret`:
+
+[Authenticate access to Databricks using OAuth M2M](https://docs.databricks.com/en/dev-tools/auth/oauth-m2m.html)
+
+Once you have the service principal credentials, update your `secrets.toml` as shown below. The `auth_type` must be set to "databricks-oauth" to enable this authentication method.
+
+### Configuration
+
+Add the following fields to your `.dlt/secrets.toml` file:
+```toml
+[destination.databricks.credentials]
+server_hostname = "MY_DATABRICKS.azuredatabricks.net"
+http_path = "/sql/1.0/warehouses/12345"
+catalog = "my_catalog"
+auth_type = "databricks-oauth"
+client_id = "XXX"
+client_secret = "XXX"
+```
+
+
+
 ## Loader setup guide
 
 **1. Initialize a project with a pipeline that loads to Databricks by running**
@@ -118,7 +144,7 @@ Example:
 [destination.databricks.credentials]
 server_hostname = "MY_DATABRICKS.azuredatabricks.net"
 http_path = "/sql/1.0/warehouses/12345"
-access_token = "MY_ACCESS_TOKEN"
+access_token = "MY_ACCESS_TOKEN" # Remove if using OAuth
 catalog = "my_catalog"
 ```
 
From 3b8bef17bf7f1c30227c8826135c740248fd1cb1 Mon Sep 17 00:00:00 2001
From: Julian Alves <28436330+donotpush@users.noreply.github.com>
Date: Wed, 4 Dec 2024 17:15:45 +0100
Subject: [PATCH 07/32] fix: poetry lock

---
 poetry.lock | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 102 insertions(+), 2 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 87536d28f8..7657aa734a 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
[[package]] name = "about-time" @@ -3920,6 +3920,106 @@ files = [ {file = "google_re2-1.1-4-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f4d4f0823e8b2f6952a145295b1ff25245ce9bb136aff6fe86452e507d4c1dd"}, {file = "google_re2-1.1-4-cp39-cp39-win32.whl", hash = "sha256:1afae56b2a07bb48cfcfefaa15ed85bae26a68f5dc7f9e128e6e6ea36914e847"}, {file = "google_re2-1.1-4-cp39-cp39-win_amd64.whl", hash = "sha256:aa7d6d05911ab9c8adbf3c225a7a120ab50fd2784ac48f2f0d140c0b7afc2b55"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:222fc2ee0e40522de0b21ad3bc90ab8983be3bf3cec3d349c80d76c8bb1a4beb"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:d4763b0b9195b72132a4e7de8e5a9bf1f05542f442a9115aa27cfc2a8004f581"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:209649da10c9d4a93d8a4d100ecbf9cc3b0252169426bec3e8b4ad7e57d600cf"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:68813aa333c1604a2df4a495b2a6ed065d7c8aebf26cc7e7abb5a6835d08353c"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:370a23ec775ad14e9d1e71474d56f381224dcf3e72b15d8ca7b4ad7dd9cd5853"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:14664a66a3ddf6bc9e56f401bf029db2d169982c53eff3f5876399104df0e9a6"}, + {file = "google_re2-1.1-5-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3ea3722cc4932cbcebd553b69dce1b4a73572823cff4e6a244f1c855da21d511"}, + {file = "google_re2-1.1-5-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e14bb264c40fd7c627ef5678e295370cd6ba95ca71d835798b6e37502fc4c690"}, + {file = "google_re2-1.1-5-cp310-cp310-win32.whl", hash = "sha256:39512cd0151ea4b3969c992579c79b423018b464624ae955be685fc07d94556c"}, + {file = "google_re2-1.1-5-cp310-cp310-win_amd64.whl", hash = "sha256:ac66537aa3bc5504320d922b73156909e3c2b6da19739c866502f7827b3f9fdf"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:5b5ea68d54890c9edb1b930dcb2658819354e5d3f2201f811798bbc0a142c2b4"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:33443511b6b83c35242370908efe2e8e1e7cae749c766b2b247bf30e8616066c"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:413d77bdd5ba0bfcada428b4c146e87707452ec50a4091ec8e8ba1413d7e0619"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:5171686e43304996a34baa2abcee6f28b169806d0e583c16d55e5656b092a414"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3b284db130283771558e31a02d8eb8fb756156ab98ce80035ae2e9e3a5f307c4"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:296e6aed0b169648dc4b870ff47bd34c702a32600adb9926154569ef51033f47"}, + {file = "google_re2-1.1-5-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:38d50e68ead374160b1e656bbb5d101f0b95fb4cc57f4a5c12100155001480c5"}, + {file = "google_re2-1.1-5-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2a0416a35921e5041758948bcb882456916f22845f66a93bc25070ef7262b72a"}, + {file = "google_re2-1.1-5-cp311-cp311-win32.whl", hash = "sha256:a1d59568bbb5de5dd56dd6cdc79907db26cce63eb4429260300c65f43469e3e7"}, + {file = "google_re2-1.1-5-cp311-cp311-win_amd64.whl", hash = "sha256:72f5a2f179648b8358737b2b493549370debd7d389884a54d331619b285514e3"}, + {file = 
"google_re2-1.1-5-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:cbc72c45937b1dc5acac3560eb1720007dccca7c9879138ff874c7f6baf96005"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:5fadd1417fbef7235fa9453dba4eb102e6e7d94b1e4c99d5fa3dd4e288d0d2ae"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:040f85c63cc02696485b59b187a5ef044abe2f99b92b4fb399de40b7d2904ccc"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:64e3b975ee6d9bbb2420494e41f929c1a0de4bcc16d86619ab7a87f6ea80d6bd"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:8ee370413e00f4d828eaed0e83b8af84d7a72e8ee4f4bd5d3078bc741dfc430a"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:5b89383001079323f693ba592d7aad789d7a02e75adb5d3368d92b300f5963fd"}, + {file = "google_re2-1.1-5-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:63cb4fdfbbda16ae31b41a6388ea621510db82feb8217a74bf36552ecfcd50ad"}, + {file = "google_re2-1.1-5-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9ebedd84ae8be10b7a71a16162376fd67a2386fe6361ef88c622dcf7fd679daf"}, + {file = "google_re2-1.1-5-cp312-cp312-win32.whl", hash = "sha256:c8e22d1692bc2c81173330c721aff53e47ffd3c4403ff0cd9d91adfd255dd150"}, + {file = "google_re2-1.1-5-cp312-cp312-win_amd64.whl", hash = "sha256:5197a6af438bb8c4abda0bbe9c4fbd6c27c159855b211098b29d51b73e4cbcf6"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:b6727e0b98417e114b92688ad2aa256102ece51f29b743db3d831df53faf1ce3"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:711e2b6417eb579c61a4951029d844f6b95b9b373b213232efd413659889a363"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:71ae8b3df22c5c154c8af0f0e99d234a450ef1644393bc2d7f53fc8c0a1e111c"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:94a04e214bc521a3807c217d50cf099bbdd0c0a80d2d996c0741dbb995b5f49f"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:a770f75358508a9110c81a1257721f70c15d9bb592a2fb5c25ecbd13566e52a5"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:07c9133357f7e0b17c6694d5dcb82e0371f695d7c25faef2ff8117ef375343ff"}, + {file = "google_re2-1.1-5-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:204ca6b1cf2021548f4a9c29ac015e0a4ab0a7b6582bf2183d838132b60c8fda"}, + {file = "google_re2-1.1-5-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f0b95857c2c654f419ca684ec38c9c3325c24e6ba7d11910a5110775a557bb18"}, + {file = "google_re2-1.1-5-cp38-cp38-win32.whl", hash = "sha256:347ac770e091a0364e822220f8d26ab53e6fdcdeaec635052000845c5a3fb869"}, + {file = "google_re2-1.1-5-cp38-cp38-win_amd64.whl", hash = "sha256:ec32bb6de7ffb112a07d210cf9f797b7600645c2d5910703fa07f456dd2150e0"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:eb5adf89060f81c5ff26c28e261e6b4997530a923a6093c9726b8dec02a9a326"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:a22630c9dd9ceb41ca4316bccba2643a8b1d5c198f21c00ed5b50a94313aaf10"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:544dc17fcc2d43ec05f317366375796351dec44058e1164e03c3f7d050284d58"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_13_0_x86_64.whl", hash = 
"sha256:19710af5ea88751c7768575b23765ce0dfef7324d2539de576f75cdc319d6654"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:f82995a205e08ad896f4bd5ce4847c834fab877e1772a44e5f262a647d8a1dec"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:63533c4d58da9dc4bc040250f1f52b089911699f0368e0e6e15f996387a984ed"}, + {file = "google_re2-1.1-5-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79e00fcf0cb04ea35a22b9014712d448725ce4ddc9f08cc818322566176ca4b0"}, + {file = "google_re2-1.1-5-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bc41afcefee2da6c4ed883a93d7f527c4b960cd1d26bbb0020a7b8c2d341a60a"}, + {file = "google_re2-1.1-5-cp39-cp39-win32.whl", hash = "sha256:486730b5e1f1c31b0abc6d80abe174ce4f1188fe17d1b50698f2bf79dc6e44be"}, + {file = "google_re2-1.1-5-cp39-cp39-win_amd64.whl", hash = "sha256:4de637ca328f1d23209e80967d1b987d6b352cd01b3a52a84b4d742c69c3da6c"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:621e9c199d1ff0fdb2a068ad450111a84b3bf14f96dfe5a8a7a0deae5f3f4cce"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:220acd31e7dde95373f97c3d1f3b3bd2532b38936af28b1917ee265d25bebbf4"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:db34e1098d164f76251a6ece30e8f0ddfd65bb658619f48613ce71acb3f9cbdb"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:5152bac41d8073977582f06257219541d0fc46ad99b0bbf30e8f60198a43b08c"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:6191294799e373ee1735af91f55abd23b786bdfd270768a690d9d55af9ea1b0d"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:070cbafbb4fecbb02e98feb28a1eb292fb880f434d531f38cc33ee314b521f1f"}, + {file = "google_re2-1.1-6-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8437d078b405a59a576cbed544490fe041140f64411f2d91012e8ec05ab8bf86"}, + {file = "google_re2-1.1-6-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f00f9a9af8896040e37896d9b9fc409ad4979f1ddd85bb188694a7d95ddd1164"}, + {file = "google_re2-1.1-6-cp310-cp310-win32.whl", hash = "sha256:df26345f229a898b4fd3cafd5f82259869388cee6268fc35af16a8e2293dd4e5"}, + {file = "google_re2-1.1-6-cp310-cp310-win_amd64.whl", hash = "sha256:3665d08262c57c9b28a5bdeb88632ad792c4e5f417e5645901695ab2624f5059"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:b26b869d8aa1d8fe67c42836bf3416bb72f444528ee2431cfb59c0d3e02c6ce3"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:41fd4486c57dea4f222a6bb7f1ff79accf76676a73bdb8da0fcbd5ba73f8da71"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:0ee378e2e74e25960070c338c28192377c4dd41e7f4608f2688064bd2badc41e"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:a00cdbf662693367b36d075b29feb649fd7ee1b617cf84f85f2deebeda25fc64"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:4c09455014217a41499432b8c8f792f25f3df0ea2982203c3a8c8ca0e7895e69"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:6501717909185327935c7945e23bb5aa8fc7b6f237b45fe3647fa36148662158"}, + {file = "google_re2-1.1-6-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3510b04790355f199e7861c29234081900e1e1cbf2d1484da48aa0ba6d7356ab"}, + 
{file = "google_re2-1.1-6-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8c0e64c187ca406764f9e9ad6e750d62e69ed8f75bf2e865d0bfbc03b642361c"}, + {file = "google_re2-1.1-6-cp311-cp311-win32.whl", hash = "sha256:2a199132350542b0de0f31acbb3ca87c3a90895d1d6e5235f7792bb0af02e523"}, + {file = "google_re2-1.1-6-cp311-cp311-win_amd64.whl", hash = "sha256:83bdac8ceaece8a6db082ea3a8ba6a99a2a1ee7e9f01a9d6d50f79c6f251a01d"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:81985ff894cd45ab5a73025922ac28c0707759db8171dd2f2cc7a0e856b6b5ad"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:5635af26065e6b45456ccbea08674ae2ab62494008d9202df628df3b267bc095"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:813b6f04de79f4a8fdfe05e2cb33e0ccb40fe75d30ba441d519168f9d958bd54"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:5ec2f5332ad4fd232c3f2d6748c2c7845ccb66156a87df73abcc07f895d62ead"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:5a687b3b32a6cbb731647393b7c4e3fde244aa557f647df124ff83fb9b93e170"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:39a62f9b3db5d3021a09a47f5b91708b64a0580193e5352751eb0c689e4ad3d7"}, + {file = "google_re2-1.1-6-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ca0f0b45d4a1709cbf5d21f355e5809ac238f1ee594625a1e5ffa9ff7a09eb2b"}, + {file = "google_re2-1.1-6-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a64b3796a7a616c7861247bd061c9a836b5caf0d5963e5ea8022125601cf7b09"}, + {file = "google_re2-1.1-6-cp312-cp312-win32.whl", hash = "sha256:32783b9cb88469ba4cd9472d459fe4865280a6b1acdad4480a7b5081144c4eb7"}, + {file = "google_re2-1.1-6-cp312-cp312-win_amd64.whl", hash = "sha256:259ff3fd2d39035b9cbcbf375995f83fa5d9e6a0c5b94406ff1cc168ed41d6c6"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:e4711bcffe190acd29104d8ecfea0c0e42b754837de3fb8aad96e6cc3c613cdc"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:4d081cce43f39c2e813fe5990e1e378cbdb579d3f66ded5bade96130269ffd75"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:4f123b54d48450d2d6b14d8fad38e930fb65b5b84f1b022c10f2913bd956f5b5"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:e1928b304a2b591a28eb3175f9db7f17c40c12cf2d4ec2a85fdf1cc9c073ff91"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:3a69f76146166aec1173003c1f547931bdf288c6b135fda0020468492ac4149f"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:fc08c388f4ebbbca345e84a0c56362180d33d11cbe9ccfae663e4db88e13751e"}, + {file = "google_re2-1.1-6-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b057adf38ce4e616486922f2f47fc7d19c827ba0a7f69d540a3664eba2269325"}, + {file = "google_re2-1.1-6-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4138c0b933ab099e96f5d8defce4486f7dfd480ecaf7f221f2409f28022ccbc5"}, + {file = "google_re2-1.1-6-cp38-cp38-win32.whl", hash = "sha256:9693e45b37b504634b1abbf1ee979471ac6a70a0035954592af616306ab05dd6"}, + {file = "google_re2-1.1-6-cp38-cp38-win_amd64.whl", hash = "sha256:5674d437baba0ea287a5a7f8f81f24265d6ae8f8c09384e2ef7b6f84b40a7826"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_12_0_arm64.whl", hash = 
"sha256:7783137cb2e04f458a530c6d0ee9ef114815c1d48b9102f023998c371a3b060e"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:a49b7153935e7a303675f4deb5f5d02ab1305adefc436071348706d147c889e0"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:a96a8bb309182090704593c60bdb369a2756b38fe358bbf0d40ddeb99c71769f"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:dff3d4be9f27ef8ec3705eed54f19ef4ab096f5876c15fe011628c69ba3b561c"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:40f818b0b39e26811fa677978112a8108269977fdab2ba0453ac4363c35d9e66"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:8a7e53538cdb40ef4296017acfbb05cab0c19998be7552db1cfb85ba40b171b9"}, + {file = "google_re2-1.1-6-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6ee18e7569fb714e5bb8c42809bf8160738637a5e71ed5a4797757a1fb4dc4de"}, + {file = "google_re2-1.1-6-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1cda4f6d1a7d5b43ea92bc395f23853fba0caf8b1e1efa6e8c48685f912fcb89"}, + {file = "google_re2-1.1-6-cp39-cp39-win32.whl", hash = "sha256:6a9cdbdc36a2bf24f897be6a6c85125876dc26fea9eb4247234aec0decbdccfd"}, + {file = "google_re2-1.1-6-cp39-cp39-win_amd64.whl", hash = "sha256:73f646cecfad7cc5b4330b4192c25f2e29730a3b8408e089ffd2078094208196"}, ] [[package]] @@ -10538,4 +10638,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "4648f8a13a30d1c81031a3e4b4f8cc7dab46b495b30a7c6f54cdf4f415331d96" +content-hash = "082e734b54da834eeaf37b0cb3f4bfa1d58671ae2335aa29d18e8673a66f49ae" From f92507f3e0bff44e9210ea522037fd979c22ea96 Mon Sep 17 00:00:00 2001 From: Julian Alves <28436330+donotpush@users.noreply.github.com> Date: Wed, 4 Dec 2024 19:29:20 +0100 Subject: [PATCH 08/32] make lint --- .../impl/databricks/configuration.py | 8 ++++---- dlt/destinations/impl/databricks/sql_client.py | 17 ++++++++--------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/dlt/destinations/impl/databricks/configuration.py b/dlt/destinations/impl/databricks/configuration.py index 03f8dfffef..4221f03215 100644 --- a/dlt/destinations/impl/databricks/configuration.py +++ b/dlt/destinations/impl/databricks/configuration.py @@ -1,12 +1,12 @@ import dataclasses -from typing import ClassVar, Final, Optional, Any, Dict, List +from typing import Callable, ClassVar, Final, Optional, Any, Dict, List, cast from dlt.common.typing import TSecretStrValue from dlt.common.configuration.specs.base_configuration import CredentialsConfiguration, configspec from dlt.common.destination.reference import DestinationClientDwhWithStagingConfiguration from dlt.common.configuration.exceptions import ConfigurationValueError -from databricks.sdk.core import Config, oauth_service_principal, CredentialsProvider +from databricks.sdk.core import Config, oauth_service_principal DATABRICKS_APPLICATION_ID = "dltHub_dlt" @@ -42,13 +42,13 @@ def on_resolved(self) -> None: "If you are using an access token for authentication, omit the 'auth_type' field." 
) - def _get_oauth_credentials(self) -> Optional[CredentialsProvider]: + def _get_oauth_credentials(self) -> Optional[Callable[[], Dict[str, str]]]: config = Config( host=f"https://{self.server_hostname}", client_id=self.client_id, client_secret=self.client_secret, ) - return oauth_service_principal(config) + return cast(Callable[[], Dict[str, str]], oauth_service_principal(config)) def to_connector_params(self) -> Dict[str, Any]: conn_params = dict( diff --git a/dlt/destinations/impl/databricks/sql_client.py b/dlt/destinations/impl/databricks/sql_client.py index 8bff4e0d73..010652dc48 100644 --- a/dlt/destinations/impl/databricks/sql_client.py +++ b/dlt/destinations/impl/databricks/sql_client.py @@ -13,8 +13,7 @@ Dict, ) - -from databricks import sql as databricks_lib +import databricks.sql from databricks.sql.client import ( Connection as DatabricksSqlConnection, Cursor as DatabricksSqlCursor, @@ -60,7 +59,7 @@ def iter_df(self, chunk_size: int) -> Generator[DataFrame, None, None]: class DatabricksSqlClient(SqlClientBase[DatabricksSqlConnection], DBTransaction): - dbapi: ClassVar[DBApi] = databricks_lib + dbapi: ClassVar[DBApi] = databricks.sql def __init__( self, @@ -75,7 +74,7 @@ def __init__( def open_connection(self) -> DatabricksSqlConnection: conn_params = self.credentials.to_connector_params() - self._conn = databricks_lib.connect( + self._conn = databricks.sql.connect( **conn_params, schema=self.dataset_name, use_inline_params="silent" ) return self._conn @@ -156,7 +155,7 @@ def catalog_name(self, escape: bool = True) -> Optional[str]: @staticmethod def _make_database_exception(ex: Exception) -> Exception: - if isinstance(ex, databricks_lib.ServerOperationError): + if isinstance(ex, databricks.sql.ServerOperationError): if "TABLE_OR_VIEW_NOT_FOUND" in str(ex): return DatabaseUndefinedRelation(ex) elif "SCHEMA_NOT_FOUND" in str(ex): @@ -164,15 +163,15 @@ def _make_database_exception(ex: Exception) -> Exception: elif "PARSE_SYNTAX_ERROR" in str(ex): return DatabaseTransientException(ex) return DatabaseTerminalException(ex) - elif isinstance(ex, databricks_lib.OperationalError): + elif isinstance(ex, databricks.sql.OperationalError): return DatabaseTransientException(ex) - elif isinstance(ex, (databricks_lib.ProgrammingError, databricks_lib.IntegrityError)): + elif isinstance(ex, (databricks.sql.ProgrammingError, databricks.sql.IntegrityError)): return DatabaseTerminalException(ex) - elif isinstance(ex, databricks_lib.DatabaseError): + elif isinstance(ex, databricks.sql.DatabaseError): return DatabaseTransientException(ex) else: return DatabaseTransientException(ex) @staticmethod def is_dbapi_exception(ex: Exception) -> bool: - return isinstance(ex, databricks_lib.DatabaseError) + return isinstance(ex, databricks.sql.DatabaseError) From 9c25d72dad6ac563931a8103964fb066bfb0053d Mon Sep 17 00:00:00 2001 From: Julian Alves <28436330+donotpush@users.noreply.github.com> Date: Thu, 5 Dec 2024 17:00:55 +0100 Subject: [PATCH 09/32] fix: auth_type optional --- dlt/destinations/impl/databricks/configuration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlt/destinations/impl/databricks/configuration.py b/dlt/destinations/impl/databricks/configuration.py index 4221f03215..2daf639473 100644 --- a/dlt/destinations/impl/databricks/configuration.py +++ b/dlt/destinations/impl/databricks/configuration.py @@ -17,7 +17,7 @@ class DatabricksCredentials(CredentialsConfiguration): server_hostname: str = None http_path: str = None access_token: Optional[TSecretStrValue] = None - 
auth_type: str = None + auth_type: Optional[str] = None client_id: Optional[TSecretStrValue] = None client_secret: Optional[TSecretStrValue] = None http_headers: Optional[Dict[str, str]] = None From f75b8c8c4a0152a8e020f8d57bd2a2ad385c8f93 Mon Sep 17 00:00:00 2001 From: Julian Alves <28436330+donotpush@users.noreply.github.com> Date: Thu, 5 Dec 2024 17:09:40 +0100 Subject: [PATCH 10/32] fix: databricks import lint --- dlt/destinations/impl/databricks/sql_client.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/dlt/destinations/impl/databricks/sql_client.py b/dlt/destinations/impl/databricks/sql_client.py index 010652dc48..c42c9701e6 100644 --- a/dlt/destinations/impl/databricks/sql_client.py +++ b/dlt/destinations/impl/databricks/sql_client.py @@ -13,7 +13,8 @@ Dict, ) -import databricks.sql +from databricks import sql as databricks_lib # type: ignore[attr-defined] + from databricks.sql.client import ( Connection as DatabricksSqlConnection, Cursor as DatabricksSqlCursor, @@ -59,7 +60,7 @@ def iter_df(self, chunk_size: int) -> Generator[DataFrame, None, None]: class DatabricksSqlClient(SqlClientBase[DatabricksSqlConnection], DBTransaction): - dbapi: ClassVar[DBApi] = databricks.sql + dbapi: ClassVar[DBApi] = databricks_lib def __init__( self, @@ -74,7 +75,7 @@ def __init__( def open_connection(self) -> DatabricksSqlConnection: conn_params = self.credentials.to_connector_params() - self._conn = databricks.sql.connect( + self._conn = databricks_lib.sql.connect( **conn_params, schema=self.dataset_name, use_inline_params="silent" ) return self._conn @@ -155,7 +156,7 @@ def catalog_name(self, escape: bool = True) -> Optional[str]: @staticmethod def _make_database_exception(ex: Exception) -> Exception: - if isinstance(ex, databricks.sql.ServerOperationError): + if isinstance(ex, databricks_lib.ServerOperationError): if "TABLE_OR_VIEW_NOT_FOUND" in str(ex): return DatabaseUndefinedRelation(ex) elif "SCHEMA_NOT_FOUND" in str(ex): @@ -163,15 +164,15 @@ def _make_database_exception(ex: Exception) -> Exception: elif "PARSE_SYNTAX_ERROR" in str(ex): return DatabaseTransientException(ex) return DatabaseTerminalException(ex) - elif isinstance(ex, databricks.sql.OperationalError): + elif isinstance(ex, databricks_lib.OperationalError): return DatabaseTransientException(ex) - elif isinstance(ex, (databricks.sql.ProgrammingError, databricks.sql.IntegrityError)): + elif isinstance(ex, (databricks_lib.ProgrammingError, databricks_lib.IntegrityError)): return DatabaseTerminalException(ex) - elif isinstance(ex, databricks.sql.DatabaseError): + elif isinstance(ex, databricks_lib.DatabaseError): return DatabaseTransientException(ex) else: return DatabaseTransientException(ex) @staticmethod def is_dbapi_exception(ex: Exception) -> bool: - return isinstance(ex, databricks.sql.DatabaseError) + return isinstance(ex, databricks_lib.DatabaseError) From ddebcf2aad10c7d062863b0c29aeeb5d761f88c2 Mon Sep 17 00:00:00 2001 From: Julian Alves <28436330+donotpush@users.noreply.github.com> Date: Thu, 5 Dec 2024 17:30:34 +0100 Subject: [PATCH 11/32] fix: databricks-sdk package optional --- poetry.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index 7657aa734a..e0a6d2911b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2212,7 +2212,7 @@ typing-extensions = ">=3.10.0" name = "databricks-sdk" version = "0.38.0" description = "Databricks SDK for Python (Beta)" -optional = false +optional = true python-versions = ">=3.7" 
files = [ {file = "databricks_sdk-0.38.0-py3-none-any.whl", hash = "sha256:3cc3808e7a294ccf99a3f19f1e86c8e36a5dc0845ac62112dcae2e625ef97c28"}, @@ -10638,4 +10638,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "082e734b54da834eeaf37b0cb3f4bfa1d58671ae2335aa29d18e8673a66f49ae" +content-hash = "b1448d6d7a850fe2b8c35c0d011d8beea502a28d2f5e77b9ad8eb90b326f7946" diff --git a/pyproject.toml b/pyproject.toml index cbc349c94f..5b53789f34 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -90,7 +90,7 @@ paramiko = {version = ">=3.3.0", optional = true} sqlglot = {version = ">=20.0.0", optional = true} db-dtypes = { version = ">=1.2.0", optional = true } aiohttp = { version = ">=3.9", optional = true } -databricks-sdk = "^0.38.0" +databricks-sdk = {version = ">=0.38.0", optional = true} [tool.poetry.extras] gcp = ["grpcio", "google-cloud-bigquery", "db-dtypes", "gcsfs"] From fe5cf1ed0e0a63e6cdda2ebb6beb59247cad6475 Mon Sep 17 00:00:00 2001 From: Julian Alves <28436330+donotpush@users.noreply.github.com> Date: Thu, 5 Dec 2024 17:31:06 +0100 Subject: [PATCH 12/32] fix: package namespace path --- dlt/destinations/impl/databricks/sql_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlt/destinations/impl/databricks/sql_client.py b/dlt/destinations/impl/databricks/sql_client.py index c42c9701e6..0a1e898928 100644 --- a/dlt/destinations/impl/databricks/sql_client.py +++ b/dlt/destinations/impl/databricks/sql_client.py @@ -75,7 +75,7 @@ def __init__( def open_connection(self) -> DatabricksSqlConnection: conn_params = self.credentials.to_connector_params() - self._conn = databricks_lib.sql.connect( + self._conn = databricks_lib.connect( **conn_params, schema=self.dataset_name, use_inline_params="silent" ) return self._conn From 7793ffc4bad5bebbb8fe3be205e7bf7e7f04ef7c Mon Sep 17 00:00:00 2001 From: Julian Alves <28436330+donotpush@users.noreply.github.com> Date: Thu, 5 Dec 2024 17:36:07 +0100 Subject: [PATCH 13/32] fix: poetry dependencies --- poetry.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index e0a6d2911b..88a5495c6e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -10612,7 +10612,7 @@ az = ["adlfs"] bigquery = ["db-dtypes", "gcsfs", "google-cloud-bigquery", "grpcio", "pyarrow"] cli = ["cron-descriptor", "pipdeptree"] clickhouse = ["adlfs", "clickhouse-connect", "clickhouse-driver", "gcsfs", "pyarrow", "s3fs"] -databricks = ["databricks-sql-connector"] +databricks = ["databricks-sdk", "databricks-sql-connector"] deltalake = ["deltalake", "pyarrow"] dremio = ["pyarrow"] duckdb = ["duckdb"] @@ -10638,4 +10638,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "b1448d6d7a850fe2b8c35c0d011d8beea502a28d2f5e77b9ad8eb90b326f7946" +content-hash = "fa1e98f02135eb816a2e7fe72974bf7acb09d2a5ba164836dafa03276e017a47" diff --git a/pyproject.toml b/pyproject.toml index 5b53789f34..c4e3c127d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -113,7 +113,7 @@ weaviate = ["weaviate-client"] mssql = ["pyodbc"] synapse = ["pyodbc", "adlfs", "pyarrow"] qdrant = ["qdrant-client"] -databricks = ["databricks-sql-connector"] +databricks = ["databricks-sql-connector", "databricks-sdk"] clickhouse = ["clickhouse-driver", "clickhouse-connect", "s3fs", "gcsfs", "adlfs", "pyarrow"] dremio = ["pyarrow"] lancedb = ["lancedb", "pyarrow", "tantivy"] From 31e5e1f90afcc8c91462c2b963b5f05005a535c1 
Mon Sep 17 00:00:00 2001 From: Julian Alves <28436330+donotpush@users.noreply.github.com> Date: Thu, 5 Dec 2024 17:46:59 +0100 Subject: [PATCH 14/32] fix: databricks dependency CI/CD --- .github/workflows/test_common.yml | 2 +- .github/workflows/test_doc_snippets.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_common.yml b/.github/workflows/test_common.yml index 359ed43095..1fbaa0d6b2 100644 --- a/.github/workflows/test_common.yml +++ b/.github/workflows/test_common.yml @@ -90,7 +90,7 @@ jobs: # key: venv-${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }} - name: Install dependencies - run: poetry install --no-interaction --with sentry-sdk + run: poetry install --no-interaction -E databricks --with sentry-sdk - run: | poetry run pytest tests/common tests/normalize tests/reflection tests/load/test_dummy_client.py tests/extract/test_extract.py tests/extract/test_sources.py tests/pipeline/test_pipeline_state.py diff --git a/.github/workflows/test_doc_snippets.yml b/.github/workflows/test_doc_snippets.yml index ae06a72df9..552f62fea5 100644 --- a/.github/workflows/test_doc_snippets.yml +++ b/.github/workflows/test_doc_snippets.yml @@ -91,7 +91,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E duckdb -E weaviate -E parquet -E qdrant -E bigquery -E postgres -E lancedb --with docs,sentry-sdk --without airflow -E s3 + run: poetry install --no-interaction -E databricks -E duckdb -E weaviate -E parquet -E qdrant -E bigquery -E postgres -E lancedb --with docs,sentry-sdk --without airflow -E s3 - name: create secrets.toml for examples run: pwd && echo "$DLT_SECRETS_TOML" > docs/examples/.dlt/secrets.toml From b8f782d9786e30677780b2012ab590027d315628 Mon Sep 17 00:00:00 2001 From: Julian Alves <28436330+donotpush@users.noreply.github.com> Date: Mon, 9 Dec 2024 16:58:08 +0100 Subject: [PATCH 15/32] chore: add databricks dependency --- .github/workflows/test_airflow.yml | 2 +- .github/workflows/test_common.yml | 6 +++--- .github/workflows/test_dbt_runner.yml | 2 +- .github/workflows/test_destination_athena.yml | 2 +- .github/workflows/test_destination_athena_iceberg.yml | 2 +- .github/workflows/test_destination_bigquery.yml | 2 +- .github/workflows/test_destination_clickhouse.yml | 2 +- .github/workflows/test_destination_dremio.yml | 2 +- .github/workflows/test_destination_lancedb.yml | 2 +- .github/workflows/test_destination_motherduck.yml | 2 +- .github/workflows/test_destination_mssql.yml | 2 +- .github/workflows/test_destination_snowflake.yml | 2 +- .github/workflows/test_destination_synapse.yml | 2 +- .github/workflows/test_destinations.yml | 2 +- .github/workflows/test_local_destinations.yml | 2 +- .github/workflows/test_local_sources.yml | 2 +- .github/workflows/test_sqlalchemy_destinations.yml | 2 +- 17 files changed, 19 insertions(+), 19 deletions(-) diff --git a/.github/workflows/test_airflow.yml b/.github/workflows/test_airflow.yml index 2a96c4475e..e5daca436b 100644 --- a/.github/workflows/test_airflow.yml +++ b/.github/workflows/test_airflow.yml @@ -46,7 +46,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-airflow-runner - name: Install dependencies - run: poetry install --no-interaction --with airflow --with pipeline -E duckdb -E parquet --with sentry-sdk + run: poetry install --no-interaction --with airflow --with pipeline -E databricks -E 
duckdb -E parquet --with sentry-sdk - run: | poetry run pytest tests/helpers/airflow_tests diff --git a/.github/workflows/test_common.yml b/.github/workflows/test_common.yml index 1fbaa0d6b2..164292cd86 100644 --- a/.github/workflows/test_common.yml +++ b/.github/workflows/test_common.yml @@ -103,7 +103,7 @@ jobs: shell: cmd - name: Install duckdb dependencies - run: poetry install --no-interaction -E duckdb --with sentry-sdk + run: poetry install --no-interaction -E databricks -E duckdb --with sentry-sdk - run: | poetry run pytest tests/pipeline/test_pipeline.py tests/pipeline/test_import_export_schema.py @@ -116,7 +116,7 @@ jobs: shell: cmd - name: Install pyarrow - run: poetry install --no-interaction -E duckdb -E cli -E parquet --with sentry-sdk && poetry run pip install pyarrow==15.0.2 + run: poetry install --no-interaction -E databricks -E duckdb -E cli -E parquet --with sentry-sdk && poetry run pip install pyarrow==15.0.2 - run: | poetry run pytest tests/pipeline/test_pipeline_extra.py -k arrow @@ -129,7 +129,7 @@ jobs: shell: cmd - name: Install pipeline and sources dependencies - run: poetry install --no-interaction -E duckdb -E cli -E parquet -E deltalake -E sql_database --with sentry-sdk,pipeline,sources && poetry run pip install pyarrow==15.0.2 + run: poetry install --no-interaction -E databricks -E duckdb -E cli -E parquet -E deltalake -E sql_database --with sentry-sdk,pipeline,sources && poetry run pip install pyarrow==15.0.2 - run: | poetry run pytest tests/extract tests/pipeline tests/libs tests/cli/common tests/destinations tests/sources diff --git a/.github/workflows/test_dbt_runner.yml b/.github/workflows/test_dbt_runner.yml index ad29909d9a..6c0f4e4753 100644 --- a/.github/workflows/test_dbt_runner.yml +++ b/.github/workflows/test_dbt_runner.yml @@ -60,7 +60,7 @@ jobs: - name: Install dependencies # install dlt with postgres support - run: poetry install --no-interaction -E postgres -E postgis --with sentry-sdk,dbt + run: poetry install --no-interaction -E databricks -E postgres -E postgis --with sentry-sdk,dbt - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_athena.yml b/.github/workflows/test_destination_athena.yml index 1169fab0de..fd74cafce6 100644 --- a/.github/workflows/test_destination_athena.yml +++ b/.github/workflows/test_destination_athena.yml @@ -67,7 +67,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E databricks -E athena --with sentry-sdk --with pipeline - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_athena_iceberg.yml b/.github/workflows/test_destination_athena_iceberg.yml index 7ccefcc055..7d8d1f430b 100644 --- a/.github/workflows/test_destination_athena_iceberg.yml +++ b/.github/workflows/test_destination_athena_iceberg.yml @@ -67,7 +67,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E databricks -E athena --with sentry-sdk --with pipeline - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_bigquery.yml 
b/.github/workflows/test_destination_bigquery.yml index 7afc9b8a00..878fefb9be 100644 --- a/.github/workflows/test_destination_bigquery.yml +++ b/.github/workflows/test_destination_bigquery.yml @@ -66,7 +66,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E bigquery --with providers -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E databricks -E bigquery --with providers -E parquet --with sentry-sdk --with pipeline - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_clickhouse.yml b/.github/workflows/test_destination_clickhouse.yml index 7f297db971..568cbd232a 100644 --- a/.github/workflows/test_destination_clickhouse.yml +++ b/.github/workflows/test_destination_clickhouse.yml @@ -61,7 +61,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E clickhouse --with providers -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E databricks -E clickhouse --with providers -E parquet --with sentry-sdk --with pipeline - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_dremio.yml b/.github/workflows/test_destination_dremio.yml index 45c6d17db1..354e3f10bb 100644 --- a/.github/workflows/test_destination_dremio.yml +++ b/.github/workflows/test_destination_dremio.yml @@ -65,7 +65,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E databricks -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline - run: | poetry run pytest tests/load --ignore tests/load/sources diff --git a/.github/workflows/test_destination_lancedb.yml b/.github/workflows/test_destination_lancedb.yml index 6be89d3de3..266b19ba1e 100644 --- a/.github/workflows/test_destination_lancedb.yml +++ b/.github/workflows/test_destination_lancedb.yml @@ -65,7 +65,7 @@ jobs: run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml - name: Install dependencies - run: poetry install --no-interaction -E lancedb -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E databricks -E lancedb -E parquet --with sentry-sdk --with pipeline - name: Install embedding provider dependencies run: poetry run pip install openai diff --git a/.github/workflows/test_destination_motherduck.yml b/.github/workflows/test_destination_motherduck.yml index 0014b17655..4c53c4cad4 100644 --- a/.github/workflows/test_destination_motherduck.yml +++ b/.github/workflows/test_destination_motherduck.yml @@ -64,7 +64,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-motherduck - name: Install dependencies - run: poetry install --no-interaction -E motherduck -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E databricks -E motherduck -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git 
a/.github/workflows/test_destination_mssql.yml b/.github/workflows/test_destination_mssql.yml index 8b899e7da2..323bf6dc1f 100644 --- a/.github/workflows/test_destination_mssql.yml +++ b/.github/workflows/test_destination_mssql.yml @@ -69,7 +69,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E mssql -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E databricks -E mssql -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_snowflake.yml b/.github/workflows/test_destination_snowflake.yml index a720c479bd..94956dfdc7 100644 --- a/.github/workflows/test_destination_snowflake.yml +++ b/.github/workflows/test_destination_snowflake.yml @@ -64,7 +64,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E snowflake -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E databricks -E snowflake -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_synapse.yml b/.github/workflows/test_destination_synapse.yml index be1b493916..5a4e6a659a 100644 --- a/.github/workflows/test_destination_synapse.yml +++ b/.github/workflows/test_destination_synapse.yml @@ -67,7 +67,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E synapse -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E databricks -E synapse -E parquet --with sentry-sdk --with pipeline - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml index 933248d994..1ab0ee35c8 100644 --- a/.github/workflows/test_destinations.yml +++ b/.github/workflows/test_destinations.yml @@ -78,7 +78,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E redshift -E postgis -E postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline -E deltalake + run: poetry install --no-interaction -E databricks -E redshift -E postgis -E postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline -E deltalake - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_local_destinations.yml b/.github/workflows/test_local_destinations.yml index 4947a46a3b..b2306f4121 100644 --- a/.github/workflows/test_local_destinations.yml +++ b/.github/workflows/test_local_destinations.yml @@ -95,7 +95,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E 
weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline -E deltalake + run: poetry install --no-interaction -E databricks -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline -E deltalake - name: Start SFTP server run: docker compose -f "tests/load/filesystem_sftp/docker-compose.yml" up -d diff --git a/.github/workflows/test_local_sources.yml b/.github/workflows/test_local_sources.yml index 39689f5c85..685f67d593 100644 --- a/.github/workflows/test_local_sources.yml +++ b/.github/workflows/test_local_sources.yml @@ -83,7 +83,7 @@ jobs: # TODO: which deps should we enable? - name: Install dependencies - run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E sql_database --with sentry-sdk,pipeline,sources + run: poetry install --no-interaction -E databricks -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E sql_database --with sentry-sdk,pipeline,sources # run sources tests in load against configured destinations - run: poetry run pytest tests/load/sources diff --git a/.github/workflows/test_sqlalchemy_destinations.yml b/.github/workflows/test_sqlalchemy_destinations.yml index c2572b322d..fc8706e89c 100644 --- a/.github/workflows/test_sqlalchemy_destinations.yml +++ b/.github/workflows/test_sqlalchemy_destinations.yml @@ -86,7 +86,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E parquet -E filesystem -E sqlalchemy -E cli --with sentry-sdk --with pipeline && poetry run pip install mysqlclient && poetry run pip install "sqlalchemy==${{ matrix.sqlalchemy }}" + run: poetry install --no-interaction -E databricks -E parquet -E filesystem -E sqlalchemy -E cli --with sentry-sdk --with pipeline && poetry run pip install mysqlclient && poetry run pip install "sqlalchemy==${{ matrix.sqlalchemy }}" - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml From 5435919e9d9b0795c382fbee5cf38fda4a138d24 Mon Sep 17 00:00:00 2001 From: Julian Alves <28436330+donotpush@users.noreply.github.com> Date: Wed, 11 Dec 2024 10:37:16 +0100 Subject: [PATCH 16/32] Revert "chore: add databricks dependency" This reverts commit b8f782d9786e30677780b2012ab590027d315628. 
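Because these workflow edits repeatedly add and then revert the `databricks` extra, it can be unclear which CI environment actually has the Databricks client libraries installed. A minimal probe sketch, assuming the module names this patch relies on elsewhere (`databricks.sql` and `databricks.sdk.core`) and an environment created with `poetry install -E databricks`:

```py
# a minimal check of the optional Databricks dependencies; module names are
# taken from the imports used elsewhere in this patch
import importlib

for module in ("databricks.sql", "databricks.sdk.core"):
    try:
        importlib.import_module(module)
        print(f"{module}: available")
    except ImportError as exc:
        print(f"{module}: missing ({exc})")
```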
--- .github/workflows/test_airflow.yml | 2 +- .github/workflows/test_common.yml | 6 +++--- .github/workflows/test_dbt_runner.yml | 2 +- .github/workflows/test_destination_athena.yml | 2 +- .github/workflows/test_destination_athena_iceberg.yml | 2 +- .github/workflows/test_destination_bigquery.yml | 2 +- .github/workflows/test_destination_clickhouse.yml | 2 +- .github/workflows/test_destination_dremio.yml | 2 +- .github/workflows/test_destination_lancedb.yml | 2 +- .github/workflows/test_destination_motherduck.yml | 2 +- .github/workflows/test_destination_mssql.yml | 2 +- .github/workflows/test_destination_snowflake.yml | 2 +- .github/workflows/test_destination_synapse.yml | 2 +- .github/workflows/test_destinations.yml | 2 +- .github/workflows/test_local_destinations.yml | 2 +- .github/workflows/test_local_sources.yml | 2 +- .github/workflows/test_sqlalchemy_destinations.yml | 2 +- 17 files changed, 19 insertions(+), 19 deletions(-) diff --git a/.github/workflows/test_airflow.yml b/.github/workflows/test_airflow.yml index e5daca436b..2a96c4475e 100644 --- a/.github/workflows/test_airflow.yml +++ b/.github/workflows/test_airflow.yml @@ -46,7 +46,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-airflow-runner - name: Install dependencies - run: poetry install --no-interaction --with airflow --with pipeline -E databricks -E duckdb -E parquet --with sentry-sdk + run: poetry install --no-interaction --with airflow --with pipeline -E duckdb -E parquet --with sentry-sdk - run: | poetry run pytest tests/helpers/airflow_tests diff --git a/.github/workflows/test_common.yml b/.github/workflows/test_common.yml index 164292cd86..1fbaa0d6b2 100644 --- a/.github/workflows/test_common.yml +++ b/.github/workflows/test_common.yml @@ -103,7 +103,7 @@ jobs: shell: cmd - name: Install duckdb dependencies - run: poetry install --no-interaction -E databricks -E duckdb --with sentry-sdk + run: poetry install --no-interaction -E duckdb --with sentry-sdk - run: | poetry run pytest tests/pipeline/test_pipeline.py tests/pipeline/test_import_export_schema.py @@ -116,7 +116,7 @@ jobs: shell: cmd - name: Install pyarrow - run: poetry install --no-interaction -E databricks -E duckdb -E cli -E parquet --with sentry-sdk && poetry run pip install pyarrow==15.0.2 + run: poetry install --no-interaction -E duckdb -E cli -E parquet --with sentry-sdk && poetry run pip install pyarrow==15.0.2 - run: | poetry run pytest tests/pipeline/test_pipeline_extra.py -k arrow @@ -129,7 +129,7 @@ jobs: shell: cmd - name: Install pipeline and sources dependencies - run: poetry install --no-interaction -E databricks -E duckdb -E cli -E parquet -E deltalake -E sql_database --with sentry-sdk,pipeline,sources && poetry run pip install pyarrow==15.0.2 + run: poetry install --no-interaction -E duckdb -E cli -E parquet -E deltalake -E sql_database --with sentry-sdk,pipeline,sources && poetry run pip install pyarrow==15.0.2 - run: | poetry run pytest tests/extract tests/pipeline tests/libs tests/cli/common tests/destinations tests/sources diff --git a/.github/workflows/test_dbt_runner.yml b/.github/workflows/test_dbt_runner.yml index 6c0f4e4753..ad29909d9a 100644 --- a/.github/workflows/test_dbt_runner.yml +++ b/.github/workflows/test_dbt_runner.yml @@ -60,7 +60,7 @@ jobs: - name: Install dependencies # install dlt with postgres support - run: poetry install --no-interaction -E databricks -E postgres -E postgis --with sentry-sdk,dbt + run: poetry install --no-interaction -E 
postgres -E postgis --with sentry-sdk,dbt - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_athena.yml b/.github/workflows/test_destination_athena.yml index fd74cafce6..1169fab0de 100644 --- a/.github/workflows/test_destination_athena.yml +++ b/.github/workflows/test_destination_athena.yml @@ -67,7 +67,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E databricks -E athena --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_athena_iceberg.yml b/.github/workflows/test_destination_athena_iceberg.yml index 7d8d1f430b..7ccefcc055 100644 --- a/.github/workflows/test_destination_athena_iceberg.yml +++ b/.github/workflows/test_destination_athena_iceberg.yml @@ -67,7 +67,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E databricks -E athena --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_bigquery.yml b/.github/workflows/test_destination_bigquery.yml index 878fefb9be..7afc9b8a00 100644 --- a/.github/workflows/test_destination_bigquery.yml +++ b/.github/workflows/test_destination_bigquery.yml @@ -66,7 +66,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E databricks -E bigquery --with providers -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E bigquery --with providers -E parquet --with sentry-sdk --with pipeline - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_clickhouse.yml b/.github/workflows/test_destination_clickhouse.yml index 568cbd232a..7f297db971 100644 --- a/.github/workflows/test_destination_clickhouse.yml +++ b/.github/workflows/test_destination_clickhouse.yml @@ -61,7 +61,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E databricks -E clickhouse --with providers -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E clickhouse --with providers -E parquet --with sentry-sdk --with pipeline - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_dremio.yml b/.github/workflows/test_destination_dremio.yml index 354e3f10bb..45c6d17db1 100644 --- a/.github/workflows/test_destination_dremio.yml +++ b/.github/workflows/test_destination_dremio.yml @@ -65,7 +65,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E databricks -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline - run: | 
poetry run pytest tests/load --ignore tests/load/sources diff --git a/.github/workflows/test_destination_lancedb.yml b/.github/workflows/test_destination_lancedb.yml index 266b19ba1e..6be89d3de3 100644 --- a/.github/workflows/test_destination_lancedb.yml +++ b/.github/workflows/test_destination_lancedb.yml @@ -65,7 +65,7 @@ jobs: run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml - name: Install dependencies - run: poetry install --no-interaction -E databricks -E lancedb -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E lancedb -E parquet --with sentry-sdk --with pipeline - name: Install embedding provider dependencies run: poetry run pip install openai diff --git a/.github/workflows/test_destination_motherduck.yml b/.github/workflows/test_destination_motherduck.yml index 4c53c4cad4..0014b17655 100644 --- a/.github/workflows/test_destination_motherduck.yml +++ b/.github/workflows/test_destination_motherduck.yml @@ -64,7 +64,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-motherduck - name: Install dependencies - run: poetry install --no-interaction -E databricks -E motherduck -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E motherduck -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_mssql.yml b/.github/workflows/test_destination_mssql.yml index 323bf6dc1f..8b899e7da2 100644 --- a/.github/workflows/test_destination_mssql.yml +++ b/.github/workflows/test_destination_mssql.yml @@ -69,7 +69,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E databricks -E mssql -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E mssql -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_snowflake.yml b/.github/workflows/test_destination_snowflake.yml index 94956dfdc7..a720c479bd 100644 --- a/.github/workflows/test_destination_snowflake.yml +++ b/.github/workflows/test_destination_snowflake.yml @@ -64,7 +64,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E databricks -E snowflake -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E snowflake -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_synapse.yml b/.github/workflows/test_destination_synapse.yml index 5a4e6a659a..be1b493916 100644 --- a/.github/workflows/test_destination_synapse.yml +++ b/.github/workflows/test_destination_synapse.yml @@ -67,7 +67,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E databricks -E synapse -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction 
-E synapse -E parquet --with sentry-sdk --with pipeline - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml index 1ab0ee35c8..933248d994 100644 --- a/.github/workflows/test_destinations.yml +++ b/.github/workflows/test_destinations.yml @@ -78,7 +78,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E databricks -E redshift -E postgis -E postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline -E deltalake + run: poetry install --no-interaction -E redshift -E postgis -E postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline -E deltalake - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_local_destinations.yml b/.github/workflows/test_local_destinations.yml index b2306f4121..4947a46a3b 100644 --- a/.github/workflows/test_local_destinations.yml +++ b/.github/workflows/test_local_destinations.yml @@ -95,7 +95,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E databricks -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline -E deltalake + run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline -E deltalake - name: Start SFTP server run: docker compose -f "tests/load/filesystem_sftp/docker-compose.yml" up -d diff --git a/.github/workflows/test_local_sources.yml b/.github/workflows/test_local_sources.yml index 685f67d593..39689f5c85 100644 --- a/.github/workflows/test_local_sources.yml +++ b/.github/workflows/test_local_sources.yml @@ -83,7 +83,7 @@ jobs: # TODO: which deps should we enable? 
- name: Install dependencies - run: poetry install --no-interaction -E databricks -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E sql_database --with sentry-sdk,pipeline,sources + run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E sql_database --with sentry-sdk,pipeline,sources # run sources tests in load against configured destinations - run: poetry run pytest tests/load/sources diff --git a/.github/workflows/test_sqlalchemy_destinations.yml b/.github/workflows/test_sqlalchemy_destinations.yml index fc8706e89c..c2572b322d 100644 --- a/.github/workflows/test_sqlalchemy_destinations.yml +++ b/.github/workflows/test_sqlalchemy_destinations.yml @@ -86,7 +86,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E databricks -E parquet -E filesystem -E sqlalchemy -E cli --with sentry-sdk --with pipeline && poetry run pip install mysqlclient && poetry run pip install "sqlalchemy==${{ matrix.sqlalchemy }}" + run: poetry install --no-interaction -E parquet -E filesystem -E sqlalchemy -E cli --with sentry-sdk --with pipeline && poetry run pip install mysqlclient && poetry run pip install "sqlalchemy==${{ matrix.sqlalchemy }}" - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml From 295617251fb82e4b5b8468707a444c79fb8c6637 Mon Sep 17 00:00:00 2001 From: Julian Alves <28436330+donotpush@users.noreply.github.com> Date: Wed, 11 Dec 2024 10:37:23 +0100 Subject: [PATCH 17/32] Revert "fix: databricks dependency CI/CD" This reverts commit 31e5e1f90afcc8c91462c2b963b5f05005a535c1. --- .github/workflows/test_common.yml | 2 +- .github/workflows/test_doc_snippets.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_common.yml b/.github/workflows/test_common.yml index 1fbaa0d6b2..359ed43095 100644 --- a/.github/workflows/test_common.yml +++ b/.github/workflows/test_common.yml @@ -90,7 +90,7 @@ jobs: # key: venv-${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }} - name: Install dependencies - run: poetry install --no-interaction -E databricks --with sentry-sdk + run: poetry install --no-interaction --with sentry-sdk - run: | poetry run pytest tests/common tests/normalize tests/reflection tests/load/test_dummy_client.py tests/extract/test_extract.py tests/extract/test_sources.py tests/pipeline/test_pipeline_state.py diff --git a/.github/workflows/test_doc_snippets.yml b/.github/workflows/test_doc_snippets.yml index 552f62fea5..ae06a72df9 100644 --- a/.github/workflows/test_doc_snippets.yml +++ b/.github/workflows/test_doc_snippets.yml @@ -91,7 +91,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E databricks -E duckdb -E weaviate -E parquet -E qdrant -E bigquery -E postgres -E lancedb --with docs,sentry-sdk --without airflow -E s3 + run: poetry install --no-interaction -E duckdb -E weaviate -E parquet -E qdrant -E bigquery -E postgres -E lancedb --with docs,sentry-sdk --without airflow -E s3 - name: create secrets.toml for examples run: pwd && echo "$DLT_SECRETS_TOML" > docs/examples/.dlt/secrets.toml From 77b47148a48ada8f0f79c54468dca60929f27392 Mon Sep 17 00:00:00 2001 From: Julian Alves <28436330+donotpush@users.noreply.github.com> Date: Wed, 11 Dec 2024 11:57:38 +0100 Subject: 
[PATCH 18/32] refactor databricks auth --- .../impl/databricks/configuration.py | 26 +++++-------------- .../impl/databricks/sql_client.py | 17 ++++++++++++ .../test_databricks_configuration.py | 12 --------- .../load/pipeline/test_databricks_pipeline.py | 9 +++++-- 4 files changed, 30 insertions(+), 34 deletions(-) diff --git a/dlt/destinations/impl/databricks/configuration.py b/dlt/destinations/impl/databricks/configuration.py index 2daf639473..939c95fb11 100644 --- a/dlt/destinations/impl/databricks/configuration.py +++ b/dlt/destinations/impl/databricks/configuration.py @@ -1,13 +1,11 @@ import dataclasses -from typing import Callable, ClassVar, Final, Optional, Any, Dict, List, cast +from typing import ClassVar, Final, Optional, Any, Dict, List from dlt.common.typing import TSecretStrValue from dlt.common.configuration.specs.base_configuration import CredentialsConfiguration, configspec from dlt.common.destination.reference import DestinationClientDwhWithStagingConfiguration from dlt.common.configuration.exceptions import ConfigurationValueError -from databricks.sdk.core import Config, oauth_service_principal - DATABRICKS_APPLICATION_ID = "dltHub_dlt" @@ -17,7 +15,6 @@ class DatabricksCredentials(CredentialsConfiguration): server_hostname: str = None http_path: str = None access_token: Optional[TSecretStrValue] = None - auth_type: Optional[str] = None client_id: Optional[TSecretStrValue] = None client_secret: Optional[TSecretStrValue] = None http_headers: Optional[Dict[str, str]] = None @@ -32,24 +29,18 @@ class DatabricksCredentials(CredentialsConfiguration): "server_hostname", "http_path", "catalog", + "client_id", + "client_secret", "access_token", ] def on_resolved(self) -> None: - if self.auth_type and self.auth_type != "databricks-oauth": + if not ((self.client_id and self.client_secret) or self.access_token): raise ConfigurationValueError( - "Invalid auth_type detected: only 'databricks-oauth' is supported. " - "If you are using an access token for authentication, omit the 'auth_type' field." + "No valid authentication method detected. Provide either 'client_id' and" + " 'client_secret' for OAuth, or 'access_token' for token-based authentication." 
) - def _get_oauth_credentials(self) -> Optional[Callable[[], Dict[str, str]]]: - config = Config( - host=f"https://{self.server_hostname}", - client_id=self.client_id, - client_secret=self.client_secret, - ) - return cast(Callable[[], Dict[str, str]], oauth_service_principal(config)) - def to_connector_params(self) -> Dict[str, Any]: conn_params = dict( catalog=self.catalog, @@ -60,11 +51,6 @@ def to_connector_params(self) -> Dict[str, Any]: **(self.connection_parameters or {}), ) - if self.auth_type == "databricks-oauth" and self.client_id and self.client_secret: - conn_params["credentials_provider"] = self._get_oauth_credentials - else: - conn_params["access_token"] = self.access_token - if self.user_agent_entry: conn_params["_user_agent_entry"] = ( conn_params.get("_user_agent_entry") or self.user_agent_entry diff --git a/dlt/destinations/impl/databricks/sql_client.py b/dlt/destinations/impl/databricks/sql_client.py index 0a1e898928..7658a775d7 100644 --- a/dlt/destinations/impl/databricks/sql_client.py +++ b/dlt/destinations/impl/databricks/sql_client.py @@ -11,8 +11,11 @@ Tuple, Union, Dict, + cast, + Callable, ) +from databricks.sdk.core import Config, oauth_service_principal from databricks import sql as databricks_lib # type: ignore[attr-defined] from databricks.sql.client import ( @@ -73,8 +76,22 @@ def __init__( self._conn: DatabricksSqlConnection = None self.credentials = credentials + def _get_oauth_credentials(self) -> Optional[Callable[[], Dict[str, str]]]: + config = Config( + host=f"https://{self.credentials.server_hostname}", + client_id=self.credentials.client_id, + client_secret=self.credentials.client_secret, + ) + return cast(Callable[[], Dict[str, str]], oauth_service_principal(config)) + def open_connection(self) -> DatabricksSqlConnection: conn_params = self.credentials.to_connector_params() + + if self.credentials.client_id and self.credentials.client_secret: + conn_params["credentials_provider"] = self._get_oauth_credentials + else: + conn_params["access_token"] = self.credentials.access_token + self._conn = databricks_lib.connect( **conn_params, schema=self.dataset_name, use_inline_params="silent" ) diff --git a/tests/load/databricks/test_databricks_configuration.py b/tests/load/databricks/test_databricks_configuration.py index 19ec5c5e73..e27da4db2a 100644 --- a/tests/load/databricks/test_databricks_configuration.py +++ b/tests/load/databricks/test_databricks_configuration.py @@ -62,18 +62,6 @@ def test_databricks_configuration() -> None: assert config.is_staging_external_location is None -def test_databricks_oauth_config() -> None: - from dlt.common.configuration.exceptions import ConfigurationValueError - - os.environ["DESTINATION__DATABRICKS__CREDENTIALS__AUTH_TYPE"] = "oauth" - with pytest.raises(ConfigurationValueError): - bricks = databricks() - bricks.configuration(None, accept_partial=True) - - os.environ["DESTINATION__DATABRICKS__CREDENTIALS__AUTH_TYPE"] = "databricks-oauth" - bricks.configuration(None, accept_partial=True) - - def test_databricks_abfss_converter() -> None: with pytest.raises(TerminalValueError): DatabricksLoadJob.ensure_databricks_abfss_url("az://dlt-ci-test-bucket") diff --git a/tests/load/pipeline/test_databricks_pipeline.py b/tests/load/pipeline/test_databricks_pipeline.py index 488c7d05a8..8a227fd547 100644 --- a/tests/load/pipeline/test_databricks_pipeline.py +++ b/tests/load/pipeline/test_databricks_pipeline.py @@ -152,11 +152,16 @@ def test_databricks_gcs_external_location(destination_config: DestinationTestCon ids=lambda x: 
x.name, ) def test_databricks_oauth(destination_config: DestinationTestConfiguration) -> None: - os.environ["DESTINATION__DATABRICKS__CREDENTIALS__AUTH_TYPE"] = "databricks-oauth" + from dlt.destinations import databricks + + bricks = databricks() + config = bricks.configuration(None, accept_partial=True) + + assert config.credentials.client_id and config.credentials.client_secret dataset_name = "test_databricks_oauth" + uniq_id() pipeline = destination_config.setup_pipeline( - "test_databricks_oauth", dataset_name=dataset_name, destination="databricks" + "test_databricks_oauth", dataset_name=dataset_name, destination=bricks ) info = pipeline.run([1, 2, 3], table_name="digits", **destination_config.run_kwargs) From edc1e2de5e79a58d85cb0955194dff9ab36f91e5 Mon Sep 17 00:00:00 2001 From: Julian Alves <28436330+donotpush@users.noreply.github.com> Date: Wed, 11 Dec 2024 12:04:58 +0100 Subject: [PATCH 19/32] update databrick docs --- docs/website/docs/dlt-ecosystem/destinations/databricks.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/databricks.md b/docs/website/docs/dlt-ecosystem/destinations/databricks.md index c167b35376..1b1a4ab28e 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/databricks.md +++ b/docs/website/docs/dlt-ecosystem/destinations/databricks.md @@ -99,7 +99,7 @@ Follow the instructions in the Databricks documentation to create a service prin [Authenticate access to Databricks using OAuth M2M](https://docs.databricks.com/en/dev-tools/auth/oauth-m2m.html) -Once you have the service principal credentials, update your secrets.toml as shown bellow. The auth_type must be set to "databricks-oauth" to enable this authentication method. +Once you have the service principal credentials, update your secrets.toml as shown bellow. ### Configuration @@ -109,13 +109,10 @@ Add the following fields to your `.dlt/secrets.toml` file: server_hostname = "MY_DATABRICKS.azuredatabricks.net" http_path = "/sql/1.0/warehouses/12345" catalog = "my_catalog" -auth_type = "databricks-oauth" client_id = "XXX" client_secret = "XXX" ``` - - ## Loader setup guide **1. Initialize a project with a pipeline that loads to Databricks by running** From 7db988088b20bb6600246808e9d995a74a553535 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Zawadzki?= Date: Wed, 4 Dec 2024 17:35:34 +0100 Subject: [PATCH 20/32] Docs: add missing mention of the required `endpoint_url` config (#2120) --- docs/website/docs/dlt-ecosystem/destinations/clickhouse.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md b/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md index 3bd1ae8e15..40ee5d71e8 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md +++ b/docs/website/docs/dlt-ecosystem/destinations/clickhouse.md @@ -229,8 +229,7 @@ To set up GCS staging with HMAC authentication in dlt: 1. Create HMAC keys for your GCS service account by following the [Google Cloud guide](https://cloud.google.com/storage/docs/authentication/managing-hmackeys#create). -2. Configure the HMAC keys (`aws_access_key_id` and `aws_secret_access_key`) in your dlt project's ClickHouse destination settings in `config.toml`, similar to how you would configure AWS S3 - credentials: +2. 
Configure the HMAC keys (`aws_access_key_id` and `aws_secret_access_key`) as well as `endpoint_url` in your dlt project's ClickHouse destination settings in `config.toml`, similar to how you would configure AWS S3 credentials: ```toml [destination.filesystem] From c9b74de42d17f5abce50121f5ea115cf10f0cdca Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Wed, 4 Dec 2024 17:37:15 +0100 Subject: [PATCH 21/32] Fix formatting in dlt.common.libs.pyarrow (#2102) From 8a650b198694e31a952cd6c3f8153579de8fdb2a Mon Sep 17 00:00:00 2001 From: rudolfix Date: Tue, 10 Dec 2024 21:41:17 +0100 Subject: [PATCH 22/32] checks notebook presence before finding userdata (#2117) --- dlt/common/configuration/providers/toml.py | 6 ++++++ .../configuration/test_toml_provider.py | 21 ++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/dlt/common/configuration/providers/toml.py b/dlt/common/configuration/providers/toml.py index 3636565fae..e586fef225 100644 --- a/dlt/common/configuration/providers/toml.py +++ b/dlt/common/configuration/providers/toml.py @@ -124,6 +124,12 @@ def _read_google_colab_secrets(self, name: str, file_name: str) -> tomlkit.TOMLD """Try to load the toml from google colab userdata object""" try: from google.colab import userdata + from dlt.common.runtime.exec_info import is_notebook + + # make sure we work in interactive mode (get_ipython() is available) + # when dlt cli is run, userdata is available but without a kernel + if not is_notebook(): + return None try: return tomlkit.loads(userdata.get(file_name)) diff --git a/tests/common/configuration/test_toml_provider.py b/tests/common/configuration/test_toml_provider.py index 481c21b7bb..9538849976 100644 --- a/tests/common/configuration/test_toml_provider.py +++ b/tests/common/configuration/test_toml_provider.py @@ -4,6 +4,7 @@ import yaml from typing import Any, Dict, Type import datetime # noqa: I251 +from unittest.mock import Mock import dlt from dlt.common import pendulum, json @@ -538,11 +539,28 @@ def loader() -> Dict[str, Any]: def test_colab_toml() -> None: + import builtins + # use a path without any settings files try: sys.path.append("tests/common/cases/modules") - # secrets are in user data + + # ipython not present provider: SettingsTomlProvider = SecretsTomlProvider("tests/common/null", global_dir=None) + assert provider.is_empty + + get_ipython_m = Mock() + get_ipython_m.return_value = "google.colab.Shell" + # make it available to all modules + builtins.get_ipython = get_ipython_m # type: ignore[attr-defined] + # test mock + assert get_ipython() == "google.colab.Shell" # type: ignore[name-defined] # noqa + from dlt.common.runtime.exec_info import is_notebook + + assert is_notebook() + + # secrets are in user data + provider = SecretsTomlProvider("tests/common/null", global_dir=None) assert provider.to_toml() == 'api_key="api"' # config is not in userdata provider = ConfigTomlProvider("tests/common/null", "unknown") @@ -551,4 +569,5 @@ def test_colab_toml() -> None: provider = SecretsTomlProvider("tests/common/cases/configuration/.dlt", global_dir=None) assert provider.get_value("secret_value", str, None) == ("2137", "secret_value") finally: + delattr(builtins, "get_ipython") sys.path.pop() From 2a749a50207f806cee5b4fd0481fd3bde613cf6a Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Tue, 10 Dec 2024 17:35:22 -0500 Subject: [PATCH 23/32] Add open/closed range arguments for incremental (#1991) * Add open/closed range arguments for incremental * Docs for incremental range args * Docstring * Typo 
* Ensure deduplication is disabled when range_start=='open' * Cache transformer settings --- dlt/common/incremental/typing.py | 4 + dlt/extract/incremental/__init__.py | 60 +++-- dlt/extract/incremental/transform.py | 75 ++++-- dlt/sources/sql_database/helpers.py | 12 +- .../verified-sources/sql_database/advanced.md | 49 +++- .../docs/general-usage/incremental-loading.md | 5 +- tests/extract/test_incremental.py | 111 +++++++- .../load/sources/sql_database/test_helpers.py | 237 ++++++++++++------ .../sql_database/test_sql_database_source.py | 17 +- 9 files changed, 434 insertions(+), 136 deletions(-) diff --git a/dlt/common/incremental/typing.py b/dlt/common/incremental/typing.py index 460e2f234b..2ca981bff0 100644 --- a/dlt/common/incremental/typing.py +++ b/dlt/common/incremental/typing.py @@ -8,6 +8,8 @@ LastValueFunc = Callable[[Sequence[TCursorValue]], Any] OnCursorValueMissing = Literal["raise", "include", "exclude"] +TIncrementalRange = Literal["open", "closed"] + class IncrementalColumnState(TypedDict): initial_value: Optional[Any] @@ -26,3 +28,5 @@ class IncrementalArgs(TypedDict, total=False): allow_external_schedulers: Optional[bool] lag: Optional[Union[float, int]] on_cursor_value_missing: Optional[OnCursorValueMissing] + range_start: Optional[TIncrementalRange] + range_end: Optional[TIncrementalRange] diff --git a/dlt/extract/incremental/__init__.py b/dlt/extract/incremental/__init__.py index 28d33bb71f..5e7bae49c6 100644 --- a/dlt/extract/incremental/__init__.py +++ b/dlt/extract/incremental/__init__.py @@ -42,6 +42,7 @@ LastValueFunc, OnCursorValueMissing, IncrementalArgs, + TIncrementalRange, ) from dlt.extract.items import SupportsPipe, TTableHintTemplate, ItemTransform from dlt.extract.incremental.transform import ( @@ -104,6 +105,11 @@ class Incremental(ItemTransform[TDataItem], BaseConfiguration, Generic[TCursorVa Note that if logical "end date" is present then also "end_value" will be set which means that resource state is not used and exactly this range of date will be loaded on_cursor_value_missing: Specify what happens when the cursor_path does not exist in a record or a record has `None` at the cursor_path: raise, include, exclude lag: Optional value used to define a lag or attribution window. For datetime cursors, this is interpreted as seconds. For other types, it uses the + or - operator depending on the last_value_func. + range_start: Decide whether the incremental filtering range is `open` or `closed` on the start value side. Default is `closed`. + Setting this to `open` means that items with the same cursor value as the last value from the previous run (or `initial_value`) are excluded from the result. + The `open` range disables deduplication logic so it can serve as an optimization when you know cursors don't overlap between pipeline runs. + range_end: Decide whether the incremental filtering range is `open` or `closed` on the end value side. Default is `open` (exact `end_value` is excluded). + Setting this to `closed` means that items with the exact same cursor value as the `end_value` are included in the result. 
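To make the new `range_start`/`range_end` docstring arguments concrete, here is a minimal sketch of a fixed backfill window that keeps rows whose cursor equals `end_value` exactly. The resource name and stand-in rows are illustrative, not part of this patch, and the resource would be passed to `pipeline.run()` as usual:

```py
import dlt
from dlt.common import pendulum

@dlt.resource
def august_issues(
    created_at=dlt.sources.incremental(
        "created_at",
        initial_value=pendulum.datetime(2024, 8, 1),
        end_value=pendulum.datetime(2024, 8, 31),
        range_end="closed",  # rows with created_at exactly equal to end_value are kept
    )
):
    # stand-in rows; a real resource would page through an API or a table
    yield [
        {"id": 1, "created_at": pendulum.datetime(2024, 8, 15)},
        {"id": 2, "created_at": pendulum.datetime(2024, 8, 31)},  # included because range_end="closed"
        {"id": 3, "created_at": pendulum.datetime(2024, 9, 1)},   # above end_value, filtered out
    ]
```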
""" # this is config/dataclass so declare members @@ -116,6 +122,8 @@ class Incremental(ItemTransform[TDataItem], BaseConfiguration, Generic[TCursorVa on_cursor_value_missing: OnCursorValueMissing = "raise" lag: Optional[float] = None duplicate_cursor_warning_threshold: ClassVar[int] = 200 + range_start: TIncrementalRange = "closed" + range_end: TIncrementalRange = "open" # incremental acting as empty EMPTY: ClassVar["Incremental[Any]"] = None @@ -132,6 +140,8 @@ def __init__( allow_external_schedulers: bool = False, on_cursor_value_missing: OnCursorValueMissing = "raise", lag: Optional[float] = None, + range_start: TIncrementalRange = "closed", + range_end: TIncrementalRange = "open", ) -> None: # make sure that path is valid if cursor_path: @@ -174,9 +184,11 @@ def __init__( self.start_out_of_range: bool = False """Becomes true on the first item that is out of range of `start_value`. I.e. when using `max` this is a value that is lower than `start_value`""" - self._transformers: Dict[str, IncrementalTransform] = {} + self._transformers: Dict[Type[IncrementalTransform], IncrementalTransform] = {} self._bound_pipe: SupportsPipe = None """Bound pipe""" + self.range_start = range_start + self.range_end = range_end @property def primary_key(self) -> Optional[TTableHintTemplate[TColumnNames]]: @@ -190,22 +202,6 @@ def primary_key(self, value: str) -> None: for transform in self._transformers.values(): transform.primary_key = value - def _make_transforms(self) -> None: - types = [("arrow", ArrowIncremental), ("json", JsonIncremental)] - for dt, kls in types: - self._transformers[dt] = kls( - self.resource_name, - self.cursor_path, - self.initial_value, - self.start_value, - self.end_value, - self.last_value_func, - self._primary_key, - set(self._cached_state["unique_hashes"]), - self.on_cursor_value_missing, - self.lag, - ) - @classmethod def from_existing_state( cls, resource_name: str, cursor_path: str @@ -489,7 +485,8 @@ def bind(self, pipe: SupportsPipe) -> "Incremental[TCursorValue]": ) # cache state self._cached_state = self.get_state() - self._make_transforms() + # Clear transforms so we get new instances + self._transformers.clear() return self def can_close(self) -> bool: @@ -520,15 +517,34 @@ def __str__(self) -> str: f" {self.last_value_func}" ) + def _make_or_get_transformer(self, cls: Type[IncrementalTransform]) -> IncrementalTransform: + if transformer := self._transformers.get(cls): + return transformer + transformer = self._transformers[cls] = cls( + self.resource_name, + self.cursor_path, + self.initial_value, + self.start_value, + self.end_value, + self.last_value_func, + self._primary_key, + set(self._cached_state["unique_hashes"]), + self.on_cursor_value_missing, + self.lag, + self.range_start, + self.range_end, + ) + return transformer + def _get_transformer(self, items: TDataItems) -> IncrementalTransform: # Assume list is all of the same type for item in items if isinstance(items, list) else [items]: if is_arrow_item(item): - return self._transformers["arrow"] + return self._make_or_get_transformer(ArrowIncremental) elif pandas is not None and isinstance(item, pandas.DataFrame): - return self._transformers["arrow"] - return self._transformers["json"] - return self._transformers["json"] + return self._make_or_get_transformer(ArrowIncremental) + return self._make_or_get_transformer(JsonIncremental) + return self._make_or_get_transformer(JsonIncremental) def __call__(self, rows: TDataItems, meta: Any = None) -> Optional[TDataItems]: if rows is None: diff --git 
a/dlt/extract/incremental/transform.py b/dlt/extract/incremental/transform.py index 22b1194b51..1d213e26c2 100644 --- a/dlt/extract/incremental/transform.py +++ b/dlt/extract/incremental/transform.py @@ -13,7 +13,12 @@ IncrementalPrimaryKeyMissing, IncrementalCursorPathHasValueNone, ) -from dlt.common.incremental.typing import TCursorValue, LastValueFunc, OnCursorValueMissing +from dlt.common.incremental.typing import ( + TCursorValue, + LastValueFunc, + OnCursorValueMissing, + TIncrementalRange, +) from dlt.extract.utils import resolve_column_value from dlt.extract.items import TTableHintTemplate @@ -57,6 +62,8 @@ def __init__( unique_hashes: Set[str], on_cursor_value_missing: OnCursorValueMissing = "raise", lag: Optional[float] = None, + range_start: TIncrementalRange = "closed", + range_end: TIncrementalRange = "open", ) -> None: self.resource_name = resource_name self.cursor_path = cursor_path @@ -71,6 +78,9 @@ def __init__( self.start_unique_hashes = set(unique_hashes) self.on_cursor_value_missing = on_cursor_value_missing self.lag = lag + self.range_start = range_start + self.range_end = range_end + # compile jsonpath self._compiled_cursor_path = compile_path(cursor_path) # for simple column name we'll fallback to search in dict @@ -107,6 +117,8 @@ def __call__( def deduplication_disabled(self) -> bool: """Skip deduplication when length of the key is 0 or if lag is applied.""" # disable deduplication if end value is set - state is not saved + if self.range_start == "open": + return True if self.end_value is not None: return True # disable deduplication if lag is applied - destination must deduplicate ranges @@ -191,10 +203,10 @@ def __call__( # Filter end value ranges exclusively, so in case of "max" function we remove values >= end_value if self.end_value is not None: try: - if ( - last_value_func((row_value, self.end_value)) != self.end_value - or last_value_func((row_value,)) == self.end_value - ): + if last_value_func((row_value, self.end_value)) != self.end_value: + return None, False, True + + if self.range_end == "open" and last_value_func((row_value,)) == self.end_value: return None, False, True except Exception as ex: raise IncrementalCursorInvalidCoercion( @@ -221,6 +233,9 @@ def __call__( ) from ex # new_value is "less" or equal to last_value (the actual max) if last_value == new_value: + if self.range_start == "open": + # We only want greater than last_value + return None, False, False # use func to compute row_value into last_value compatible processed_row_value = last_value_func((row_value,)) # skip the record that is not a start_value or new_value: that record was already processed @@ -258,6 +273,31 @@ def __call__( class ArrowIncremental(IncrementalTransform): _dlt_index = "_dlt_index" + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + if self.last_value_func is max: + self.compute = pa.compute.max + self.end_compare = ( + pa.compute.less if self.range_end == "open" else pa.compute.less_equal + ) + self.last_value_compare = ( + pa.compute.greater_equal if self.range_start == "closed" else pa.compute.greater + ) + self.new_value_compare = pa.compute.greater + elif self.last_value_func is min: + self.compute = pa.compute.min + self.end_compare = ( + pa.compute.greater if self.range_end == "open" else pa.compute.greater_equal + ) + self.last_value_compare = ( + pa.compute.less_equal if self.range_start == "closed" else pa.compute.less + ) + self.new_value_compare = pa.compute.less + else: + raise NotImplementedError( + "Only 
min or max last_value_func is supported for arrow tables" + ) + def compute_unique_values(self, item: "TAnyArrowItem", unique_columns: List[str]) -> List[str]: if not unique_columns: return [] @@ -312,28 +352,13 @@ def __call__( if not tbl: # row is None or empty arrow table return tbl, start_out_of_range, end_out_of_range - if self.last_value_func is max: - compute = pa.compute.max - end_compare = pa.compute.less - last_value_compare = pa.compute.greater_equal - new_value_compare = pa.compute.greater - elif self.last_value_func is min: - compute = pa.compute.min - end_compare = pa.compute.greater - last_value_compare = pa.compute.less_equal - new_value_compare = pa.compute.less - else: - raise NotImplementedError( - "Only min or max last_value_func is supported for arrow tables" - ) - # TODO: Json path support. For now assume the cursor_path is a column name cursor_path = self.cursor_path # The new max/min value try: # NOTE: datetimes are always pendulum in UTC - row_value = from_arrow_scalar(compute(tbl[cursor_path])) + row_value = from_arrow_scalar(self.compute(tbl[cursor_path])) cursor_data_type = tbl.schema.field(cursor_path).type row_value_scalar = to_arrow_scalar(row_value, cursor_data_type) except KeyError as e: @@ -364,10 +389,10 @@ def __call__( cursor_data_type, str(ex), ) from ex - tbl = tbl.filter(end_compare(tbl[cursor_path], end_value_scalar)) + tbl = tbl.filter(self.end_compare(tbl[cursor_path], end_value_scalar)) # Is max row value higher than end value? # NOTE: pyarrow bool *always* evaluates to python True. `as_py()` is necessary - end_out_of_range = not end_compare(row_value_scalar, end_value_scalar).as_py() + end_out_of_range = not self.end_compare(row_value_scalar, end_value_scalar).as_py() if self.start_value is not None: try: @@ -383,7 +408,7 @@ def __call__( str(ex), ) from ex # Remove rows lower or equal than the last start value - keep_filter = last_value_compare(tbl[cursor_path], start_value_scalar) + keep_filter = self.last_value_compare(tbl[cursor_path], start_value_scalar) start_out_of_range = bool(pa.compute.any(pa.compute.invert(keep_filter)).as_py()) tbl = tbl.filter(keep_filter) if not self.deduplication_disabled: @@ -407,7 +432,7 @@ def __call__( if ( self.last_value is None - or new_value_compare( + or self.new_value_compare( row_value_scalar, to_arrow_scalar(self.last_value, cursor_data_type) ).as_py() ): # Last value has changed diff --git a/dlt/sources/sql_database/helpers.py b/dlt/sources/sql_database/helpers.py index a8be2a6427..ee38c7dd98 100644 --- a/dlt/sources/sql_database/helpers.py +++ b/dlt/sources/sql_database/helpers.py @@ -94,12 +94,16 @@ def __init__( self.end_value = incremental.end_value self.row_order: TSortOrder = self.incremental.row_order self.on_cursor_value_missing = self.incremental.on_cursor_value_missing + self.range_start = self.incremental.range_start + self.range_end = self.incremental.range_end else: self.cursor_column = None self.last_value = None self.end_value = None self.row_order = None self.on_cursor_value_missing = None + self.range_start = None + self.range_end = None def _make_query(self) -> SelectAny: table = self.table @@ -110,11 +114,11 @@ def _make_query(self) -> SelectAny: # generate where if last_value_func is max: # Query ordered and filtered according to last_value function - filter_op = operator.ge - filter_op_end = operator.lt + filter_op = operator.ge if self.range_start == "closed" else operator.gt + filter_op_end = operator.lt if self.range_end == "open" else operator.le elif last_value_func is min: - 
filter_op = operator.le - filter_op_end = operator.gt + filter_op = operator.le if self.range_start == "closed" else operator.lt + filter_op_end = operator.gt if self.range_end == "open" else operator.ge else: # Custom last_value, load everything and let incremental handle filtering return query # type: ignore[no-any-return] diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md index 6ff3a267d2..c532f6d357 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md @@ -16,7 +16,7 @@ Efficient data management often requires loading only new or updated data from y Incremental loading uses a cursor column (e.g., timestamp or auto-incrementing ID) to load only data newer than a specified initial value, enhancing efficiency by reducing processing time and resource use. Read [here](../../../walkthroughs/sql-incremental-configuration) for more details on incremental loading with `dlt`. -#### How to configure +### How to configure 1. **Choose a cursor column**: Identify a column in your SQL table that can serve as a reliable indicator of new or updated rows. Common choices include timestamp columns or auto-incrementing IDs. 1. **Set an initial value**: Choose a starting value for the cursor to begin loading data. This could be a specific timestamp or ID from which you wish to start loading data. 1. **Deduplication**: When using incremental loading, the system automatically handles the deduplication of rows based on the primary key (if available) or row hash for tables without a primary key. @@ -27,7 +27,7 @@ Incremental loading uses a cursor column (e.g., timestamp or auto-incrementing I If your cursor column name contains special characters (e.g., `$`) you need to escape it when passing it to the `incremental` function. For example, if your cursor column is `example_$column`, you should pass it as `"'example_$column'"` or `'"example_$column"'` to the `incremental` function: `incremental("'example_$column'", initial_value=...)`. ::: -#### Examples +### Examples 1. **Incremental loading with the resource `sql_table`**. @@ -52,7 +52,7 @@ If your cursor column name contains special characters (e.g., `$`) you need to e print(extract_info) ``` - Behind the scene, the loader generates a SQL query filtering rows with `last_modified` values greater than the incremental value. In the first run, this is the initial value (midnight (00:00:00) January 1, 2024). + Behind the scene, the loader generates a SQL query filtering rows with `last_modified` values greater or equal to the incremental value. In the first run, this is the initial value (midnight (00:00:00) January 1, 2024). In subsequent runs, it is the latest value of `last_modified` that `dlt` stores in [state](../../../general-usage/state). 2. **Incremental loading with the source `sql_database`**. @@ -78,6 +78,49 @@ If your cursor column name contains special characters (e.g., `$`) you need to e * `apply_hints` is a powerful method that enables schema modifications after resource creation, like adjusting write disposition and primary keys. You can choose from various tables and use `apply_hints` multiple times to create pipelines with merged, appended, or replaced resources. 
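As a companion to the `apply_hints` note above, a hedged sketch of attaching an incremental hint, including the `range_start` argument introduced in this patch, to one table of the `sql_database` source. Connection credentials are assumed to come from `secrets.toml`, and the table and column names are illustrative:

```py
import dlt
from dlt.sources.sql_database import sql_database

# reflect only the tables we need; credentials are read from secrets.toml
source = sql_database().with_resources("family", "clan")

# apply_hints can be called per resource, and multiple times if needed
source.resources["family"].apply_hints(
    incremental=dlt.sources.incremental("last_modified", range_start="open")
)

pipeline = dlt.pipeline(pipeline_name="sql_database_incremental", destination="duckdb")
print(pipeline.run(source))
```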
::: +### Inclusive and exclusive filtering + +By default the incremental filtering is inclusive on the start value side so that +rows with cursor equal to the last run's cursor are fetched again from the database. + +The SQL query generated looks something like this (assuming `last_value_func` is `max`): + +```sql +SELECT * FROM family +WHERE last_modified >= :start_value +ORDER BY last_modified ASC +``` + +That means some rows overlapping with the previous load are fetched from the database. +Duplicates are then filtered out by dlt using either the primary key or a hash of the row's contents. + +This ensures there are no gaps in the extracted sequence. But it does come with some performance overhead, +both due to the deduplication processing and the cost of fetching redundant records from the database. + +This is not always needed. If you know that your data does not contain overlapping cursor values then you +can optimize extraction by passing `range_start="open"` to incremental. + +This both disables the deduplication process and changes the operator used in the SQL `WHERE` clause from `>=` (greater-or-equal) to `>` (greater than), so that no overlapping rows are fetched. + +E.g. + +```py +table = sql_table( + table='family', + incremental=dlt.sources.incremental( + 'last_modified', # Cursor column name + initial_value=pendulum.DateTime(2024, 1, 1, 0, 0, 0), # Initial cursor value + range_start="open", # exclude the start value + ) +) +``` + +It's a good option if: + +* The cursor is an auto incrementing ID +* The cursor is a high precision timestamp and two records are never created at exactly the same time +* Your pipeline runs are timed in such a way that new data is not generated during the load + ## Parallelized extraction You can extract each table in a separate thread (no multiprocessing at this point). This will decrease loading time if your queries take time to execute or your network latency/speed is low. To enable this, declare your sources/resources as follows: diff --git a/docs/website/docs/general-usage/incremental-loading.md b/docs/website/docs/general-usage/incremental-loading.md index 3f452f0d16..5008795ed4 100644 --- a/docs/website/docs/general-usage/incremental-loading.md +++ b/docs/website/docs/general-usage/incremental-loading.md @@ -693,7 +693,7 @@ august_issues = repo_issues( ... ``` -Note that dlt's incremental filtering considers the ranges half-closed. `initial_value` is inclusive, `end_value` is exclusive, so chaining ranges like above works without overlaps. +Note that dlt's incremental filtering considers the ranges half-closed. `initial_value` is inclusive, `end_value` is exclusive, so chaining ranges like above works without overlaps. This behaviour can be changed with the `range_start` (default `"closed"`) and `range_end` (default `"open"`) arguments. ### Declare row order to not request unnecessary data @@ -793,6 +793,9 @@ def some_data(last_timestamp=dlt.sources.incremental("item.ts", primary_key=())) yield {"delta": i, "item": {"ts": pendulum.now().timestamp()}} ``` +This deduplication process is always enabled when `range_start` is set to `"closed"` (default). +When you pass `range_start="open"` no deduplication is done as it is not needed as rows with the previous cursor value are excluded. This can be a useful optimization to avoid the performance overhead of deduplication if the cursor field is guaranteed to be unique. 
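A short, self-contained sketch of the `range_start="open"` behaviour described above, assuming the `duckdb` extra is installed; the data is a stand-in:

```py
import dlt

@dlt.resource
def events(
    updated_at=dlt.sources.incremental(
        "updated_at",
        initial_value=0,
        range_start="open",  # only rows strictly greater than the last cursor are kept
    )
):
    # stand-in data; in practice this would query an API or a database
    yield [{"id": i, "updated_at": i} for i in range(1, 6)]

pipeline = dlt.pipeline(pipeline_name="open_range_demo", destination="duckdb")
print(pipeline.run(events()))
# on a subsequent run only rows with updated_at > 5 would be extracted,
# and no deduplication hashes are stored for this resource
```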
+ ### Using `dlt.sources.incremental` with dynamically created resources When resources are [created dynamically](source.md#create-resources-dynamically), it is possible to use the `dlt.sources.incremental` definition as well. diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index 725872b621..3ebc9d1201 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -5,7 +5,7 @@ from datetime import datetime, date # noqa: I251 from itertools import chain, count from time import sleep -from typing import Any, Optional, Literal, Sequence, Dict +from typing import Any, Optional, Literal, Sequence, Dict, Iterable from unittest import mock import duckdb @@ -1522,6 +1522,7 @@ def some_data(last_timestamp=dlt.sources.incremental("ts", primary_key=())): @pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) def test_apply_hints_incremental(item_type: TestDataItemFormat) -> None: + os.environ["COMPLETED_PROB"] = "1.0" # make it complete immediately p = dlt.pipeline(pipeline_name=uniq_id(), destination="dummy") data = [{"created_at": 1}, {"created_at": 2}, {"created_at": 3}] source_items = data_to_item_format(item_type, data) @@ -3851,3 +3852,111 @@ def some_data(): for col in table_schema["columns"].values(): assert "incremental" not in col + + +@pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) +@pytest.mark.parametrize("last_value_func", [min, max]) +def test_start_range_open(item_type: TestDataItemFormat, last_value_func: Any) -> None: + data_range: Iterable[int] = range(1, 12) + if last_value_func == max: + initial_value = 5 + # Only items higher than inital extracted + expected_items = list(range(6, 12)) + order_dir = "ASC" + elif last_value_func == min: + data_range = reversed(data_range) # type: ignore[call-overload] + initial_value = 5 + # Only items lower than inital extracted + expected_items = list(reversed(range(1, 5))) + order_dir = "DESC" + + @dlt.resource + def some_data( + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental( + "updated_at", + initial_value=initial_value, + range_start="open", + last_value_func=last_value_func, + ), + ) -> Any: + data = [{"updated_at": i} for i in data_range] + yield data_to_item_format(item_type, data) + + pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination="duckdb") + pipeline.run(some_data()) + + with pipeline.sql_client() as client: + items = [ + row[0] + for row in client.execute_sql( + f"SELECT updated_at FROM some_data ORDER BY updated_at {order_dir}" + ) + ] + + assert items == expected_items + + +@pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) +def test_start_range_open_no_deduplication(item_type: TestDataItemFormat) -> None: + @dlt.source + def dummy(): + @dlt.resource + def some_data( + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental( + "updated_at", + range_start="open", + ) + ): + yield [{"updated_at": i} for i in range(3)] + + yield some_data + + pipeline = dlt.pipeline(pipeline_name=uniq_id()) + pipeline.extract(dummy()) + + state = pipeline.state["sources"]["dummy"]["resources"]["some_data"]["incremental"][ + "updated_at" + ] + + # No unique values should be computed + assert state["unique_hashes"] == [] + + +@pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) +@pytest.mark.parametrize("last_value_func", [min, max]) +def test_end_range_closed(item_type: TestDataItemFormat, last_value_func: Any) -> None: + values = [5, 10] + expected_items = list(range(5, 11)) + if 
last_value_func == max: + order_dir = "ASC" + elif last_value_func == min: + values = list(reversed(values)) + expected_items = list(reversed(expected_items)) + order_dir = "DESC" + + @dlt.resource + def some_data( + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental( + "updated_at", + initial_value=values[0], + end_value=values[1], + range_end="closed", + last_value_func=last_value_func, + ), + ) -> Any: + data = [{"updated_at": i} for i in range(1, 12)] + yield data_to_item_format(item_type, data) + + pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination="duckdb") + pipeline.run(some_data()) + + with pipeline.sql_client() as client: + items = [ + row[0] + for row in client.execute_sql( + f"SELECT updated_at FROM some_data ORDER BY updated_at {order_dir}" + ) + ] + + # Includes values 5-10 inclusive + assert items == expected_items diff --git a/tests/load/sources/sql_database/test_helpers.py b/tests/load/sources/sql_database/test_helpers.py index def5430146..43da9c955f 100644 --- a/tests/load/sources/sql_database/test_helpers.py +++ b/tests/load/sources/sql_database/test_helpers.py @@ -1,3 +1,6 @@ +from typing import Callable, Any, TYPE_CHECKING +from dataclasses import dataclass + import pytest import dlt @@ -14,6 +17,18 @@ pytest.skip("Tests require sql alchemy", allow_module_level=True) +@dataclass +class MockIncremental: + last_value: Any + last_value_func: Callable[[Any], Any] + cursor_path: str + row_order: str = None + end_value: Any = None + on_cursor_value_missing: str = "raise" + range_start: str = "closed" + range_end: str = "open" + + @pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow", "pandas", "connectorx"]) def test_cursor_or_unique_column_not_in_table( sql_source_db: SQLAlchemySourceDB, backend: TableBackend @@ -36,13 +51,12 @@ def test_make_query_incremental_max( ) -> None: """Verify query is generated according to incremental settings""" - class MockIncremental: - last_value = dlt.common.pendulum.now() - last_value_func = max - cursor_path = "created_at" - row_order = "asc" - end_value = None - on_cursor_value_missing = "raise" + incremental = MockIncremental( + last_value=dlt.common.pendulum.now(), + last_value_func=max, + cursor_path="created_at", + row_order="asc", + ) table = sql_source_db.get_table("chat_message") loader = TableLoader( @@ -50,14 +64,14 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) query = loader.make_query() expected = ( table.select() .order_by(table.c.created_at.asc()) - .where(table.c.created_at >= MockIncremental.last_value) + .where(table.c.created_at >= incremental.last_value) ) assert query.compare(expected) @@ -67,13 +81,14 @@ class MockIncremental: def test_make_query_incremental_min( sql_source_db: SQLAlchemySourceDB, backend: TableBackend ) -> None: - class MockIncremental: - last_value = dlt.common.pendulum.now() - last_value_func = min - cursor_path = "created_at" - row_order = "desc" - end_value = None - on_cursor_value_missing = "raise" + incremental = MockIncremental( + last_value=dlt.common.pendulum.now(), + last_value_func=min, + cursor_path="created_at", + row_order="desc", + end_value=None, + on_cursor_value_missing="raise", + ) table = sql_source_db.get_table("chat_message") loader = TableLoader( @@ -81,14 +96,14 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # 
type: ignore[arg-type] ) query = loader.make_query() expected = ( table.select() .order_by(table.c.created_at.asc()) # `min` func swaps order - .where(table.c.created_at <= MockIncremental.last_value) + .where(table.c.created_at <= incremental.last_value) ) assert query.compare(expected) @@ -103,13 +118,14 @@ def test_make_query_incremental_on_cursor_value_missing_set( with_end_value: bool, cursor_value_missing: str, ) -> None: - class MockIncremental: - last_value = dlt.common.pendulum.now() - last_value_func = max - cursor_path = "created_at" - row_order = "asc" - end_value = None if not with_end_value else dlt.common.pendulum.now().add(hours=1) - on_cursor_value_missing = cursor_value_missing + incremental = MockIncremental( + last_value=dlt.common.pendulum.now(), + last_value_func=max, + cursor_path="created_at", + row_order="asc", + end_value=None if not with_end_value else dlt.common.pendulum.now().add(hours=1), + on_cursor_value_missing=cursor_value_missing, + ) table = sql_source_db.get_table("chat_message") loader = TableLoader( @@ -117,7 +133,7 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) query = loader.make_query() @@ -131,14 +147,14 @@ class MockIncremental: if with_end_value: where_clause = operator( sa.and_( - table.c.created_at >= MockIncremental.last_value, - table.c.created_at < MockIncremental.end_value, + table.c.created_at >= incremental.last_value, + table.c.created_at < incremental.end_value, ), missing_cond, ) else: where_clause = operator( - table.c.created_at >= MockIncremental.last_value, + table.c.created_at >= incremental.last_value, missing_cond, ) expected = table.select().order_by(table.c.created_at.asc()).where(where_clause) @@ -152,13 +168,14 @@ def test_make_query_incremental_on_cursor_value_missing_no_last_value( backend: TableBackend, cursor_value_missing: str, ) -> None: - class MockIncremental: - last_value = None - last_value_func = max - cursor_path = "created_at" - row_order = "asc" - end_value = None - on_cursor_value_missing = cursor_value_missing + incremental = MockIncremental( + last_value=None, + last_value_func=max, + cursor_path="created_at", + row_order="asc", + end_value=None, + on_cursor_value_missing=cursor_value_missing, + ) table = sql_source_db.get_table("chat_message") loader = TableLoader( @@ -166,7 +183,7 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) query = loader.make_query() @@ -189,13 +206,14 @@ def test_make_query_incremental_end_value( ) -> None: now = dlt.common.pendulum.now() - class MockIncremental: - last_value = now - last_value_func = min - cursor_path = "created_at" - end_value = now.add(hours=1) - row_order = None - on_cursor_value_missing = "raise" + incremental = MockIncremental( + last_value=now, + last_value_func=min, + cursor_path="created_at", + end_value=now.add(hours=1), + row_order=None, + on_cursor_value_missing="raise", + ) table = sql_source_db.get_table("chat_message") loader = TableLoader( @@ -203,14 +221,14 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) query = loader.make_query() expected = table.select().where( sa.and_( - table.c.created_at <= MockIncremental.last_value, - table.c.created_at > 
MockIncremental.end_value, + table.c.created_at <= incremental.last_value, + table.c.created_at > incremental.end_value, ) ) @@ -221,13 +239,14 @@ class MockIncremental: def test_make_query_incremental_any_fun( sql_source_db: SQLAlchemySourceDB, backend: TableBackend ) -> None: - class MockIncremental: - last_value = dlt.common.pendulum.now() - last_value_func = lambda x: x[-1] - cursor_path = "created_at" - row_order = "asc" - end_value = dlt.common.pendulum.now() - on_cursor_value_missing = "raise" + incremental = MockIncremental( + last_value=dlt.common.pendulum.now(), + last_value_func=lambda x: x[-1], + cursor_path="created_at", + row_order="asc", + end_value=dlt.common.pendulum.now(), + on_cursor_value_missing="raise", + ) table = sql_source_db.get_table("chat_message") loader = TableLoader( @@ -235,7 +254,7 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) query = loader.make_query() @@ -256,12 +275,11 @@ def test_cursor_path_field_name_with_a_special_chars( if special_field_name not in table.c: table.append_column(sa.Column(special_field_name, sa.String)) - class MockIncremental: - cursor_path = "'id$field'" - last_value = None - end_value = None - row_order = None - on_cursor_value_missing = None + incremental = MockIncremental( + cursor_path="'id$field'", + last_value=None, + last_value_func=max, + ) # Should not raise any exception loader = TableLoader( @@ -269,7 +287,7 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) assert loader.cursor_column == table.c[special_field_name] @@ -281,12 +299,11 @@ def test_cursor_path_multiple_fields( """Test that a cursor_path with multiple fields raises a ValueError.""" table = sql_source_db.get_table("chat_message") - class MockIncremental: - cursor_path = "created_at,updated_at" - last_value = None - end_value = None - row_order = None - on_cursor_value_missing = None + incremental = MockIncremental( + cursor_path="created_at,updated_at", + last_value=None, + last_value_func=max, + ) with pytest.raises(ValueError) as excinfo: TableLoader( @@ -294,7 +311,7 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) assert "must be a simple column name" in str(excinfo.value) @@ -306,12 +323,11 @@ def test_cursor_path_complex_expression( """Test that a complex JSONPath expression in cursor_path raises a ValueError.""" table = sql_source_db.get_table("chat_message") - class MockIncremental: - cursor_path = "$.users[0].id" - last_value = None - end_value = None - row_order = None - on_cursor_value_missing = None + incremental = MockIncremental( + cursor_path="$.users[0].id", + last_value=None, + last_value_func=max, + ) with pytest.raises(ValueError) as excinfo: TableLoader( @@ -319,11 +335,80 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) assert "must be a simple column name" in str(excinfo.value) +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow", "pandas", "connectorx"]) +@pytest.mark.parametrize("last_value_func", [min, max]) +def test_make_query_incremental_range_start_open( + sql_source_db: SQLAlchemySourceDB, backend: 
TableBackend, last_value_func: Callable[[Any], Any] +) -> None: + incremental = MockIncremental( + last_value=dlt.common.pendulum.now(), + last_value_func=last_value_func, + cursor_path="created_at", + end_value=None, + on_cursor_value_missing="raise", + range_start="open", + ) + + table = sql_source_db.get_table("chat_message") + + loader = TableLoader( + sql_source_db.engine, + backend, + table, + table_to_columns(table), + incremental=incremental, # type: ignore[arg-type] + ) + + query = loader.make_query() + expected = table.select() + + if last_value_func == min: + expected = expected.where(table.c.created_at < incremental.last_value) + else: + expected = expected.where(table.c.created_at > incremental.last_value) + + assert query.compare(expected) + + +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow", "pandas", "connectorx"]) +@pytest.mark.parametrize("last_value_func", [min, max]) +def test_make_query_incremental_range_end_closed( + sql_source_db: SQLAlchemySourceDB, backend: TableBackend, last_value_func: Callable[[Any], Any] +) -> None: + incremental = MockIncremental( + last_value=dlt.common.pendulum.now(), + last_value_func=last_value_func, + cursor_path="created_at", + end_value=None, + on_cursor_value_missing="raise", + range_end="closed", + ) + + table = sql_source_db.get_table("chat_message") + loader = TableLoader( + sql_source_db.engine, + backend, + table, + table_to_columns(table), + incremental=incremental, # type: ignore[arg-type] + ) + + query = loader.make_query() + expected = table.select() + + if last_value_func == min: + expected = expected.where(table.c.created_at <= incremental.last_value) + else: + expected = expected.where(table.c.created_at >= incremental.last_value) + + assert query.compare(expected) + + def mock_json_column(field: str) -> TDataItem: """""" import pyarrow as pa diff --git a/tests/load/sources/sql_database/test_sql_database_source.py b/tests/load/sources/sql_database/test_sql_database_source.py index 9079638586..00257471e0 100644 --- a/tests/load/sources/sql_database/test_sql_database_source.py +++ b/tests/load/sources/sql_database/test_sql_database_source.py @@ -13,6 +13,7 @@ from dlt.common.utils import uniq_id from dlt.extract.exceptions import ResourceExtractionError +from dlt.extract.incremental.transform import JsonIncremental, ArrowIncremental from dlt.sources import DltResource from tests.pipeline.utils import ( @@ -831,8 +832,12 @@ def _assert_incremental(item): else: assert _r.incremental.primary_key == ["id"] assert _r.incremental._incremental.primary_key == ["id"] - assert _r.incremental._incremental._transformers["json"].primary_key == ["id"] - assert _r.incremental._incremental._transformers["arrow"].primary_key == ["id"] + assert _r.incremental._incremental._make_or_get_transformer( + JsonIncremental + ).primary_key == ["id"] + assert _r.incremental._incremental._make_or_get_transformer( + ArrowIncremental + ).primary_key == ["id"] return item pipeline = make_pipeline("duckdb") @@ -841,8 +846,12 @@ def _assert_incremental(item): assert resource.incremental.primary_key == ["id"] assert resource.incremental._incremental.primary_key == ["id"] - assert resource.incremental._incremental._transformers["json"].primary_key == ["id"] - assert resource.incremental._incremental._transformers["arrow"].primary_key == ["id"] + assert resource.incremental._incremental._make_or_get_transformer( + JsonIncremental + ).primary_key == ["id"] + assert resource.incremental._incremental._make_or_get_transformer( + ArrowIncremental + 
).primary_key == ["id"] @pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow", "pandas", "connectorx"]) From 631994d81f95163ddde6ebf4734cb2dd7378dbf8 Mon Sep 17 00:00:00 2001 From: David Scharf Date: Tue, 10 Dec 2024 23:44:01 +0100 Subject: [PATCH 24/32] bump semver to minimum version 3.0.0 (#2132) --- poetry.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index 88a5495c6e..8cada982bd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "about-time" @@ -10638,4 +10638,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "fa1e98f02135eb816a2e7fe72974bf7acb09d2a5ba164836dafa03276e017a47" +content-hash = "1bf3deccd929c083b880c1a82be0983430ab49f7ade247b1c5573bb8c70d9ff5" diff --git a/pyproject.toml b/pyproject.toml index c4e3c127d4..5c91b3ec1c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ requests = ">=2.26.0" pendulum = ">=2.1.2" simplejson = ">=3.17.5" PyYAML = ">=5.4.1" -semver = ">=2.13.0" +semver = ">=3.0.0" hexbytes = ">=0.2.2" tzdata = ">=2022.1" tomlkit = ">=0.11.3" From 5ec7386cdb825078967915c08adbc5f257a37630 Mon Sep 17 00:00:00 2001 From: David Scharf Date: Wed, 11 Dec 2024 00:43:32 +0100 Subject: [PATCH 25/32] leverage ibis expression for getting readablerelations (#2046) * add ibis dataset in own class for now * make error clearer * fix some linting and fix broken test * make most destinations work with selecting the right db and catalog, transpiling sql via postgres in some cases and selecting the right dialect in others * add missing motherduck and sqlalchemy mappings * casefold identifiers for ibis wrapper calss * re-organize existing dataset code to prepare ibis relation integration * integrate ibis relation into existing code * re-order tests * fall back to default dataset if table not in schema * make dataset type selectable * add dataset type selection test and fix bug in tests * update docs for ibis expressions use * ensure a bunch of ibis operations continue working * add some more tests and typings * fix typing (with brute force get_attr typing..) 
* move ibis to dependency group * move ibis stuff to helpers * post devel merge, put in change from dataset, update lockfile * add ibis to sqlalchemy tests * improve docs a bit * fix ibis dep group * fix dataset snippets * fix ibis version * add support for column schema in certion query cases --------- Co-authored-by: Marcin Rudolf --- .github/workflows/test_destination_athena.yml | 2 +- .../test_destination_athena_iceberg.yml | 2 +- .../workflows/test_destination_bigquery.yml | 2 +- .../workflows/test_destination_clickhouse.yml | 2 +- .../workflows/test_destination_databricks.yml | 2 +- .github/workflows/test_destination_dremio.yml | 2 +- .../workflows/test_destination_motherduck.yml | 2 +- .github/workflows/test_destination_mssql.yml | 2 +- .../workflows/test_destination_snowflake.yml | 2 +- .../workflows/test_destination_synapse.yml | 2 +- .github/workflows/test_destinations.yml | 2 +- .github/workflows/test_local_destinations.yml | 2 +- .../test_sqlalchemy_destinations.yml | 2 +- dlt/common/destination/reference.py | 10 +- dlt/destinations/dataset.py | 412 ------------------ dlt/destinations/dataset/__init__.py | 19 + dlt/destinations/dataset/dataset.py | 142 ++++++ dlt/destinations/dataset/exceptions.py | 22 + dlt/destinations/dataset/factory.py | 22 + dlt/destinations/dataset/ibis_relation.py | 224 ++++++++++ dlt/destinations/dataset/relation.py | 207 +++++++++ dlt/destinations/dataset/utils.py | 95 ++++ .../impl/sqlalchemy/db_api_client.py | 4 +- dlt/{common/libs => helpers}/ibis.py | 58 ++- dlt/pipeline/pipeline.py | 12 +- .../general-usage/dataset-access/dataset.md | 58 +++ poetry.lock | 105 ++--- pyproject.toml | 7 +- .../test_readable_dbapi_dataset.py | 30 +- tests/load/pipeline/test_duckdb.py | 8 +- tests/load/test_read_interfaces.py | 363 ++++++++++++--- 31 files changed, 1245 insertions(+), 579 deletions(-) delete mode 100644 dlt/destinations/dataset.py create mode 100644 dlt/destinations/dataset/__init__.py create mode 100644 dlt/destinations/dataset/dataset.py create mode 100644 dlt/destinations/dataset/exceptions.py create mode 100644 dlt/destinations/dataset/factory.py create mode 100644 dlt/destinations/dataset/ibis_relation.py create mode 100644 dlt/destinations/dataset/relation.py create mode 100644 dlt/destinations/dataset/utils.py rename dlt/{common/libs => helpers}/ibis.py (74%) diff --git a/.github/workflows/test_destination_athena.yml b/.github/workflows/test_destination_athena.yml index 1169fab0de..03eb7f9434 100644 --- a/.github/workflows/test_destination_athena.yml +++ b/.github/workflows/test_destination_athena.yml @@ -67,7 +67,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_athena_iceberg.yml b/.github/workflows/test_destination_athena_iceberg.yml index 7ccefcc055..3412e789e3 100644 --- a/.github/workflows/test_destination_athena_iceberg.yml +++ b/.github/workflows/test_destination_athena_iceberg.yml @@ -67,7 +67,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline,ibis - name: 
create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_bigquery.yml b/.github/workflows/test_destination_bigquery.yml index 7afc9b8a00..eb8b63f757 100644 --- a/.github/workflows/test_destination_bigquery.yml +++ b/.github/workflows/test_destination_bigquery.yml @@ -66,7 +66,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E bigquery --with providers -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E bigquery --with providers -E parquet --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_clickhouse.yml b/.github/workflows/test_destination_clickhouse.yml index 7f297db971..46464ea462 100644 --- a/.github/workflows/test_destination_clickhouse.yml +++ b/.github/workflows/test_destination_clickhouse.yml @@ -61,7 +61,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E clickhouse --with providers -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E clickhouse --with providers -E parquet --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_databricks.yml b/.github/workflows/test_destination_databricks.yml index 1656fe27f4..c1609de863 100644 --- a/.github/workflows/test_destination_databricks.yml +++ b/.github/workflows/test_destination_databricks.yml @@ -64,7 +64,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E databricks -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E databricks -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_dremio.yml b/.github/workflows/test_destination_dremio.yml index 45c6d17db1..4bc48c54db 100644 --- a/.github/workflows/test_destination_dremio.yml +++ b/.github/workflows/test_destination_dremio.yml @@ -65,7 +65,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis - run: | poetry run pytest tests/load --ignore tests/load/sources diff --git a/.github/workflows/test_destination_motherduck.yml b/.github/workflows/test_destination_motherduck.yml index 0014b17655..db81131266 100644 --- a/.github/workflows/test_destination_motherduck.yml +++ b/.github/workflows/test_destination_motherduck.yml @@ -64,7 +64,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-motherduck - name: Install dependencies - run: poetry install --no-interaction -E motherduck -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry 
install --no-interaction -E motherduck -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_mssql.yml b/.github/workflows/test_destination_mssql.yml index 8b899e7da2..6fdd7a5bc5 100644 --- a/.github/workflows/test_destination_mssql.yml +++ b/.github/workflows/test_destination_mssql.yml @@ -69,7 +69,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E mssql -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E mssql -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_snowflake.yml b/.github/workflows/test_destination_snowflake.yml index a720c479bd..73a2a8f6e7 100644 --- a/.github/workflows/test_destination_snowflake.yml +++ b/.github/workflows/test_destination_snowflake.yml @@ -64,7 +64,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E snowflake -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E snowflake -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_synapse.yml b/.github/workflows/test_destination_synapse.yml index be1b493916..8f6bf1eb29 100644 --- a/.github/workflows/test_destination_synapse.yml +++ b/.github/workflows/test_destination_synapse.yml @@ -67,7 +67,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E synapse -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E synapse -E parquet --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml index 933248d994..cfd0a3bd56 100644 --- a/.github/workflows/test_destinations.yml +++ b/.github/workflows/test_destinations.yml @@ -78,7 +78,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E redshift -E postgis -E postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline -E deltalake + run: poetry install --no-interaction -E redshift -E postgis -E postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline,ibis -E deltalake - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_local_destinations.yml b/.github/workflows/test_local_destinations.yml index 4947a46a3b..6f44e5fd5a 100644 --- a/.github/workflows/test_local_destinations.yml +++ b/.github/workflows/test_local_destinations.yml @@ -95,7 +95,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ 
hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline -E deltalake + run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline,ibis -E deltalake - name: Start SFTP server run: docker compose -f "tests/load/filesystem_sftp/docker-compose.yml" up -d diff --git a/.github/workflows/test_sqlalchemy_destinations.yml b/.github/workflows/test_sqlalchemy_destinations.yml index c2572b322d..1f00373674 100644 --- a/.github/workflows/test_sqlalchemy_destinations.yml +++ b/.github/workflows/test_sqlalchemy_destinations.yml @@ -86,7 +86,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E parquet -E filesystem -E sqlalchemy -E cli --with sentry-sdk --with pipeline && poetry run pip install mysqlclient && poetry run pip install "sqlalchemy==${{ matrix.sqlalchemy }}" + run: poetry install --no-interaction -E parquet -E filesystem -E sqlalchemy -E cli --with sentry-sdk --with pipeline,ibis && poetry run pip install mysqlclient && poetry run pip install "sqlalchemy==${{ matrix.sqlalchemy }}" - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index e27f99cde7..048fe2186f 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -67,7 +67,7 @@ TDestinationConfig = TypeVar("TDestinationConfig", bound="DestinationClientConfiguration") TDestinationClient = TypeVar("TDestinationClient", bound="JobClientBase") TDestinationDwhClient = TypeVar("TDestinationDwhClient", bound="DestinationClientDwhConfiguration") -TDatasetType = Literal["dbapi", "ibis"] +TDatasetType = Literal["auto", "default", "ibis"] DEFAULT_FILE_LAYOUT = "{table_name}/{load_id}.{file_id}.{ext}" @@ -76,7 +76,7 @@ try: from dlt.common.libs.pandas import DataFrame from dlt.common.libs.pyarrow import Table as ArrowTable - from dlt.common.libs.ibis import BaseBackend as IbisBackend + from dlt.helpers.ibis import BaseBackend as IbisBackend except MissingDependencyException: DataFrame = Any ArrowTable = Any @@ -535,7 +535,7 @@ def fetchone(self) -> Optional[Tuple[Any, ...]]: ... # modifying access parameters - def limit(self, limit: int) -> "SupportsReadableRelation": + def limit(self, limit: int, **kwargs: Any) -> "SupportsReadableRelation": """limit the result to 'limit' items""" ... @@ -557,6 +557,10 @@ def __getitem__(self, columns: Union[str, Sequence[str]]) -> "SupportsReadableRe """set which columns will be selected""" ... + def __getattr__(self, attr: str) -> Any: + """get an attribute of the relation""" + ... + def __copy__(self) -> "SupportsReadableRelation": """create a copy of the relation object""" ... 
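The protocol changes to `SupportsReadableRelation` above (keyword arguments on `limit`, the new `__getattr__`) are what let the ibis-backed relation introduced later in this patch stand in for the plain dbapi relation. Below is a minimal usage sketch, not part of the patch, exercising the `dataset()` factory and relation API as added in this series; the duckdb file path, dataset name, table name and column names are illustrative placeholders only.

```py
# Sketch only: exercises the dataset factory and relation API added in this patch.
# Assumes a local "pipeline.duckdb" file containing a dataset "my_dataset" with an
# "items" table -- these names are placeholders, not part of the patch.
import dlt
from dlt.destinations.dataset import dataset

ds = dataset(
    destination=dlt.destinations.duckdb("pipeline.duckdb"),
    dataset_name="my_dataset",
    dataset_type="auto",  # uses the ibis relation when ibis is installed, else falls back to dbapi
)

items = ds["items"]                             # SupportsReadableRelation
subset = items.select("id", "value").limit(10)  # lazy: nothing is executed yet
print(subset.query)                             # SQL rendered for the destination dialect
df = subset.df()                                # executes the query and returns a dataframe

# with the optional ibis dependency group installed, the relation also proxies
# ibis expressions, e.g.:
# filtered = items.filter(items.id > 100).head()
```

Because `dataset_type` defaults to `"auto"`, the same code should run whether or not the optional ibis dependency group is installed; only the expression-style calls in the final comment require it.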
diff --git a/dlt/destinations/dataset.py b/dlt/destinations/dataset.py deleted file mode 100644 index 27a7f5a7af..0000000000 --- a/dlt/destinations/dataset.py +++ /dev/null @@ -1,412 +0,0 @@ -from typing import Any, Generator, Sequence, Union, TYPE_CHECKING, Tuple - -from contextlib import contextmanager - -from dlt import version -from dlt.common.json import json -from dlt.common.exceptions import MissingDependencyException -from dlt.common.destination import AnyDestination -from dlt.common.destination.reference import ( - SupportsReadableRelation, - SupportsReadableDataset, - TDatasetType, - TDestinationReferenceArg, - Destination, - JobClientBase, - WithStateSync, - DestinationClientDwhConfiguration, - DestinationClientStagingConfiguration, - DestinationClientConfiguration, - DestinationClientDwhWithStagingConfiguration, -) - -from dlt.common.schema.typing import TTableSchemaColumns -from dlt.destinations.sql_client import SqlClientBase, WithSqlClient -from dlt.common.schema import Schema -from dlt.common.exceptions import DltException - -if TYPE_CHECKING: - try: - from dlt.common.libs.ibis import BaseBackend as IbisBackend - except MissingDependencyException: - IbisBackend = Any -else: - IbisBackend = Any - - -class DatasetException(DltException): - pass - - -class ReadableRelationHasQueryException(DatasetException): - def __init__(self, attempted_change: str) -> None: - msg = ( - "This readable relation was created with a provided sql query. You cannot change" - f" {attempted_change}. Please change the orignal sql query." - ) - super().__init__(msg) - - -class ReadableRelationUnknownColumnException(DatasetException): - def __init__(self, column_name: str) -> None: - msg = ( - f"The selected column {column_name} is not known in the dlt schema for this releation." 
- ) - super().__init__(msg) - - -class ReadableDBAPIRelation(SupportsReadableRelation): - def __init__( - self, - *, - readable_dataset: "ReadableDBAPIDataset", - provided_query: Any = None, - table_name: str = None, - limit: int = None, - selected_columns: Sequence[str] = None, - ) -> None: - """Create a lazy evaluated relation to for the dataset of a destination""" - - # NOTE: we can keep an assertion here, this class will not be created by the user - assert bool(table_name) != bool( - provided_query - ), "Please provide either an sql query OR a table_name" - - self._dataset = readable_dataset - - self._provided_query = provided_query - self._table_name = table_name - self._limit = limit - self._selected_columns = selected_columns - - # wire protocol functions - self.df = self._wrap_func("df") # type: ignore - self.arrow = self._wrap_func("arrow") # type: ignore - self.fetchall = self._wrap_func("fetchall") # type: ignore - self.fetchmany = self._wrap_func("fetchmany") # type: ignore - self.fetchone = self._wrap_func("fetchone") # type: ignore - - self.iter_df = self._wrap_iter("iter_df") # type: ignore - self.iter_arrow = self._wrap_iter("iter_arrow") # type: ignore - self.iter_fetch = self._wrap_iter("iter_fetch") # type: ignore - - @property - def sql_client(self) -> SqlClientBase[Any]: - return self._dataset.sql_client - - @property - def schema(self) -> Schema: - return self._dataset.schema - - @property - def query(self) -> Any: - """build the query""" - if self._provided_query: - return self._provided_query - - table_name = self.sql_client.make_qualified_table_name( - self.schema.naming.normalize_tables_path(self._table_name) - ) - - maybe_limit_clause_1 = "" - maybe_limit_clause_2 = "" - if self._limit: - maybe_limit_clause_1, maybe_limit_clause_2 = self.sql_client._limit_clause_sql( - self._limit - ) - - selector = "*" - if self._selected_columns: - selector = ",".join( - [ - self.sql_client.escape_column_name(self.schema.naming.normalize_path(c)) - for c in self._selected_columns - ] - ) - - return f"SELECT {maybe_limit_clause_1} {selector} FROM {table_name} {maybe_limit_clause_2}" - - @property - def columns_schema(self) -> TTableSchemaColumns: - return self.compute_columns_schema() - - @columns_schema.setter - def columns_schema(self, new_value: TTableSchemaColumns) -> None: - raise NotImplementedError("columns schema in ReadableDBAPIRelation can only be computed") - - def compute_columns_schema(self) -> TTableSchemaColumns: - """provide schema columns for the cursor, may be filtered by selected columns""" - - columns_schema = ( - self.schema.tables.get(self._table_name, {}).get("columns", {}) if self.schema else {} - ) - - if not columns_schema: - return None - if not self._selected_columns: - return columns_schema - - filtered_columns: TTableSchemaColumns = {} - for sc in self._selected_columns: - sc = self.schema.naming.normalize_path(sc) - if sc not in columns_schema.keys(): - raise ReadableRelationUnknownColumnException(sc) - filtered_columns[sc] = columns_schema[sc] - - return filtered_columns - - @contextmanager - def cursor(self) -> Generator[SupportsReadableRelation, Any, Any]: - """Gets a DBApiCursor for the current relation""" - with self.sql_client as client: - # this hacky code is needed for mssql to disable autocommit, read iterators - # will not work otherwise. 
in the future we should be able to create a readony - # client which will do this automatically - if hasattr(self.sql_client, "_conn") and hasattr(self.sql_client._conn, "autocommit"): - self.sql_client._conn.autocommit = False - with client.execute_query(self.query) as cursor: - if columns_schema := self.columns_schema: - cursor.columns_schema = columns_schema - yield cursor - - def _wrap_iter(self, func_name: str) -> Any: - """wrap SupportsReadableRelation generators in cursor context""" - - def _wrap(*args: Any, **kwargs: Any) -> Any: - with self.cursor() as cursor: - yield from getattr(cursor, func_name)(*args, **kwargs) - - return _wrap - - def _wrap_func(self, func_name: str) -> Any: - """wrap SupportsReadableRelation functions in cursor context""" - - def _wrap(*args: Any, **kwargs: Any) -> Any: - with self.cursor() as cursor: - return getattr(cursor, func_name)(*args, **kwargs) - - return _wrap - - def __copy__(self) -> "ReadableDBAPIRelation": - return self.__class__( - readable_dataset=self._dataset, - provided_query=self._provided_query, - table_name=self._table_name, - limit=self._limit, - selected_columns=self._selected_columns, - ) - - def limit(self, limit: int) -> "ReadableDBAPIRelation": - if self._provided_query: - raise ReadableRelationHasQueryException("limit") - rel = self.__copy__() - rel._limit = limit - return rel - - def select(self, *columns: str) -> "ReadableDBAPIRelation": - if self._provided_query: - raise ReadableRelationHasQueryException("select") - rel = self.__copy__() - rel._selected_columns = columns - # NOTE: the line below will ensure that no unknown columns are selected if - # schema is known - rel.compute_columns_schema() - return rel - - def __getitem__(self, columns: Union[str, Sequence[str]]) -> "SupportsReadableRelation": - if isinstance(columns, str): - return self.select(columns) - elif isinstance(columns, Sequence): - return self.select(*columns) - else: - raise TypeError(f"Invalid argument type: {type(columns).__name__}") - - def head(self, limit: int = 5) -> "ReadableDBAPIRelation": - return self.limit(limit) - - -class ReadableDBAPIDataset(SupportsReadableDataset): - """Access to dataframes and arrowtables in the destination dataset via dbapi""" - - def __init__( - self, - destination: TDestinationReferenceArg, - dataset_name: str, - schema: Union[Schema, str, None] = None, - ) -> None: - self._destination = Destination.from_reference(destination) - self._provided_schema = schema - self._dataset_name = dataset_name - self._sql_client: SqlClientBase[Any] = None - self._schema: Schema = None - - def ibis(self) -> IbisBackend: - """return a connected ibis backend""" - from dlt.common.libs.ibis import create_ibis_backend - - self._ensure_client_and_schema() - return create_ibis_backend( - self._destination, - self._destination_client(self.schema), - ) - - @property - def schema(self) -> Schema: - self._ensure_client_and_schema() - return self._schema - - @property - def sql_client(self) -> SqlClientBase[Any]: - self._ensure_client_and_schema() - return self._sql_client - - def _destination_client(self, schema: Schema) -> JobClientBase: - return get_destination_clients( - schema, destination=self._destination, destination_dataset_name=self._dataset_name - )[0] - - def _ensure_client_and_schema(self) -> None: - """Lazy load schema and client""" - - # full schema given, nothing to do - if not self._schema and isinstance(self._provided_schema, Schema): - self._schema = self._provided_schema - - # schema name given, resolve it from destination by 
name - elif not self._schema and isinstance(self._provided_schema, str): - with self._destination_client(Schema(self._provided_schema)) as client: - if isinstance(client, WithStateSync): - stored_schema = client.get_stored_schema(self._provided_schema) - if stored_schema: - self._schema = Schema.from_stored_schema(json.loads(stored_schema.schema)) - else: - self._schema = Schema(self._provided_schema) - - # no schema name given, load newest schema from destination - elif not self._schema: - with self._destination_client(Schema(self._dataset_name)) as client: - if isinstance(client, WithStateSync): - stored_schema = client.get_stored_schema() - if stored_schema: - self._schema = Schema.from_stored_schema(json.loads(stored_schema.schema)) - - # default to empty schema with dataset name - if not self._schema: - self._schema = Schema(self._dataset_name) - - # here we create the client bound to the resolved schema - if not self._sql_client: - destination_client = self._destination_client(self._schema) - if isinstance(destination_client, WithSqlClient): - self._sql_client = destination_client.sql_client - else: - raise Exception( - f"Destination {destination_client.config.destination_type} does not support" - " SqlClient." - ) - - def __call__(self, query: Any) -> ReadableDBAPIRelation: - return ReadableDBAPIRelation(readable_dataset=self, provided_query=query) # type: ignore[abstract] - - def table(self, table_name: str) -> SupportsReadableRelation: - return ReadableDBAPIRelation( - readable_dataset=self, - table_name=table_name, - ) # type: ignore[abstract] - - def __getitem__(self, table_name: str) -> SupportsReadableRelation: - """access of table via dict notation""" - return self.table(table_name) - - def __getattr__(self, table_name: str) -> SupportsReadableRelation: - """access of table via property notation""" - return self.table(table_name) - - -def dataset( - destination: TDestinationReferenceArg, - dataset_name: str, - schema: Union[Schema, str, None] = None, - dataset_type: TDatasetType = "dbapi", -) -> SupportsReadableDataset: - if dataset_type == "dbapi": - return ReadableDBAPIDataset(destination, dataset_name, schema) - raise NotImplementedError(f"Dataset of type {dataset_type} not implemented") - - -# helpers -def get_destination_client_initial_config( - destination: AnyDestination, - default_schema_name: str, - dataset_name: str, - as_staging: bool = False, -) -> DestinationClientConfiguration: - client_spec = destination.spec - - # this client supports many schemas and datasets - if issubclass(client_spec, DestinationClientDwhConfiguration): - if issubclass(client_spec, DestinationClientStagingConfiguration): - spec: DestinationClientDwhConfiguration = client_spec(as_staging_destination=as_staging) - else: - spec = client_spec() - - spec._bind_dataset_name(dataset_name, default_schema_name) - return spec - - return client_spec() - - -def get_destination_clients( - schema: Schema, - destination: AnyDestination = None, - destination_dataset_name: str = None, - destination_initial_config: DestinationClientConfiguration = None, - staging: AnyDestination = None, - staging_dataset_name: str = None, - staging_initial_config: DestinationClientConfiguration = None, - # pipeline specific settings - default_schema_name: str = None, -) -> Tuple[JobClientBase, JobClientBase]: - destination = Destination.from_reference(destination) if destination else None - staging = Destination.from_reference(staging) if staging else None - - try: - # resolve staging config in order to pass it to 
destination client config - staging_client = None - if staging: - if not staging_initial_config: - # this is just initial config - without user configuration injected - staging_initial_config = get_destination_client_initial_config( - staging, - dataset_name=staging_dataset_name, - default_schema_name=default_schema_name, - as_staging=True, - ) - # create the client - that will also resolve the config - staging_client = staging.client(schema, staging_initial_config) - - if not destination_initial_config: - # config is not provided then get it with injected credentials - initial_config = get_destination_client_initial_config( - destination, - dataset_name=destination_dataset_name, - default_schema_name=default_schema_name, - ) - - # attach the staging client config to destination client config - if its type supports it - if ( - staging_client - and isinstance(initial_config, DestinationClientDwhWithStagingConfiguration) - and isinstance(staging_client.config, DestinationClientStagingConfiguration) - ): - initial_config.staging_config = staging_client.config - # create instance with initial_config properly set - client = destination.client(schema, initial_config) - return client, staging_client - except ModuleNotFoundError: - client_spec = destination.spec() - raise MissingDependencyException( - f"{client_spec.destination_type} destination", - [f"{version.DLT_PKG_NAME}[{client_spec.destination_type}]"], - "Dependencies for specific destinations are available as extras of dlt", - ) diff --git a/dlt/destinations/dataset/__init__.py b/dlt/destinations/dataset/__init__.py new file mode 100644 index 0000000000..e0eef681b8 --- /dev/null +++ b/dlt/destinations/dataset/__init__.py @@ -0,0 +1,19 @@ +from dlt.destinations.dataset.factory import ( + dataset, +) +from dlt.destinations.dataset.dataset import ( + ReadableDBAPIDataset, + get_destination_clients, +) +from dlt.destinations.dataset.utils import ( + get_destination_clients, + get_destination_client_initial_config, +) + + +__all__ = [ + "dataset", + "ReadableDBAPIDataset", + "get_destination_client_initial_config", + "get_destination_clients", +] diff --git a/dlt/destinations/dataset/dataset.py b/dlt/destinations/dataset/dataset.py new file mode 100644 index 0000000000..e443045e49 --- /dev/null +++ b/dlt/destinations/dataset/dataset.py @@ -0,0 +1,142 @@ +from typing import Any, Union, TYPE_CHECKING + +from dlt.common.json import json + +from dlt.common.exceptions import MissingDependencyException + +from dlt.common.destination.reference import ( + SupportsReadableRelation, + SupportsReadableDataset, + TDestinationReferenceArg, + Destination, + JobClientBase, + WithStateSync, +) + +from dlt.destinations.sql_client import SqlClientBase, WithSqlClient +from dlt.common.schema import Schema +from dlt.destinations.dataset.relation import ReadableDBAPIRelation +from dlt.destinations.dataset.utils import get_destination_clients +from dlt.common.destination.reference import TDatasetType + +if TYPE_CHECKING: + try: + from dlt.helpers.ibis import BaseBackend as IbisBackend + except MissingDependencyException: + IbisBackend = Any +else: + IbisBackend = Any + + +class ReadableDBAPIDataset(SupportsReadableDataset): + """Access to dataframes and arrowtables in the destination dataset via dbapi""" + + def __init__( + self, + destination: TDestinationReferenceArg, + dataset_name: str, + schema: Union[Schema, str, None] = None, + dataset_type: TDatasetType = "auto", + ) -> None: + self._destination = Destination.from_reference(destination) + 
self._provided_schema = schema + self._dataset_name = dataset_name + self._sql_client: SqlClientBase[Any] = None + self._schema: Schema = None + self._dataset_type = dataset_type + + def ibis(self) -> IbisBackend: + """return a connected ibis backend""" + from dlt.helpers.ibis import create_ibis_backend + + self._ensure_client_and_schema() + return create_ibis_backend( + self._destination, + self._destination_client(self.schema), + ) + + @property + def schema(self) -> Schema: + self._ensure_client_and_schema() + return self._schema + + @property + def sql_client(self) -> SqlClientBase[Any]: + self._ensure_client_and_schema() + return self._sql_client + + def _destination_client(self, schema: Schema) -> JobClientBase: + return get_destination_clients( + schema, destination=self._destination, destination_dataset_name=self._dataset_name + )[0] + + def _ensure_client_and_schema(self) -> None: + """Lazy load schema and client""" + + # full schema given, nothing to do + if not self._schema and isinstance(self._provided_schema, Schema): + self._schema = self._provided_schema + + # schema name given, resolve it from destination by name + elif not self._schema and isinstance(self._provided_schema, str): + with self._destination_client(Schema(self._provided_schema)) as client: + if isinstance(client, WithStateSync): + stored_schema = client.get_stored_schema(self._provided_schema) + if stored_schema: + self._schema = Schema.from_stored_schema(json.loads(stored_schema.schema)) + else: + self._schema = Schema(self._provided_schema) + + # no schema name given, load newest schema from destination + elif not self._schema: + with self._destination_client(Schema(self._dataset_name)) as client: + if isinstance(client, WithStateSync): + stored_schema = client.get_stored_schema() + if stored_schema: + self._schema = Schema.from_stored_schema(json.loads(stored_schema.schema)) + + # default to empty schema with dataset name + if not self._schema: + self._schema = Schema(self._dataset_name) + + # here we create the client bound to the resolved schema + if not self._sql_client: + destination_client = self._destination_client(self._schema) + if isinstance(destination_client, WithSqlClient): + self._sql_client = destination_client.sql_client + else: + raise Exception( + f"Destination {destination_client.config.destination_type} does not support" + " SqlClient." 
+ ) + + def __call__(self, query: Any) -> ReadableDBAPIRelation: + return ReadableDBAPIRelation(readable_dataset=self, provided_query=query) # type: ignore[abstract] + + def table(self, table_name: str) -> SupportsReadableRelation: + # we can create an ibis powered relation if ibis is available + if table_name in self.schema.tables and self._dataset_type in ("auto", "ibis"): + try: + from dlt.helpers.ibis import create_unbound_ibis_table + from dlt.destinations.dataset.ibis_relation import ReadableIbisRelation + + unbound_table = create_unbound_ibis_table(self.sql_client, self.schema, table_name) + return ReadableIbisRelation(readable_dataset=self, ibis_object=unbound_table, columns_schema=self.schema.tables[table_name]["columns"]) # type: ignore[abstract] + except MissingDependencyException: + # if ibis is explicitly requested, reraise + if self._dataset_type == "ibis": + raise + + # fallback to the standard dbapi relation + return ReadableDBAPIRelation( + readable_dataset=self, + table_name=table_name, + ) # type: ignore[abstract] + + def __getitem__(self, table_name: str) -> SupportsReadableRelation: + """access of table via dict notation""" + return self.table(table_name) + + def __getattr__(self, table_name: str) -> SupportsReadableRelation: + """access of table via property notation""" + return self.table(table_name) diff --git a/dlt/destinations/dataset/exceptions.py b/dlt/destinations/dataset/exceptions.py new file mode 100644 index 0000000000..17e8f6b563 --- /dev/null +++ b/dlt/destinations/dataset/exceptions.py @@ -0,0 +1,22 @@ +from dlt.common.exceptions import DltException + + +class DatasetException(DltException): + pass + + +class ReadableRelationHasQueryException(DatasetException): + def __init__(self, attempted_change: str) -> None: + msg = ( + "This readable relation was created with a provided sql query. You cannot change" + f" {attempted_change}. Please change the orignal sql query." + ) + super().__init__(msg) + + +class ReadableRelationUnknownColumnException(DatasetException): + def __init__(self, column_name: str) -> None: + msg = ( + f"The selected column {column_name} is not known in the dlt schema for this releation." 
+ ) + super().__init__(msg) diff --git a/dlt/destinations/dataset/factory.py b/dlt/destinations/dataset/factory.py new file mode 100644 index 0000000000..8ea0ddf7a1 --- /dev/null +++ b/dlt/destinations/dataset/factory.py @@ -0,0 +1,22 @@ +from typing import Union + + +from dlt.common.destination import AnyDestination +from dlt.common.destination.reference import ( + SupportsReadableDataset, + TDatasetType, + TDestinationReferenceArg, +) + +from dlt.common.schema import Schema + +from dlt.destinations.dataset.dataset import ReadableDBAPIDataset + + +def dataset( + destination: TDestinationReferenceArg, + dataset_name: str, + schema: Union[Schema, str, None] = None, + dataset_type: TDatasetType = "auto", +) -> SupportsReadableDataset: + return ReadableDBAPIDataset(destination, dataset_name, schema, dataset_type) diff --git a/dlt/destinations/dataset/ibis_relation.py b/dlt/destinations/dataset/ibis_relation.py new file mode 100644 index 0000000000..632298ad56 --- /dev/null +++ b/dlt/destinations/dataset/ibis_relation.py @@ -0,0 +1,224 @@ +from typing import TYPE_CHECKING, Any, Union, Sequence + +from functools import partial + +from dlt.common.exceptions import MissingDependencyException +from dlt.destinations.dataset.relation import BaseReadableDBAPIRelation +from dlt.common.schema.typing import TTableSchemaColumns + + +if TYPE_CHECKING: + from dlt.destinations.dataset.dataset import ReadableDBAPIDataset +else: + ReadableDBAPIDataset = Any + +try: + from dlt.helpers.ibis import Expr +except MissingDependencyException: + Expr = Any + +# map dlt destination to sqlglot dialect +DIALECT_MAP = { + "dlt.destinations.duckdb": "duckdb", # works + "dlt.destinations.motherduck": "duckdb", # works + "dlt.destinations.clickhouse": "clickhouse", # works + "dlt.destinations.databricks": "databricks", # works + "dlt.destinations.bigquery": "bigquery", # works + "dlt.destinations.postgres": "postgres", # works + "dlt.destinations.redshift": "redshift", # works + "dlt.destinations.snowflake": "snowflake", # works + "dlt.destinations.mssql": "tsql", # works + "dlt.destinations.synapse": "tsql", # works + "dlt.destinations.athena": "trino", # works + "dlt.destinations.filesystem": "duckdb", # works + "dlt.destinations.dremio": "presto", # works + # NOTE: can we discover the current dialect in sqlalchemy? 
+ "dlt.destinations.sqlalchemy": "mysql", # may work +} + +# NOTE: some dialects are not supported by ibis, but by sqlglot, these need to +# be transpiled with a intermediary step +TRANSPILE_VIA_MAP = { + "tsql": "postgres", + "databricks": "postgres", + "clickhouse": "postgres", + "redshift": "postgres", + "presto": "postgres", +} + + +class ReadableIbisRelation(BaseReadableDBAPIRelation): + def __init__( + self, + *, + readable_dataset: ReadableDBAPIDataset, + ibis_object: Any = None, + columns_schema: TTableSchemaColumns = None, + ) -> None: + """Create a lazy evaluated relation to for the dataset of a destination""" + super().__init__(readable_dataset=readable_dataset) + self._ibis_object = ibis_object + self._columns_schema = columns_schema + + @property + def query(self) -> Any: + """build the query""" + + from dlt.helpers.ibis import ibis, sqlglot + + destination_type = self._dataset._destination.destination_type + target_dialect = DIALECT_MAP[destination_type] + + # render sql directly if possible + if target_dialect not in TRANSPILE_VIA_MAP: + return ibis.to_sql(self._ibis_object, dialect=target_dialect) + + # here we need to transpile first + transpile_via = TRANSPILE_VIA_MAP[target_dialect] + sql = ibis.to_sql(self._ibis_object, dialect=transpile_via) + sql = sqlglot.transpile(sql, read=transpile_via, write=target_dialect)[0] + return sql + + @property + def columns_schema(self) -> TTableSchemaColumns: + return self.compute_columns_schema() + + @columns_schema.setter + def columns_schema(self, new_value: TTableSchemaColumns) -> None: + raise NotImplementedError("columns schema in ReadableDBAPIRelation can only be computed") + + def compute_columns_schema(self) -> TTableSchemaColumns: + """provide schema columns for the cursor, may be filtered by selected columns""" + # TODO: provide column lineage tracing with sqlglot lineage + return self._columns_schema + + def _proxy_expression_method(self, method_name: str, *args: Any, **kwargs: Any) -> Any: + """Proxy method calls to the underlying ibis expression, allowing to wrap the resulting expression in a new relation""" + + # Get the method from the expression + method = getattr(self._ibis_object, method_name) + + # unwrap args and kwargs if they are relations + args = tuple( + arg._ibis_object if isinstance(arg, ReadableIbisRelation) else arg for arg in args + ) + kwargs = { + k: v._ibis_object if isinstance(v, ReadableIbisRelation) else v + for k, v in kwargs.items() + } + + # casefold string params, we assume these are column names + args = tuple( + self.sql_client.capabilities.casefold_identifier(arg) if isinstance(arg, str) else arg + for arg in args + ) + kwargs = { + k: self.sql_client.capabilities.casefold_identifier(v) if isinstance(v, str) else v + for k, v in kwargs.items() + } + + # Call it with provided args + result = method(*args, **kwargs) + + # calculate columns schema for the result, some operations we know will not change the schema + # and select will just reduce the amount of column + columns_schema = None + if method_name == "select": + columns_schema = self._get_filtered_columns_schema(args) + elif method_name in ["filter", "limit", "order_by", "head"]: + columns_schema = self._columns_schema + + # If result is an ibis expression, wrap it in a new relation else return raw result + return self.__class__( + readable_dataset=self._dataset, ibis_object=result, columns_schema=columns_schema + ) + + def __getattr__(self, name: str) -> Any: + """Wrap all callable attributes of the expression""" + + attr = 
getattr(self._ibis_object, name, None) + + # try casefolded name for ibis columns access + if attr is None: + name = self.sql_client.capabilities.casefold_identifier(name) + attr = getattr(self._ibis_object, name, None) + + if attr is None: + raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'") + + if not callable(attr): + # NOTE: we don't need to forward columns schema for non-callable attributes, these are usually columns + return self.__class__(readable_dataset=self._dataset, ibis_object=attr) + + return partial(self._proxy_expression_method, name) + + def __getitem__(self, columns: Union[str, Sequence[str]]) -> "ReadableIbisRelation": + # casefold column-names + columns = [columns] if isinstance(columns, str) else columns + columns = [self.sql_client.capabilities.casefold_identifier(col) for col in columns] + expr = self._ibis_object[columns] + return self.__class__( + readable_dataset=self._dataset, + ibis_object=expr, + columns_schema=self._get_filtered_columns_schema(columns), + ) + + def _get_filtered_columns_schema(self, columns: Sequence[str]) -> TTableSchemaColumns: + if not self._columns_schema: + return None + try: + return {col: self._columns_schema[col] for col in columns} + except KeyError: + # NOTE: select statements can contain new columns not present in the original schema + # here we just break the column schema inheritance chain + return None + + # forward ibis methods defined on interface + def limit(self, limit: int, **kwargs: Any) -> "ReadableIbisRelation": + """limit the result to 'limit' items""" + return self._proxy_expression_method("limit", limit, **kwargs) # type: ignore + + def head(self, limit: int = 5) -> "ReadableIbisRelation": + """limit the result to 5 items by default""" + return self._proxy_expression_method("head", limit) # type: ignore + + def select(self, *columns: str) -> "ReadableIbisRelation": + """set which columns will be selected""" + return self._proxy_expression_method("select", *columns) # type: ignore + + # forward ibis comparison and math operators + def __lt__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__lt__", other) # type: ignore + + def __gt__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__gt__", other) # type: ignore + + def __ge__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__ge__", other) # type: ignore + + def __le__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__le__", other) # type: ignore + + def __eq__(self, other: Any) -> bool: + return self._proxy_expression_method("__eq__", other) # type: ignore + + def __ne__(self, other: Any) -> bool: + return self._proxy_expression_method("__ne__", other) # type: ignore + + def __and__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__and__", other) # type: ignore + + def __or__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__or__", other) # type: ignore + + def __mul__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__mul__", other) # type: ignore + + def __div__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__div__", other) # type: ignore + + def __add__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__add__", other) # type: ignore + + def __sub__(self, other: Any) -> "ReadableIbisRelation": + return 
self._proxy_expression_method("__sub__", other) # type: ignore diff --git a/dlt/destinations/dataset/relation.py b/dlt/destinations/dataset/relation.py new file mode 100644 index 0000000000..2cdb7640df --- /dev/null +++ b/dlt/destinations/dataset/relation.py @@ -0,0 +1,207 @@ +from typing import Any, Generator, Sequence, Union, TYPE_CHECKING + +from contextlib import contextmanager + + +from dlt.common.destination.reference import ( + SupportsReadableRelation, +) + +from dlt.destinations.dataset.exceptions import ( + ReadableRelationHasQueryException, + ReadableRelationUnknownColumnException, +) + +from dlt.common.schema.typing import TTableSchemaColumns +from dlt.destinations.sql_client import SqlClientBase +from dlt.common.schema import Schema + +if TYPE_CHECKING: + from dlt.destinations.dataset.dataset import ReadableDBAPIDataset +else: + ReadableDBAPIDataset = Any + + +class BaseReadableDBAPIRelation(SupportsReadableRelation): + def __init__( + self, + *, + readable_dataset: "ReadableDBAPIDataset", + ) -> None: + """Create a lazy evaluated relation to for the dataset of a destination""" + + self._dataset = readable_dataset + + # wire protocol functions + self.df = self._wrap_func("df") # type: ignore + self.arrow = self._wrap_func("arrow") # type: ignore + self.fetchall = self._wrap_func("fetchall") # type: ignore + self.fetchmany = self._wrap_func("fetchmany") # type: ignore + self.fetchone = self._wrap_func("fetchone") # type: ignore + + self.iter_df = self._wrap_iter("iter_df") # type: ignore + self.iter_arrow = self._wrap_iter("iter_arrow") # type: ignore + self.iter_fetch = self._wrap_iter("iter_fetch") # type: ignore + + @property + def sql_client(self) -> SqlClientBase[Any]: + return self._dataset.sql_client + + @property + def schema(self) -> Schema: + return self._dataset.schema + + @property + def query(self) -> Any: + raise NotImplementedError("No query in ReadableDBAPIRelation") + + @contextmanager + def cursor(self) -> Generator[SupportsReadableRelation, Any, Any]: + """Gets a DBApiCursor for the current relation""" + with self.sql_client as client: + # this hacky code is needed for mssql to disable autocommit, read iterators + # will not work otherwise. 
in the future we should be able to create a readony + # client which will do this automatically + if hasattr(self.sql_client, "_conn") and hasattr(self.sql_client._conn, "autocommit"): + self.sql_client._conn.autocommit = False + with client.execute_query(self.query) as cursor: + if columns_schema := self.columns_schema: + cursor.columns_schema = columns_schema + yield cursor + + def _wrap_iter(self, func_name: str) -> Any: + """wrap SupportsReadableRelation generators in cursor context""" + + def _wrap(*args: Any, **kwargs: Any) -> Any: + with self.cursor() as cursor: + yield from getattr(cursor, func_name)(*args, **kwargs) + + return _wrap + + def _wrap_func(self, func_name: str) -> Any: + """wrap SupportsReadableRelation functions in cursor context""" + + def _wrap(*args: Any, **kwargs: Any) -> Any: + with self.cursor() as cursor: + return getattr(cursor, func_name)(*args, **kwargs) + + return _wrap + + +class ReadableDBAPIRelation(BaseReadableDBAPIRelation): + def __init__( + self, + *, + readable_dataset: "ReadableDBAPIDataset", + provided_query: Any = None, + table_name: str = None, + limit: int = None, + selected_columns: Sequence[str] = None, + ) -> None: + """Create a lazy evaluated relation to for the dataset of a destination""" + + # NOTE: we can keep an assertion here, this class will not be created by the user + assert bool(table_name) != bool( + provided_query + ), "Please provide either an sql query OR a table_name" + + super().__init__(readable_dataset=readable_dataset) + + self._provided_query = provided_query + self._table_name = table_name + self._limit = limit + self._selected_columns = selected_columns + + @property + def query(self) -> Any: + """build the query""" + if self._provided_query: + return self._provided_query + + table_name = self.sql_client.make_qualified_table_name( + self.schema.naming.normalize_path(self._table_name) + ) + + maybe_limit_clause_1 = "" + maybe_limit_clause_2 = "" + if self._limit: + maybe_limit_clause_1, maybe_limit_clause_2 = self.sql_client._limit_clause_sql( + self._limit + ) + + selector = "*" + if self._selected_columns: + selector = ",".join( + [ + self.sql_client.escape_column_name(self.schema.naming.normalize_tables_path(c)) + for c in self._selected_columns + ] + ) + + return f"SELECT {maybe_limit_clause_1} {selector} FROM {table_name} {maybe_limit_clause_2}" + + @property + def columns_schema(self) -> TTableSchemaColumns: + return self.compute_columns_schema() + + @columns_schema.setter + def columns_schema(self, new_value: TTableSchemaColumns) -> None: + raise NotImplementedError("columns schema in ReadableDBAPIRelation can only be computed") + + def compute_columns_schema(self) -> TTableSchemaColumns: + """provide schema columns for the cursor, may be filtered by selected columns""" + + columns_schema = ( + self.schema.tables.get(self._table_name, {}).get("columns", {}) if self.schema else {} + ) + + if not columns_schema: + return None + if not self._selected_columns: + return columns_schema + + filtered_columns: TTableSchemaColumns = {} + for sc in self._selected_columns: + sc = self.schema.naming.normalize_path(sc) + if sc not in columns_schema.keys(): + raise ReadableRelationUnknownColumnException(sc) + filtered_columns[sc] = columns_schema[sc] + + return filtered_columns + + def __copy__(self) -> "ReadableDBAPIRelation": + return self.__class__( + readable_dataset=self._dataset, + provided_query=self._provided_query, + table_name=self._table_name, + limit=self._limit, + selected_columns=self._selected_columns, + ) + + def 
limit(self, limit: int, **kwargs: Any) -> "ReadableDBAPIRelation": + if self._provided_query: + raise ReadableRelationHasQueryException("limit") + rel = self.__copy__() + rel._limit = limit + return rel + + def select(self, *columns: str) -> "ReadableDBAPIRelation": + if self._provided_query: + raise ReadableRelationHasQueryException("select") + rel = self.__copy__() + rel._selected_columns = columns + # NOTE: the line below will ensure that no unknown columns are selected if + # schema is known + rel.compute_columns_schema() + return rel + + def __getitem__(self, columns: Union[str, Sequence[str]]) -> "SupportsReadableRelation": + if isinstance(columns, str): + return self.select(columns) + elif isinstance(columns, Sequence): + return self.select(*columns) + else: + raise TypeError(f"Invalid argument type: {type(columns).__name__}") + + def head(self, limit: int = 5) -> "ReadableDBAPIRelation": + return self.limit(limit) diff --git a/dlt/destinations/dataset/utils.py b/dlt/destinations/dataset/utils.py new file mode 100644 index 0000000000..766fbc13ea --- /dev/null +++ b/dlt/destinations/dataset/utils.py @@ -0,0 +1,95 @@ +from typing import Tuple + +from dlt import version + +from dlt.common.exceptions import MissingDependencyException + +from dlt.common.destination import AnyDestination +from dlt.common.destination.reference import ( + Destination, + JobClientBase, + DestinationClientDwhConfiguration, + DestinationClientStagingConfiguration, + DestinationClientConfiguration, + DestinationClientDwhWithStagingConfiguration, +) + +from dlt.common.schema import Schema + + +# helpers +def get_destination_client_initial_config( + destination: AnyDestination, + default_schema_name: str, + dataset_name: str, + as_staging: bool = False, +) -> DestinationClientConfiguration: + client_spec = destination.spec + + # this client supports many schemas and datasets + if issubclass(client_spec, DestinationClientDwhConfiguration): + if issubclass(client_spec, DestinationClientStagingConfiguration): + spec: DestinationClientDwhConfiguration = client_spec(as_staging_destination=as_staging) + else: + spec = client_spec() + + spec._bind_dataset_name(dataset_name, default_schema_name) + return spec + + return client_spec() + + +def get_destination_clients( + schema: Schema, + destination: AnyDestination = None, + destination_dataset_name: str = None, + destination_initial_config: DestinationClientConfiguration = None, + staging: AnyDestination = None, + staging_dataset_name: str = None, + staging_initial_config: DestinationClientConfiguration = None, + # pipeline specific settings + default_schema_name: str = None, +) -> Tuple[JobClientBase, JobClientBase]: + destination = Destination.from_reference(destination) if destination else None + staging = Destination.from_reference(staging) if staging else None + + try: + # resolve staging config in order to pass it to destination client config + staging_client = None + if staging: + if not staging_initial_config: + # this is just initial config - without user configuration injected + staging_initial_config = get_destination_client_initial_config( + staging, + dataset_name=staging_dataset_name, + default_schema_name=default_schema_name, + as_staging=True, + ) + # create the client - that will also resolve the config + staging_client = staging.client(schema, staging_initial_config) + + if not destination_initial_config: + # config is not provided then get it with injected credentials + initial_config = get_destination_client_initial_config( + destination, + 
dataset_name=destination_dataset_name, + default_schema_name=default_schema_name, + ) + + # attach the staging client config to destination client config - if its type supports it + if ( + staging_client + and isinstance(initial_config, DestinationClientDwhWithStagingConfiguration) + and isinstance(staging_client.config, DestinationClientStagingConfiguration) + ): + initial_config.staging_config = staging_client.config + # create instance with initial_config properly set + client = destination.client(schema, initial_config) + return client, staging_client + except ModuleNotFoundError: + client_spec = destination.spec() + raise MissingDependencyException( + f"{client_spec.destination_type} destination", + [f"{version.DLT_PKG_NAME}[{client_spec.destination_type}]"], + "Dependencies for specific destinations are available as extras of dlt", + ) diff --git a/dlt/destinations/impl/sqlalchemy/db_api_client.py b/dlt/destinations/impl/sqlalchemy/db_api_client.py index 6f3ff065bf..27c4f2f1f9 100644 --- a/dlt/destinations/impl/sqlalchemy/db_api_client.py +++ b/dlt/destinations/impl/sqlalchemy/db_api_client.py @@ -84,7 +84,7 @@ def __init__(self, curr: sa.engine.CursorResult) -> None: def _get_columns(self) -> List[str]: try: - return list(self.native_cursor.keys()) # type: ignore[attr-defined] + return list(self.native_cursor.keys()) except ResourceClosedError: # this happens if now rows are returned return [] @@ -314,7 +314,7 @@ def execute_sql( self, sql: Union[AnyStr, sa.sql.Executable], *args: Any, **kwargs: Any ) -> Optional[Sequence[Sequence[Any]]]: with self.execute_query(sql, *args, **kwargs) as cursor: - if cursor.returns_rows: # type: ignore[attr-defined] + if cursor.returns_rows: return cursor.fetchall() return None diff --git a/dlt/common/libs/ibis.py b/dlt/helpers/ibis.py similarity index 74% rename from dlt/common/libs/ibis.py rename to dlt/helpers/ibis.py index ba6f363e66..ed4264dac7 100644 --- a/dlt/common/libs/ibis.py +++ b/dlt/helpers/ibis.py @@ -1,12 +1,14 @@ -from typing import cast +from typing import cast, Any from dlt.common.exceptions import MissingDependencyException - from dlt.common.destination.reference import TDestinationReferenceArg, Destination, JobClientBase +from dlt.common.schema import Schema +from dlt.destinations.sql_client import SqlClientBase try: import ibis # type: ignore - from ibis import BaseBackend + import sqlglot + from ibis import BaseBackend, Expr except ModuleNotFoundError: raise MissingDependencyException("dlt ibis Helpers", ["ibis"]) @@ -29,6 +31,22 @@ ] +# Map dlt data types to ibis data types +DATA_TYPE_MAP = { + "text": "string", + "double": "float64", + "bool": "boolean", + "timestamp": "timestamp", + "bigint": "int64", + "binary": "binary", + "json": "string", # Store JSON as string in ibis + "decimal": "decimal", + "wei": "int64", # Wei is a large integer + "date": "date", + "time": "time", +} + + def create_ibis_backend( destination: TDestinationReferenceArg, client: JobClientBase ) -> BaseBackend: @@ -119,3 +137,37 @@ def create_ibis_backend( con = ibis.duckdb.from_connection(duck) return con + + +def create_unbound_ibis_table( + sql_client: SqlClientBase[Any], schema: Schema, table_name: str +) -> Expr: + """Create an unbound ibis table from a dlt schema""" + + if table_name not in schema.tables: + raise Exception( + f"Table {table_name} not found in schema. 
Available tables: {schema.tables.keys()}" + ) + table_schema = schema.tables[table_name] + + # Convert dlt table schema columns to ibis schema + ibis_schema = { + sql_client.capabilities.casefold_identifier(col_name): DATA_TYPE_MAP[ + col_info.get("data_type", "string") + ] + for col_name, col_info in table_schema.get("columns", {}).items() + } + + # normalize table name + table_path = sql_client.make_qualified_table_name_path(table_name, escape=False) + + catalog = None + if len(table_path) == 3: + catalog, database, table = table_path + else: + database, table = table_path + + # create unbound ibis table and return in dlt wrapper + unbound_table = ibis.table(schema=ibis_schema, name=table, database=database, catalog=catalog) + + return unbound_table diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 70d160ea67..9bd2d6911f 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -1751,9 +1751,17 @@ def __getstate__(self) -> Any: return {"pipeline_name": self.pipeline_name} def _dataset( - self, schema: Union[Schema, str, None] = None, dataset_type: TDatasetType = "dbapi" + self, schema: Union[Schema, str, None] = None, dataset_type: TDatasetType = "auto" ) -> SupportsReadableDataset: - """Access helper to dataset""" + """Returns a dataset object for querying the destination data. + + Args: + schema: Schema name or Schema object to use. If None, uses the default schema if set. + dataset_type: Type of dataset interface to return. Defaults to 'auto' which will select ibis if available + otherwise it will fallback to the standard dbapi interface. + Returns: + A dataset object that supports querying the destination data. + """ if schema is None: schema = self.default_schema if self.default_schema_name else None return dataset( diff --git a/docs/website/docs/general-usage/dataset-access/dataset.md b/docs/website/docs/general-usage/dataset-access/dataset.md index 68635383c5..b2e3f03d4d 100644 --- a/docs/website/docs/general-usage/dataset-access/dataset.md +++ b/docs/website/docs/general-usage/dataset-access/dataset.md @@ -156,6 +156,64 @@ You can combine `select`, `limit`, and other methods. arrow_table = items_relation.select("col1", "col2").limit(50).arrow() ``` +## Modifying queries with ibis expressions + +If you install the amazing [ibis](https://ibis-project.org/) library, you can use ibis expressions to modify your queries. 
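+
+Which relation implementation you get is controlled by the `dataset_type` argument of `pipeline._dataset()` (see the docstring added in `pipeline.py` above): the default `"auto"` uses the ibis-backed relations when ibis is importable and otherwise falls back to the plain dbapi relations. A minimal sketch of selecting the implementation explicitly (the `"default"` and `"ibis"` values mirror the tests in this change; the pipeline name is just a placeholder):
+
+```py
+import dlt
+
+# placeholder pipeline; any destination with a sql client works the same way
+pipeline = dlt.pipeline(pipeline_name="docs_pipeline", destination="duckdb")
+
+# "auto" (the default): ibis-backed relations if ibis is installed, plain dbapi otherwise
+dataset = pipeline._dataset()
+
+# force the plain dbapi relations even when ibis is installed
+dbapi_dataset = pipeline._dataset(dataset_type="default")
+
+# force the ibis-backed relations (requires ibis to be installed)
+ibis_dataset = pipeline._dataset(dataset_type="ibis")
+```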
+
+```sh
+pip install ibis-framework
+```
+
+dlt will then wrap an `ibis.UnboundTable` with a `ReadableIbisRelation` object under the hood that allows you to modify the query of a relation using ibis expressions:
+
+```py
+# now that ibis is installed, we can get a dataset with ibis relations
+dataset = pipeline._dataset()
+
+# get two relations
+items_relation = dataset["items"]
+order_relation = dataset["orders"]
+
+# join them using an ibis expression
+joined_relation = items_relation.join(order_relation, items_relation.id == order_relation.item_id)
+
+# now we can use the ibis expression to filter the data
+filtered_relation = joined_relation.filter(order_relation.status == "completed")
+
+# we can inspect the query that will be used to read the data
+print(filtered_relation.query)
+
+# and finally fetch the data as a pandas dataframe, the same way we would do with a normal relation
+df = filtered_relation.df()
+
+# a few more examples
+
+# filter for rows where the id is in the list of ids
+items_relation.filter(items_relation.id.isin([1, 2, 3])).df()
+
+# limit and offset
+items_relation.limit(10, offset=5).arrow()
+
+# mutate columns by adding a new column that is always 10 times the value of the id column
+items_relation.mutate(new_id=items_relation.id * 10).df()
+
+# sort asc and desc
+import ibis
+items_relation.order_by(ibis.desc("id"), ibis.asc("price")).limit(10)
+
+# group by and aggregate
+items_relation.group_by("item_group").having(items_relation.count() >= 1000).aggregate(sum_id=items_relation.id.sum()).df()
+
+# subqueries (beverage_categories stands in for another relation)
+items_relation.filter(items_relation.category.isin(beverage_categories.name)).df()
+```
+
+You can learn more about the available expressions on the [ibis for sql users](https://ibis-project.org/tutorials/ibis-for-sql-users) page.
+
+:::note
+Keep in mind that you can only use methods that modify the executed query, not the methods ibis provides for fetching data. Fetching is done with the same methods defined on the regular relations explained above. If you need full native ibis integration, please read the ibis section in the advanced part further down. Additionally, not all ibis expressions may be supported by all destinations and SQL dialects.
+:::
+
 ## Supported destinations
 
 All SQL and filesystem destinations supported by `dlt` can utilize this data access interface. For filesystem destinations, `dlt` [uses **DuckDB** under the hood](./sql-client.md#the-filesystem-sql-client) to create views from Parquet or JSONL files dynamically. This allows you to query data stored in files using the same interface as you would with SQL databases. If you plan on accessing data in buckets or the filesystem a lot this way, it is advised to load data as Parquet instead of JSONL, as **DuckDB** is able to only load the parts of the data actually needed for the query to work.
diff --git a/poetry.lock b/poetry.lock
index 8cada982bd..e5d2eafdd9 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
[[package]] name = "about-time" @@ -776,7 +776,7 @@ files = [ name = "atpublic" version = "5.0" description = "Keep all y'all's __all__'s in sync" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "atpublic-5.0-py3-none-any.whl", hash = "sha256:b651dcd886666b1042d1e38158a22a4f2c267748f4e97fde94bc492a4a28a3f3"}, @@ -1755,7 +1755,7 @@ PyYAML = ">=3.11" name = "clickhouse-connect" version = "0.7.8" description = "ClickHouse Database Core Driver for Python, Pandas, and Superset" -optional = true +optional = false python-versions = "~=3.8" files = [ {file = "clickhouse-connect-0.7.8.tar.gz", hash = "sha256:dad10ba90eabfe215dfb1fef59f2821a95c752988e66f1093ca8590a51539b8f"}, @@ -2262,7 +2262,7 @@ urllib3 = ">=1.0" name = "db-dtypes" version = "1.3.0" description = "Pandas Data Types for SQL systems (BigQuery, Spanner)" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "db_dtypes-1.3.0-py2.py3-none-any.whl", hash = "sha256:7e65c59f849ccbe6f7bc4d0253edcc212a7907662906921caba3e4aadd0bc277"}, @@ -3546,7 +3546,7 @@ tqdm = ["tqdm (>=4.7.4,<5.0.0dev)"] name = "google-cloud-bigquery-storage" version = "2.27.0" description = "Google Cloud Bigquery Storage API client library" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "google_cloud_bigquery_storage-2.27.0-py2.py3-none-any.whl", hash = "sha256:3bfa8f74a61ceaffd3bfe90be5bbef440ad81c1c19ac9075188cccab34bffc2b"}, @@ -4524,63 +4524,64 @@ files = [ [[package]] name = "ibis-framework" -version = "10.0.0.dev256" +version = "9.5.0" description = "The portable Python dataframe library" -optional = true +optional = false python-versions = "<4.0,>=3.10" files = [ - {file = "ibis_framework-10.0.0.dev256-py3-none-any.whl", hash = "sha256:d6f21278e6fd78920bbe986df2c871921142635cc4f7d5d2048cae26e307a3df"}, - {file = "ibis_framework-10.0.0.dev256.tar.gz", hash = "sha256:e9f97d8177fd88f4a3578be20519c1da79a6a7ffac678b46b790bfde67405930"}, + {file = "ibis_framework-9.5.0-py3-none-any.whl", hash = "sha256:145fe30d94f111cff332580c275ce77725c5ff7086eede93af0b371649d009c0"}, + {file = "ibis_framework-9.5.0.tar.gz", hash = "sha256:1c8a29277e63ee0dfc289bc8f550164b5e3bdaec1b76b62436c37d331bb4ef84"}, ] [package.dependencies] atpublic = ">=2.3,<6" clickhouse-connect = {version = ">=0.5.23,<1", extras = ["arrow", "numpy", "pandas"], optional = true, markers = "extra == \"clickhouse\""} db-dtypes = {version = ">=0.3,<2", optional = true, markers = "extra == \"bigquery\""} -duckdb = {version = ">=0.10,<1.2", optional = true, markers = "extra == \"duckdb\""} +duckdb = {version = ">=0.8.1,<1.2", optional = true, markers = "extra == \"duckdb\""} google-cloud-bigquery = {version = ">=3,<4", optional = true, markers = "extra == \"bigquery\""} google-cloud-bigquery-storage = {version = ">=2,<3", optional = true, markers = "extra == \"bigquery\""} -numpy = {version = ">=1.23.2,<3", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"databricks\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} -packaging = {version = ">=21.3,<25", optional = true, markers = "extra == \"duckdb\" or extra == \"oracle\" or extra == \"polars\" or extra == 
\"pyspark\""} -pandas = {version = ">=1.5.3,<3", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"databricks\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} +numpy = {version = ">=1.23.2,<3", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"dask\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"pandas\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} +packaging = {version = ">=21.3,<25", optional = true, markers = "extra == \"dask\" or extra == \"duckdb\" or extra == \"oracle\" or extra == \"pandas\" or extra == \"polars\" or extra == \"pyspark\""} +pandas = {version = ">=1.5.3,<3", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"dask\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"pandas\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} parsy = ">=2,<3" psycopg2 = {version = ">=2.8.4,<3", optional = true, markers = "extra == \"postgres\" or extra == \"risingwave\""} -pyarrow = {version = ">=10.0.1,<19", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"databricks\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} -pyarrow-hotfix = {version = ">=0.4,<1", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"databricks\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} +pyarrow = {version = ">=10.0.1,<18", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"dask\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"pandas\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} +pyarrow-hotfix = {version = 
">=0.4,<1", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"dask\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"pandas\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} pydata-google-auth = {version = ">=1.4.0,<2", optional = true, markers = "extra == \"bigquery\""} pyodbc = {version = ">=4.0.39,<6", optional = true, markers = "extra == \"mssql\""} python-dateutil = ">=2.8.2,<3" pytz = ">=2022.7" -rich = {version = ">=12.4.4,<14", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"databricks\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} +rich = {version = ">=12.4.4,<14", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"dask\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"pandas\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} snowflake-connector-python = {version = ">=3.0.2,<3.3.0b1 || >3.3.0b1,<4", optional = true, markers = "extra == \"snowflake\""} -sqlglot = ">=23.4,<25.30" -toolz = ">=0.11,<2" +sqlglot = ">=23.4,<25.21" +toolz = ">=0.11,<1" typing-extensions = ">=4.3.0,<5" [package.extras] -bigquery = ["db-dtypes (>=0.3,<2)", "google-cloud-bigquery (>=3,<4)", "google-cloud-bigquery-storage (>=2,<3)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "pydata-google-auth (>=1.4.0,<2)", "rich (>=12.4.4,<14)"] -clickhouse = ["clickhouse-connect[arrow,numpy,pandas] (>=0.5.23,<1)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -databricks = ["databricks-sql-connector-core (>=4,<5)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -datafusion = ["datafusion (>=0.6,<43)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +bigquery = ["db-dtypes (>=0.3,<2)", "google-cloud-bigquery (>=3,<4)", "google-cloud-bigquery-storage (>=2,<3)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "pydata-google-auth (>=1.4.0,<2)", "rich (>=12.4.4,<14)"] +clickhouse = ["clickhouse-connect[arrow,numpy,pandas] (>=0.5.23,<1)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +dask = ["dask[array,dataframe] (>=2022.9.1,<2024.3.0)", "numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "regex 
(>=2021.7.6)", "rich (>=12.4.4,<14)"] +datafusion = ["datafusion (>=0.6,<41)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] decompiler = ["black (>=22.1.0,<25)"] deltalake = ["deltalake (>=0.9.0,<1)"] -druid = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "pydruid (>=0.6.7,<1)", "rich (>=12.4.4,<14)"] -duckdb = ["duckdb (>=0.10,<1.2)", "numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +druid = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "pydruid (>=0.6.7,<1)", "rich (>=12.4.4,<14)"] +duckdb = ["duckdb (>=0.8.1,<1.2)", "numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] examples = ["pins[gcs] (>=0.8.3,<1)"] -exasol = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "pyexasol[pandas] (>=0.25.2,<1)", "rich (>=12.4.4,<14)"] -flink = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +exasol = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "pyexasol[pandas] (>=0.25.2,<1)", "rich (>=12.4.4,<14)"] +flink = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] geospatial = ["geoarrow-types (>=0.2,<1)", "geopandas (>=0.6,<2)", "pyproj (>=3.3.0,<4)", "shapely (>=2,<3)"] -impala = ["impyla (>=0.17,<1)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -mssql = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "pyodbc (>=4.0.39,<6)", "rich (>=12.4.4,<14)"] -mysql = ["mysqlclient (>=2.2.4,<3)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -oracle = ["numpy (>=1.23.2,<3)", "oracledb (>=1.3.1,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -polars = ["numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "polars (>=1,<2)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -postgres = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "psycopg2 (>=2.8.4,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -pyspark = ["numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "pyspark (>=3.3.3,<4)", "rich (>=12.4.4,<14)"] -risingwave = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "psycopg2 (>=2.8.4,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -snowflake = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)", "snowflake-connector-python (>=3.0.2,!=3.3.0b1,<4)"] -sqlite = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "regex (>=2021.7.6)", "rich (>=12.4.4,<14)"] -trino = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich 
(>=12.4.4,<14)", "trino (>=0.321,<1)"] +impala = ["impyla (>=0.17,<1)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +mssql = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "pyodbc (>=4.0.39,<6)", "rich (>=12.4.4,<14)"] +mysql = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "pymysql (>=1,<2)", "rich (>=12.4.4,<14)"] +oracle = ["numpy (>=1.23.2,<3)", "oracledb (>=1.3.1,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +pandas = ["numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "regex (>=2021.7.6)", "rich (>=12.4.4,<14)"] +polars = ["numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "polars (>=1,<2)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +postgres = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "psycopg2 (>=2.8.4,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +pyspark = ["numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "pyspark (>=3.3.3,<4)", "rich (>=12.4.4,<14)"] +risingwave = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "psycopg2 (>=2.8.4,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +snowflake = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)", "snowflake-connector-python (>=3.0.2,!=3.3.0b1,<4)"] +sqlite = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "regex (>=2021.7.6)", "rich (>=12.4.4,<14)"] +trino = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)", "trino (>=0.321,<1)"] visualization = ["graphviz (>=0.16,<1)"] [[package]] @@ -5232,7 +5233,7 @@ source = ["Cython (>=0.29.35)"] name = "lz4" version = "4.3.3" description = "LZ4 Bindings for Python" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"}, @@ -6665,7 +6666,7 @@ future = "*" name = "parsy" version = "2.1" description = "Easy-to-use parser combinators, for parsing in pure Python" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "parsy-2.1-py3-none-any.whl", hash = "sha256:8f18e7b11985e7802e7e3ecbd8291c6ca243d29820b1186e4c84605db4efffa0"}, @@ -7100,7 +7101,7 @@ test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] name = "psycopg2" version = "2.9.10" description = "psycopg2 - Python-PostgreSQL Database Adapter" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "psycopg2-2.9.10-cp310-cp310-win32.whl", hash = "sha256:5df2b672140f95adb453af93a7d669d7a7bf0a56bcd26f1502329166f4a61716"}, @@ -7263,7 +7264,7 @@ test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] name = "pyarrow-hotfix" version = "0.6" description = "" -optional = true +optional = false python-versions = ">=3.5" files = [ {file = "pyarrow_hotfix-0.6-py3-none-any.whl", hash = "sha256:dcc9ae2d220dff0083be6a9aa8e0cdee5182ad358d4931fce825c545e5c89178"}, @@ -7476,7 +7477,7 @@ typing-extensions = 
">=4.6.0,<4.7.0 || >4.7.0" name = "pydata-google-auth" version = "1.9.0" description = "PyData helpers for authenticating to Google APIs" -optional = true +optional = false python-versions = ">=3.9" files = [ {file = "pydata-google-auth-1.9.0.tar.gz", hash = "sha256:2f546e88f007dfdb050087556eb46d6008e351386a7b368096797fae5df374f2"}, @@ -9285,13 +9286,13 @@ typing-extensions = "*" [[package]] name = "sqlglot" -version = "25.24.5" +version = "25.20.2" description = "An easily customizable SQL parser and transpiler" optional = false python-versions = ">=3.7" files = [ - {file = "sqlglot-25.24.5-py3-none-any.whl", hash = "sha256:f8a8870d1f5cdd2e2dc5c39a5030a0c7b0a91264fb8972caead3dac8e8438873"}, - {file = "sqlglot-25.24.5.tar.gz", hash = "sha256:6d3d604034301ca3b614d6b4148646b4033317b7a93d1801e9661495eb4b4fcf"}, + {file = "sqlglot-25.20.2-py3-none-any.whl", hash = "sha256:cdbfd7ce3f2f39f32bd7b4c23fd9e0fd261636a6b14285b914e8def25fd0a567"}, + {file = "sqlglot-25.20.2.tar.gz", hash = "sha256:169fe8308dd70d7bd40117b2221b62bdc7c4e2ea8eb07394b2a6146cdedf05ab"}, ] [package.extras] @@ -9668,13 +9669,13 @@ files = [ [[package]] name = "toolz" -version = "1.0.0" +version = "0.12.1" description = "List processing tools and functional utilities" -optional = true -python-versions = ">=3.8" +optional = false +python-versions = ">=3.7" files = [ - {file = "toolz-1.0.0-py3-none-any.whl", hash = "sha256:292c8f1c4e7516bf9086f8850935c799a874039c8bcf959d47b600e4c44a6236"}, - {file = "toolz-1.0.0.tar.gz", hash = "sha256:2c86e3d9a04798ac556793bced838816296a2f085017664e4995cb40a1047a02"}, + {file = "toolz-0.12.1-py3-none-any.whl", hash = "sha256:d22731364c07d72eea0a0ad45bafb2c2937ab6fd38a3507bf55eae8744aa7d85"}, + {file = "toolz-0.12.1.tar.gz", hash = "sha256:ecca342664893f177a13dac0e6b41cbd8ac25a358e5f215316d43e2100224f4d"}, ] [[package]] @@ -10549,7 +10550,7 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p name = "zstandard" version = "0.22.0" description = "Zstandard bindings for Python" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "zstandard-0.22.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:275df437ab03f8c033b8a2c181e51716c32d831082d93ce48002a5227ec93019"}, @@ -10638,4 +10639,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "1bf3deccd929c083b880c1a82be0983430ab49f7ade247b1c5573bb8c70d9ff5" +content-hash = "a7cd6b599326d80b5beb8d4a3d3e3b4074eda6dc53daa5c296ef8d54002c5f78" diff --git a/pyproject.toml b/pyproject.toml index 5c91b3ec1c..4e10e96b93 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -169,7 +169,6 @@ pytest-mock = "^3.14.0" types-regex = "^2024.5.15.20240519" flake8-print = "^5.0.0" mimesis = "^7.0.0" -ibis-framework = { version = ">=9.0.0", markers = "python_version >= '3.10'", optional = true, extras = ["duckdb", "postgres", "bigquery", "snowflake", "mssql", "clickhouse"]} shapely = ">=2.0.6" [tool.poetry.group.sources] @@ -207,6 +206,12 @@ optional = true [tool.poetry.group.airflow.dependencies] apache-airflow = {version = "^2.8.0", markers = "python_version < '3.12'"} +[tool.poetry.group.ibis] +optional = true + +[tool.poetry.group.ibis.dependencies] +ibis-framework = { version = ">=9.0.0,<10.0.0", markers = "python_version >= '3.10'", extras = ["duckdb", "postgres", "bigquery", "snowflake", "mssql", "clickhouse"]} + [tool.poetry.group.providers] optional = true diff --git a/tests/destinations/test_readable_dbapi_dataset.py 
b/tests/destinations/test_readable_dbapi_dataset.py index 4745735371..bc58a18fa0 100644 --- a/tests/destinations/test_readable_dbapi_dataset.py +++ b/tests/destinations/test_readable_dbapi_dataset.py @@ -2,7 +2,7 @@ import dlt import pytest -from dlt.destinations.dataset import ( +from dlt.destinations.dataset.exceptions import ( ReadableRelationHasQueryException, ReadableRelationUnknownColumnException, ) @@ -12,44 +12,44 @@ def test_query_builder() -> None: dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline")._dataset() # default query for a table - assert dataset.my_table.query.strip() == 'SELECT * FROM "pipeline_dataset"."my_table"' # type: ignore[attr-defined] + assert dataset.my_table.query.strip() == 'SELECT * FROM "pipeline_dataset"."my_table"' # head query assert ( - dataset.my_table.head().query.strip() # type: ignore[attr-defined] + dataset.my_table.head().query.strip() == 'SELECT * FROM "pipeline_dataset"."my_table" LIMIT 5' ) # limit query assert ( - dataset.my_table.limit(24).query.strip() # type: ignore[attr-defined] + dataset.my_table.limit(24).query.strip() == 'SELECT * FROM "pipeline_dataset"."my_table" LIMIT 24' ) # select columns assert ( - dataset.my_table.select("col1", "col2").query.strip() # type: ignore[attr-defined] + dataset.my_table.select("col1", "col2").query.strip() == 'SELECT "col1","col2" FROM "pipeline_dataset"."my_table"' ) # also indexer notation assert ( - dataset.my_table[["col1", "col2"]].query.strip() # type: ignore[attr-defined] + dataset.my_table[["col1", "col2"]].query.strip() == 'SELECT "col1","col2" FROM "pipeline_dataset"."my_table"' ) # identifiers are normalized assert ( - dataset["MY_TABLE"].select("CoL1", "cOl2").query.strip() # type: ignore[attr-defined] + dataset["MY_TABLE"].select("CoL1", "cOl2").query.strip() == 'SELECT "co_l1","c_ol2" FROM "pipeline_dataset"."my_table"' ) assert ( - dataset["MY__TABLE"].select("Co__L1", "cOl2").query.strip() # type: ignore[attr-defined] + dataset["MY__TABLE"].select("Co__L1", "cOl2").query.strip() == 'SELECT "co__l1","c_ol2" FROM "pipeline_dataset"."my__table"' ) # limit and select chained assert ( - dataset.my_table.select("col1", "col2").limit(24).query.strip() # type: ignore[attr-defined] + dataset.my_table.select("col1", "col2").limit(24).query.strip() == 'SELECT "col1","col2" FROM "pipeline_dataset"."my_table" LIMIT 24' ) @@ -65,18 +65,18 @@ def test_copy_and_chaining() -> None: relation2 = relation.__copy__() assert relation != relation2 - assert relation._limit == relation2._limit # type: ignore[attr-defined] - assert relation._table_name == relation2._table_name # type: ignore[attr-defined] - assert relation._provided_query == relation2._provided_query # type: ignore[attr-defined] - assert relation._selected_columns == relation2._selected_columns # type: ignore[attr-defined] + assert relation._limit == relation2._limit + assert relation._table_name == relation2._table_name + assert relation._provided_query == relation2._provided_query + assert relation._selected_columns == relation2._selected_columns # test copy while chaining limit relation3 = relation2.limit(22) assert relation2 != relation3 - assert relation2._limit != relation3._limit # type: ignore[attr-defined] + assert relation2._limit != relation3._limit # test last setting prevails chaining - assert relation.limit(23).limit(67).limit(11)._limit == 11 # type: ignore[attr-defined] + assert relation.limit(23).limit(67).limit(11)._limit == 11 def test_computed_schema_columns() -> None: diff --git 
a/tests/load/pipeline/test_duckdb.py b/tests/load/pipeline/test_duckdb.py index 98642bb263..a7aa4d36e4 100644 --- a/tests/load/pipeline/test_duckdb.py +++ b/tests/load/pipeline/test_duckdb.py @@ -283,8 +283,8 @@ def test_duckdb_credentials_separation( print(p1_dataset.p1_data.fetchall()) print(p2_dataset.p2_data.fetchall()) - assert "p1" in p1_dataset.sql_client.credentials._conn_str() # type: ignore[attr-defined] - assert "p2" in p2_dataset.sql_client.credentials._conn_str() # type: ignore[attr-defined] + assert "p1" in p1_dataset.sql_client.credentials._conn_str() + assert "p2" in p2_dataset.sql_client.credentials._conn_str() - assert p1_dataset.sql_client.credentials.bound_to_pipeline is p1 # type: ignore[attr-defined] - assert p2_dataset.sql_client.credentials.bound_to_pipeline is p2 # type: ignore[attr-defined] + assert p1_dataset.sql_client.credentials.bound_to_pipeline is p1 + assert p2_dataset.sql_client.credentials.bound_to_pipeline is p2 diff --git a/tests/load/test_read_interfaces.py b/tests/load/test_read_interfaces.py index 1a9c8a383b..d2f5f7951e 100644 --- a/tests/load/test_read_interfaces.py +++ b/tests/load/test_read_interfaces.py @@ -1,5 +1,5 @@ -from typing import Any, cast - +from typing import Any, cast, Tuple, List +import re import pytest import dlt import os @@ -20,8 +20,10 @@ ) from dlt.destinations import filesystem from tests.utils import TEST_STORAGE_ROOT, clean_test_storage -from dlt.common.destination.reference import TDestinationReferenceArg -from dlt.destinations.dataset import ReadableDBAPIDataset, ReadableRelationUnknownColumnException +from dlt.destinations.dataset.dataset import ReadableDBAPIDataset +from dlt.destinations.dataset.exceptions import ( + ReadableRelationUnknownColumnException, +) from tests.load.utils import drop_pipeline_data EXPECTED_COLUMNS = ["id", "decimal", "other_decimal", "_dlt_load_id", "_dlt_id"] @@ -58,6 +60,7 @@ def autouse_test_storage() -> FileStorage: @pytest.fixture(scope="session") def populated_pipeline(request, autouse_test_storage) -> Any: """fixture that returns a pipeline object populated with the example data""" + destination_config = cast(DestinationTestConfiguration, request.param) if ( @@ -104,6 +107,7 @@ def items(): columns={ "id": {"data_type": "bigint"}, "double_id": {"data_type": "bigint"}, + "di_decimal": {"data_type": "decimal", "precision": 7, "scale": 3}, }, ) def double_items(): @@ -111,6 +115,7 @@ def double_items(): { "id": i, "double_id": i * 2, + "di_decimal": Decimal("10.433"), } for i in range(total_records) ] @@ -151,6 +156,24 @@ def double_items(): ) +@pytest.mark.no_load +@pytest.mark.essential +@pytest.mark.parametrize( + "populated_pipeline", + configs, + indirect=True, + ids=lambda x: x.name, +) +def test_explicit_dataset_type_selection(populated_pipeline: Pipeline): + from dlt.destinations.dataset.dataset import ReadableDBAPIRelation + from dlt.destinations.dataset.ibis_relation import ReadableIbisRelation + + assert isinstance( + populated_pipeline._dataset(dataset_type="default").items, ReadableDBAPIRelation + ) + assert isinstance(populated_pipeline._dataset(dataset_type="ibis").items, ReadableIbisRelation) + + @pytest.mark.no_load @pytest.mark.essential @pytest.mark.parametrize( @@ -258,71 +281,6 @@ def test_db_cursor_access(populated_pipeline: Pipeline) -> None: assert set(ids) == set(range(total_records)) -@pytest.mark.no_load -@pytest.mark.essential -@pytest.mark.parametrize( - "populated_pipeline", - configs, - indirect=True, - ids=lambda x: x.name, -) -def 
test_ibis_dataset_access(populated_pipeline: Pipeline) -> None: - # NOTE: we could generalize this with a context for certain deps - import subprocess - - subprocess.check_call( - ["pip", "install", "ibis-framework[duckdb,postgres,bigquery,snowflake,mssql,clickhouse]"] - ) - - from dlt.common.libs.ibis import SUPPORTED_DESTINATIONS - - # check correct error if not supported - if populated_pipeline.destination.destination_type not in SUPPORTED_DESTINATIONS: - with pytest.raises(NotImplementedError): - populated_pipeline._dataset().ibis() - return - - total_records = _total_records(populated_pipeline) - ibis_connection = populated_pipeline._dataset().ibis() - - map_i = lambda x: x - if populated_pipeline.destination.destination_type == "dlt.destinations.snowflake": - map_i = lambda x: x.upper() - - dataset_name = map_i(populated_pipeline.dataset_name) - table_like_statement = None - table_name_prefix = "" - addtional_tables = [] - - # clickhouse has no datasets, but table prefixes and a sentinel table - if populated_pipeline.destination.destination_type == "dlt.destinations.clickhouse": - table_like_statement = dataset_name + "." - table_name_prefix = dataset_name + "___" - dataset_name = None - addtional_tables = ["dlt_sentinel_table"] - - add_table_prefix = lambda x: table_name_prefix + x - - # just do a basic check to see wether ibis can connect - assert set(ibis_connection.list_tables(database=dataset_name, like=table_like_statement)) == { - add_table_prefix(map_i(x)) - for x in ( - [ - "_dlt_loads", - "_dlt_pipeline_state", - "_dlt_version", - "double_items", - "items", - "items__children", - ] - + addtional_tables - ) - } - - items_table = ibis_connection.table(add_table_prefix(map_i("items")), database=dataset_name) - assert items_table.count().to_pandas() == total_records - - @pytest.mark.no_load @pytest.mark.essential @pytest.mark.parametrize( @@ -332,7 +290,8 @@ def test_ibis_dataset_access(populated_pipeline: Pipeline) -> None: ids=lambda x: x.name, ) def test_hint_preservation(populated_pipeline: Pipeline) -> None: - table_relationship = populated_pipeline._dataset().items + # NOTE: for now hints are only preserved for the default dataset + table_relationship = populated_pipeline._dataset(dataset_type="default").items # check that hints are carried over to arrow table expected_decimal_precision = 10 expected_decimal_precision_2 = 12 @@ -425,8 +384,7 @@ def test_limit_and_head(populated_pipeline: Pipeline) -> None: ids=lambda x: x.name, ) def test_column_selection(populated_pipeline: Pipeline) -> None: - table_relationship = populated_pipeline._dataset().items - + table_relationship = populated_pipeline._dataset(dataset_type="default").items columns = ["_dlt_load_id", "other_decimal"] data_frame = table_relationship.select(*columns).head().df() assert [v.lower() for v in data_frame.columns.values] == columns @@ -479,6 +437,266 @@ def test_schema_arg(populated_pipeline: Pipeline) -> None: assert "items" in dataset.schema.tables +@pytest.mark.no_load +@pytest.mark.essential +@pytest.mark.parametrize( + "populated_pipeline", + configs, + indirect=True, + ids=lambda x: x.name, +) +def test_ibis_expression_relation(populated_pipeline: Pipeline) -> None: + # NOTE: we could generalize this with a context for certain deps + import ibis # type: ignore + + # now we should get the more powerful ibis relation + dataset = populated_pipeline._dataset() + total_records = _total_records(populated_pipeline) + + items_table = dataset["items"] + double_items_table = dataset["double_items"] + + # 
check full table access + df = items_table.df() + assert len(df.index) == total_records + + df = double_items_table.df() + assert len(df.index) == total_records + + # check limit + df = items_table.limit(5).df() + assert len(df.index) == 5 + + # check chained expression with join, column selection, order by and limit + joined_table = ( + items_table.join(double_items_table, items_table.id == double_items_table.id)[ + ["id", "double_id"] + ] + .order_by("id") + .limit(20) + ) + table = joined_table.fetchall() + assert len(table) == 20 + assert list(table[0]) == [0, 0] + assert list(table[5]) == [5, 10] + assert list(table[10]) == [10, 20] + + # check aggregate of first 20 items + agg_table = items_table.order_by("id").limit(20).aggregate(sum_id=items_table.id.sum()) + assert agg_table.fetchone()[0] == reduce(lambda a, b: a + b, range(20)) + + # check filtering + filtered_table = items_table.filter(items_table.id < 10) + assert len(filtered_table.fetchall()) == 10 + + if populated_pipeline.destination.destination_type != "dlt.destinations.duckdb": + return + + # we check a bunch of expressions without executing them to see that they produce correct sql + # also we return the keys of the disovered schema columns + def sql_from_expr(expr: Any) -> Tuple[str, List[str]]: + query = str(expr.query).replace(populated_pipeline.dataset_name, "dataset") + columns = list(expr.columns_schema.keys()) if expr.columns_schema else None + return re.sub(r"\s+", " ", query), columns + + # test all functions discussed here: https://ibis-project.org/tutorials/ibis-for-sql-users + ALL_COLUMNS = ["id", "decimal", "other_decimal", "_dlt_load_id", "_dlt_id"] + + # selecting two columns + assert sql_from_expr(items_table.select("id", "decimal")) == ( + 'SELECT "t0"."id", "t0"."decimal" FROM "dataset"."items" AS "t0"', + ["id", "decimal"], + ) + + # selecting all columns + assert sql_from_expr(items_table) == ('SELECT * FROM "dataset"."items"', ALL_COLUMNS) + + # selecting two other columns via item getter + assert sql_from_expr(items_table["id", "decimal"]) == ( + 'SELECT "t0"."id", "t0"."decimal" FROM "dataset"."items" AS "t0"', + ["id", "decimal"], + ) + + # adding a new columns + new_col = (items_table.id * 2).name("new_col") + assert sql_from_expr(items_table.select("id", "decimal", new_col)) == ( + ( + 'SELECT "t0"."id", "t0"."decimal", "t0"."id" * 2 AS "new_col" FROM' + ' "dataset"."items" AS "t0"' + ), + None, + ) + + # mutating table (add a new column computed from existing columns) + assert sql_from_expr( + items_table.mutate(double_id=items_table.id * 2).select("id", "double_id") + ) == ( + 'SELECT "t0"."id", "t0"."id" * 2 AS "double_id" FROM "dataset"."items" AS "t0"', + None, + ) + + # mutating table add new static column + assert sql_from_expr( + items_table.mutate(new_col=ibis.literal("static_value")).select("id", "new_col") + ) == ('SELECT "t0"."id", \'static_value\' AS "new_col" FROM "dataset"."items" AS "t0"', None) + + # check filtering (preserves all columns) + assert sql_from_expr(items_table.filter(items_table.id < 10)) == ( + 'SELECT * FROM "dataset"."items" AS "t0" WHERE "t0"."id" < 10', + ALL_COLUMNS, + ) + + # filtering and selecting a single column + assert sql_from_expr(items_table.filter(items_table.id < 10).select("id")) == ( + 'SELECT "t0"."id" FROM "dataset"."items" AS "t0" WHERE "t0"."id" < 10', + ["id"], + ) + + # check filter "and" condition + assert sql_from_expr(items_table.filter(items_table.id < 10).filter(items_table.id > 5)) == ( + 'SELECT * FROM "dataset"."items" AS "t0" WHERE 
"t0"."id" < 10 AND "t0"."id" > 5', + ALL_COLUMNS, + ) + + # check filter "or" condition + assert sql_from_expr(items_table.filter((items_table.id < 10) | (items_table.id > 5))) == ( + 'SELECT * FROM "dataset"."items" AS "t0" WHERE ( "t0"."id" < 10 ) OR ( "t0"."id" > 5 )', + ALL_COLUMNS, + ) + + # check group by and aggregate + assert sql_from_expr( + items_table.group_by("id") + .having(items_table.count() >= 1000) + .aggregate(sum_id=items_table.id.sum()) + ) == ( + ( + 'SELECT "t1"."id", "t1"."sum_id" FROM ( SELECT "t0"."id", SUM("t0"."id") AS "sum_id",' + ' COUNT(*) AS "CountStar(items)" FROM "dataset"."items" AS "t0" GROUP BY 1 ) AS "t1"' + ' WHERE "t1"."CountStar(items)" >= 1000' + ), + None, + ) + + # sorting and ordering + assert sql_from_expr(items_table.order_by("id", "decimal").limit(10)) == ( + ( + 'SELECT * FROM "dataset"."items" AS "t0" ORDER BY "t0"."id" ASC, "t0"."decimal" ASC' + " LIMIT 10" + ), + ALL_COLUMNS, + ) + + # sort desc and asc + assert sql_from_expr(items_table.order_by(ibis.desc("id"), ibis.asc("decimal")).limit(10)) == ( + ( + 'SELECT * FROM "dataset"."items" AS "t0" ORDER BY "t0"."id" DESC, "t0"."decimal" ASC' + " LIMIT 10" + ), + ALL_COLUMNS, + ) + + # offset and limit + assert sql_from_expr(items_table.order_by("id").limit(10, offset=5)) == ( + 'SELECT * FROM "dataset"."items" AS "t0" ORDER BY "t0"."id" ASC LIMIT 10 OFFSET 5', + ALL_COLUMNS, + ) + + # join + assert sql_from_expr( + items_table.join(double_items_table, items_table.id == double_items_table.id)[ + ["id", "double_id"] + ] + ) == ( + ( + 'SELECT "t2"."id", "t3"."double_id" FROM "dataset"."items" AS "t2" INNER JOIN' + ' "dataset"."double_items" AS "t3" ON "t2"."id" = "t3"."id"' + ), + None, + ) + + # subqueries + assert sql_from_expr( + items_table.filter(items_table.decimal.isin(double_items_table.di_decimal)) + ) == ( + ( + 'SELECT * FROM "dataset"."items" AS "t0" WHERE "t0"."decimal" IN ( SELECT' + ' "t1"."di_decimal" FROM "dataset"."double_items" AS "t1" )' + ), + ALL_COLUMNS, + ) + + # topk + assert sql_from_expr(items_table.decimal.topk(10)) == ( + ( + 'SELECT * FROM ( SELECT "t0"."decimal", COUNT(*) AS "CountStar(items)" FROM' + ' "dataset"."items" AS "t0" GROUP BY 1 ) AS "t1" ORDER BY "t1"."CountStar(items)" DESC' + " LIMIT 10" + ), + None, + ) + + +@pytest.mark.no_load +@pytest.mark.essential +@pytest.mark.parametrize( + "populated_pipeline", + configs, + indirect=True, + ids=lambda x: x.name, +) +def test_ibis_dataset_access(populated_pipeline: Pipeline) -> None: + # NOTE: we could generalize this with a context for certain deps + + from dlt.helpers.ibis import SUPPORTED_DESTINATIONS + + # check correct error if not supported + if populated_pipeline.destination.destination_type not in SUPPORTED_DESTINATIONS: + with pytest.raises(NotImplementedError): + populated_pipeline._dataset().ibis() + return + + total_records = _total_records(populated_pipeline) + ibis_connection = populated_pipeline._dataset().ibis() + + map_i = lambda x: x + if populated_pipeline.destination.destination_type == "dlt.destinations.snowflake": + map_i = lambda x: x.upper() + + dataset_name = map_i(populated_pipeline.dataset_name) + table_like_statement = None + table_name_prefix = "" + addtional_tables = [] + + # clickhouse has no datasets, but table prefixes and a sentinel table + if populated_pipeline.destination.destination_type == "dlt.destinations.clickhouse": + table_like_statement = dataset_name + "." 
+ table_name_prefix = dataset_name + "___" + dataset_name = None + addtional_tables = ["dlt_sentinel_table"] + + add_table_prefix = lambda x: table_name_prefix + x + + # just do a basic check to see wether ibis can connect + assert set(ibis_connection.list_tables(database=dataset_name, like=table_like_statement)) == { + add_table_prefix(map_i(x)) + for x in ( + [ + "_dlt_loads", + "_dlt_pipeline_state", + "_dlt_version", + "double_items", + "items", + "items__children", + ] + + addtional_tables + ) + } + + items_table = ibis_connection.table(add_table_prefix(map_i("items")), database=dataset_name) + assert items_table.count().to_pandas() == total_records + + @pytest.mark.no_load @pytest.mark.essential @pytest.mark.parametrize( @@ -546,6 +764,7 @@ def test_standalone_dataset(populated_pipeline: Pipeline) -> None: assert dataset.schema.name == "unknown_dataset" assert "items" not in dataset.schema.tables + # NOTE: this breaks the following test, it will need to be fixed somehow # create a newer schema with different name and see wether this is loaded from dlt.common.schema import Schema from dlt.common.schema import utils From 2d55e4ac8e6f9fff777cdb430dbb63eddab18fcc Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Wed, 11 Dec 2024 12:35:59 +0400 Subject: [PATCH 26/32] `iceberg` table format support for `filesystem` destination (#2067) * add pyiceberg dependency and upgrade mypy - mypy upgrade needed to solve this issue: https://github.com/apache/iceberg-python/issues/768 - uses <1.13.0 requirement on mypy because 1.13.0 gives error - new lint errors arising due to version upgrade are simply ignored * extend pyiceberg dependencies * remove redundant delta annotation * add basic local filesystem iceberg support * add active table format setting * disable merge tests for iceberg table format * restore non-redundant extra info * refactor to in-memory iceberg catalog * add s3 support for iceberg table format * add schema evolution support for iceberg table format * extract _register_table function * add partition support for iceberg table format * update docstring * enable child table test for iceberg table format * enable empty source test for iceberg table format * make iceberg catalog namespace configurable and default to dataset name * add optional typing * fix typo * improve typing * extract logic into dedicated function * add iceberg read support to filesystem sql client * remove unused import * add todo * extract logic into separate functions * add azure support for iceberg table format * generalize delta table format tests * enable get tables function test for iceberg table format * remove ignores * undo table directory management change * enable test_read_interfaces tests for iceberg * fix active table format filter * use mixin for object store rs credentials * generalize catalog typing * extract pyiceberg scheme mapping into separate function * generalize credentials mixin test setup * remove unused import * add centralized fallback to append when merge is not supported * Revert "add centralized fallback to append when merge is not supported" This reverts commit 54cd0bcebffad15d522e734da321c602f4bd7461. 
* fall back to append if merge is not supported on filesystem * fix test for s3-compatible storage * remove obsolete code path * exclude gcs read interface tests for iceberg * add gcs support for iceberg table format * switch to UnsupportedAuthenticationMethodException * add iceberg table format docs * use shorter pipeline name to prevent too long sql identifiers * add iceberg catalog note to docs * black format * use shorter pipeline name to prevent too long sql identifiers * correct max id length for sqlalchemy mysql dialect * Revert "use shorter pipeline name to prevent too long sql identifiers" This reverts commit 6cce03b77111825b0714597e6d494df97145f0f2. * Revert "use shorter pipeline name to prevent too long sql identifiers" This reverts commit ef29aa7c2fdba79441573850c7d15b83526c011a. * replace show with execute to prevent useless print output * add abfss scheme to test * remove az support for iceberg table format * remove iceberg bucket test exclusion * add note to docs on azure scheme support for iceberg table format * exclude iceberg from duckdb s3-compatibility test * disable pyiceberg info logs for tests * extend table format docs and move into own page * upgrade adlfs to enable account_host attribute * Merge branch 'devel' of https://github.com/dlt-hub/dlt into feat/1996-iceberg-filesystem * fix lint errors * re-add pyiceberg dependency * enabled iceberg in dbt-duckdb * upgrade pyiceberg version * remove pyiceberg mypy errors across python version * does not install airflow group for dev * fixes gcp oauth iceberg credentials handling * fixes ca cert bundle duckdb azure on ci * allow for airflow dep to be present during type check --------- Co-authored-by: Marcin Rudolf --- .github/workflows/test_destinations.yml | 9 +- .github/workflows/test_local_destinations.yml | 5 +- Makefile | 2 +- dlt/cli/source_detection.py | 3 +- .../configuration/specs/aws_credentials.py | 15 +- .../configuration/specs/azure_credentials.py | 22 +- .../configuration/specs/base_configuration.py | 2 +- .../specs/config_providers_context.py | 7 +- dlt/common/configuration/specs/exceptions.py | 4 + .../configuration/specs/gcp_credentials.py | 36 +- dlt/common/configuration/specs/mixins.py | 24 ++ dlt/common/data_writers/buffered.py | 2 +- dlt/common/destination/utils.py | 2 +- dlt/common/libs/deltalake.py | 6 +- dlt/common/libs/pyiceberg.py | 192 +++++++++ dlt/common/logger.py | 2 +- dlt/common/metrics.py | 2 +- dlt/common/reflection/utils.py | 14 +- dlt/common/schema/schema.py | 2 +- dlt/common/typing.py | 2 +- dlt/destinations/impl/filesystem/factory.py | 4 +- .../impl/filesystem/filesystem.py | 86 +++- .../impl/filesystem/sql_client.py | 27 +- dlt/destinations/impl/sqlalchemy/factory.py | 3 + dlt/extract/incremental/lag.py | 2 +- dlt/helpers/airflow_helper.py | 4 +- dlt/helpers/dbt/profiles.yml | 1 + .../destinations/delta-iceberg.md | 168 ++++++++ .../dlt-ecosystem/destinations/filesystem.md | 113 +---- .../dlt-ecosystem/table-formats/iceberg.md | 2 +- .../dataset-access/ibis-backend.md | 3 +- docs/website/sidebars.js | 1 + mypy.ini | 6 + poetry.lock | 154 +++++-- pyproject.toml | 11 +- tests/conftest.py | 3 + tests/libs/test_csv_writer.py | 4 +- ...dentials.py => test_credentials_mixins.py} | 169 +++++--- tests/load/filesystem/test_sql_client.py | 18 +- .../load/pipeline/test_filesystem_pipeline.py | 393 +++++++++++------- .../sql_database/test_sql_database_source.py | 5 +- tests/load/utils.py | 33 +- tests/pipeline/utils.py | 13 + .../helpers/rest_client/test_client.py | 2 +- tests/utils.py | 7 + 45 
files changed, 1163 insertions(+), 422 deletions(-) create mode 100644 dlt/common/configuration/specs/mixins.py create mode 100644 dlt/common/libs/pyiceberg.py create mode 100644 docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md rename tests/load/filesystem/{test_object_store_rs_credentials.py => test_credentials_mixins.py} (50%) diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml index cfd0a3bd56..84a8f95d71 100644 --- a/.github/workflows/test_destinations.yml +++ b/.github/workflows/test_destinations.yml @@ -77,8 +77,13 @@ jobs: # key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-redshift - name: Install dependencies - # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E redshift -E postgis -E postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline,ibis -E deltalake + run: poetry install --no-interaction -E redshift -E postgis -E postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline,ibis -E deltalake -E pyiceberg + + - name: enable certificates for azure and duckdb + run: sudo mkdir -p /etc/pki/tls/certs && sudo ln -s /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt + + - name: Upgrade sqlalchemy + run: poetry run pip install sqlalchemy==2.0.18 # minimum version required by `pyiceberg` - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_local_destinations.yml b/.github/workflows/test_local_destinations.yml index 6f44e5fd5a..706bae1b0c 100644 --- a/.github/workflows/test_local_destinations.yml +++ b/.github/workflows/test_local_destinations.yml @@ -95,7 +95,10 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline,ibis -E deltalake + run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline,ibis -E deltalake -E pyiceberg + + - name: Upgrade sqlalchemy + run: poetry run pip install sqlalchemy==2.0.18 # minimum version required by `pyiceberg` - name: Start SFTP server run: docker compose -f "tests/load/filesystem_sftp/docker-compose.yml" up -d diff --git a/Makefile b/Makefile index 2a7f6dac0a..0ca8a2e0c3 100644 --- a/Makefile +++ b/Makefile @@ -44,7 +44,7 @@ has-poetry: poetry --version dev: has-poetry - poetry install --all-extras --with docs,providers,pipeline,sources,sentry-sdk,airflow + poetry install --all-extras --with docs,providers,pipeline,sources,sentry-sdk lint: ./tools/check-package.sh diff --git a/dlt/cli/source_detection.py b/dlt/cli/source_detection.py index 7067f8b896..0769605d01 100644 --- a/dlt/cli/source_detection.py +++ b/dlt/cli/source_detection.py @@ -29,8 +29,7 @@ def find_call_arguments_to_replace( if not isinstance(dn_node, ast.Constant) or not isinstance(dn_node.value, str): raise CliCommandInnerException( "init", - f"The pipeline script {init_script_name} must pass the {t_arg_name} as" - f" string to '{arg_name}' function in line {dn_node.lineno}", + f"The pipeline script {init_script_name} must pass the {t_arg_name} as string 
to '{arg_name}' function in line {dn_node.lineno}", # type: ignore[attr-defined] ) else: transformed_nodes.append((dn_node, ast.Constant(value=t_value, kind=None))) diff --git a/dlt/common/configuration/specs/aws_credentials.py b/dlt/common/configuration/specs/aws_credentials.py index 5f69be6a33..a75cd85225 100644 --- a/dlt/common/configuration/specs/aws_credentials.py +++ b/dlt/common/configuration/specs/aws_credentials.py @@ -8,6 +8,7 @@ CredentialsWithDefault, configspec, ) +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials, WithPyicebergConfig from dlt.common.configuration.specs.exceptions import ( InvalidBoto3Session, ObjectStoreRsCredentialsException, @@ -16,7 +17,9 @@ @configspec -class AwsCredentialsWithoutDefaults(CredentialsConfiguration): +class AwsCredentialsWithoutDefaults( + CredentialsConfiguration, WithObjectStoreRsCredentials, WithPyicebergConfig +): # credentials without boto implementation aws_access_key_id: str = None aws_secret_access_key: TSecretStrValue = None @@ -77,6 +80,16 @@ def to_object_store_rs_credentials(self) -> Dict[str, str]: return creds + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + return { + "s3.access-key-id": self.aws_access_key_id, + "s3.secret-access-key": self.aws_secret_access_key, + "s3.session-token": self.aws_session_token, + "s3.region": self.region_name, + "s3.endpoint": self.endpoint_url, + "s3.connect-timeout": 300, + } + @configspec class AwsCredentials(AwsCredentialsWithoutDefaults, CredentialsWithDefault): diff --git a/dlt/common/configuration/specs/azure_credentials.py b/dlt/common/configuration/specs/azure_credentials.py index cf6ec493de..aabd0b471a 100644 --- a/dlt/common/configuration/specs/azure_credentials.py +++ b/dlt/common/configuration/specs/azure_credentials.py @@ -8,6 +8,7 @@ CredentialsWithDefault, configspec, ) +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials, WithPyicebergConfig from dlt import version from dlt.common.utils import without_none @@ -15,7 +16,7 @@ @configspec -class AzureCredentialsBase(CredentialsConfiguration): +class AzureCredentialsBase(CredentialsConfiguration, WithObjectStoreRsCredentials): azure_storage_account_name: str = None azure_account_host: Optional[str] = None """Alternative host when accessing blob storage endpoint ie. 
my_account.dfs.core.windows.net""" @@ -32,7 +33,7 @@ def to_object_store_rs_credentials(self) -> Dict[str, str]: @configspec -class AzureCredentialsWithoutDefaults(AzureCredentialsBase): +class AzureCredentialsWithoutDefaults(AzureCredentialsBase, WithPyicebergConfig): """Credentials for Azure Blob Storage, compatible with adlfs""" azure_storage_account_key: Optional[TSecretStrValue] = None @@ -49,6 +50,13 @@ def to_adlfs_credentials(self) -> Dict[str, Any]: account_host=self.azure_account_host, ) + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + return { + "adlfs.account-name": self.azure_storage_account_name, + "adlfs.account-key": self.azure_storage_account_key, + "adlfs.sas-token": self.azure_storage_sas_token, + } + def create_sas_token(self) -> None: try: from azure.storage.blob import generate_account_sas, ResourceTypes @@ -72,7 +80,7 @@ def on_partial(self) -> None: @configspec -class AzureServicePrincipalCredentialsWithoutDefaults(AzureCredentialsBase): +class AzureServicePrincipalCredentialsWithoutDefaults(AzureCredentialsBase, WithPyicebergConfig): azure_tenant_id: str = None azure_client_id: str = None azure_client_secret: TSecretStrValue = None @@ -86,6 +94,14 @@ def to_adlfs_credentials(self) -> Dict[str, Any]: client_secret=self.azure_client_secret, ) + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + return { + "adlfs.account-name": self.azure_storage_account_name, + "adlfs.tenant-id": self.azure_tenant_id, + "adlfs.client-id": self.azure_client_id, + "adlfs.client-secret": self.azure_client_secret, + } + @configspec class AzureCredentials(AzureCredentialsWithoutDefaults, CredentialsWithDefault): diff --git a/dlt/common/configuration/specs/base_configuration.py b/dlt/common/configuration/specs/base_configuration.py index 8d913d0542..41d1d7a0ca 100644 --- a/dlt/common/configuration/specs/base_configuration.py +++ b/dlt/common/configuration/specs/base_configuration.py @@ -359,7 +359,7 @@ def _get_resolvable_dataclass_fields(cls) -> Iterator[TDtcField]: def get_resolvable_fields(cls) -> Dict[str, type]: """Returns a mapping of fields to their type hints. 
Dunders should not be resolved and are not returned""" return { - f.name: eval(f.type) if isinstance(f.type, str) else f.type # type: ignore[arg-type] + f.name: eval(f.type) if isinstance(f.type, str) else f.type for f in cls._get_resolvable_dataclass_fields() } diff --git a/dlt/common/configuration/specs/config_providers_context.py b/dlt/common/configuration/specs/config_providers_context.py index 5d1a5b7f26..a244ab571f 100644 --- a/dlt/common/configuration/specs/config_providers_context.py +++ b/dlt/common/configuration/specs/config_providers_context.py @@ -1,5 +1,4 @@ import contextlib -import dataclasses import io from typing import ClassVar, List @@ -8,10 +7,6 @@ ConfigProvider, ContextProvider, ) -from dlt.common.configuration.specs.base_configuration import ( - ContainerInjectableContext, - NotResolved, -) from dlt.common.configuration.specs import ( GcpServiceAccountCredentials, BaseConfiguration, @@ -137,7 +132,7 @@ def _airflow_providers() -> List[ConfigProvider]: # check if we are in task context and provide more info from airflow.operators.python import get_current_context # noqa - ti: TaskInstance = get_current_context()["ti"] # type: ignore + ti: TaskInstance = get_current_context()["ti"] # type: ignore[assignment,unused-ignore] # log outside of stderr/out redirect if secrets_toml_var is None: diff --git a/dlt/common/configuration/specs/exceptions.py b/dlt/common/configuration/specs/exceptions.py index 928e46a8a0..fe87ef24d7 100644 --- a/dlt/common/configuration/specs/exceptions.py +++ b/dlt/common/configuration/specs/exceptions.py @@ -72,3 +72,7 @@ def __init__(self, spec: Type[Any], native_value: Any): class ObjectStoreRsCredentialsException(ConfigurationException): pass + + +class UnsupportedAuthenticationMethodException(ConfigurationException): + pass diff --git a/dlt/common/configuration/specs/gcp_credentials.py b/dlt/common/configuration/specs/gcp_credentials.py index 60ab1d4b56..17519b032a 100644 --- a/dlt/common/configuration/specs/gcp_credentials.py +++ b/dlt/common/configuration/specs/gcp_credentials.py @@ -11,7 +11,9 @@ InvalidGoogleServicesJson, NativeValueError, OAuth2ScopesRequired, + UnsupportedAuthenticationMethodException, ) +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials, WithPyicebergConfig from dlt.common.exceptions import MissingDependencyException from dlt.common.typing import DictStrAny, TSecretStrValue, StrAny from dlt.common.configuration.specs.base_configuration import ( @@ -23,7 +25,7 @@ @configspec -class GcpCredentials(CredentialsConfiguration): +class GcpCredentials(CredentialsConfiguration, WithObjectStoreRsCredentials, WithPyicebergConfig): token_uri: Final[str] = dataclasses.field( default="https://oauth2.googleapis.com/token", init=False, repr=False, compare=False ) @@ -126,6 +128,12 @@ def to_native_credentials(self) -> Any: else: return ServiceAccountCredentials.from_service_account_info(self) + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + raise UnsupportedAuthenticationMethodException( + "Service Account authentication not supported with `iceberg` table format. Use OAuth" + " authentication instead." + ) + def __str__(self) -> str: return f"{self.client_email}@{self.project_id}" @@ -176,11 +184,19 @@ def to_native_representation(self) -> str: return json.dumps(self._info_dict()) def to_object_store_rs_credentials(self) -> Dict[str, str]: - raise NotImplementedError( - "`object_store` Rust crate does not support OAuth for GCP credentials. 
Reference:" - " https://docs.rs/object_store/latest/object_store/gcp." + raise UnsupportedAuthenticationMethodException( + "OAuth authentication not supported with `delta` table format. Use Service Account or" + " Application Default Credentials authentication instead." ) + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + self.auth() + return { + "gcs.project-id": self.project_id, + "gcs.oauth2.token": self.token, + "gcs.oauth2.token-expires-at": (pendulum.now().timestamp() + 60) * 1000, + } + def auth(self, scopes: Union[str, List[str]] = None, redirect_url: str = None) -> None: if not self.refresh_token: self.add_scopes(scopes) @@ -313,6 +329,12 @@ def to_native_credentials(self) -> Any: else: return super().to_native_credentials() + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + raise UnsupportedAuthenticationMethodException( + "Application Default Credentials authentication not supported with `iceberg` table" + " format. Use OAuth authentication instead." + ) + @configspec class GcpServiceAccountCredentials( @@ -334,3 +356,9 @@ def parse_native_representation(self, native_value: Any) -> None: except NativeValueError: pass GcpOAuthCredentialsWithoutDefaults.parse_native_representation(self, native_value) + + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + if self.has_default_credentials(): + return GcpDefaultCredentials.to_pyiceberg_fileio_config(self) + else: + return GcpOAuthCredentialsWithoutDefaults.to_pyiceberg_fileio_config(self) diff --git a/dlt/common/configuration/specs/mixins.py b/dlt/common/configuration/specs/mixins.py new file mode 100644 index 0000000000..2f843aee5b --- /dev/null +++ b/dlt/common/configuration/specs/mixins.py @@ -0,0 +1,24 @@ +from typing import Dict, Any +from abc import abstractmethod, ABC + + +class WithObjectStoreRsCredentials(ABC): + @abstractmethod + def to_object_store_rs_credentials(self) -> Dict[str, Any]: + """Returns credentials dictionary for object_store Rust crate. + + Can be used for libraries that build on top of the object_store crate, such as `deltalake`. + + https://docs.rs/object_store/latest/object_store/ + """ + pass + + +class WithPyicebergConfig(ABC): + @abstractmethod + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + """Returns `pyiceberg` FileIO configuration dictionary. 
+ + https://py.iceberg.apache.org/configuration/#fileio + """ + pass diff --git a/dlt/common/data_writers/buffered.py b/dlt/common/data_writers/buffered.py index e2b6c9a442..6ef431a4d0 100644 --- a/dlt/common/data_writers/buffered.py +++ b/dlt/common/data_writers/buffered.py @@ -242,7 +242,7 @@ def _flush_items(self, allow_empty_file: bool = False) -> None: if self.writer_spec.is_binary_format: self._file = self.open(self._file_name, "wb") # type: ignore else: - self._file = self.open(self._file_name, "wt", encoding="utf-8", newline="") # type: ignore + self._file = self.open(self._file_name, "wt", encoding="utf-8", newline="") self._writer = self.writer_cls(self._file, caps=self._caps) # type: ignore[assignment] self._writer.write_header(self._current_columns) # write buffer diff --git a/dlt/common/destination/utils.py b/dlt/common/destination/utils.py index 0bad5b152e..c98344b687 100644 --- a/dlt/common/destination/utils.py +++ b/dlt/common/destination/utils.py @@ -38,7 +38,7 @@ def verify_schema_capabilities( exception_log: List[Exception] = [] # combined casing function case_identifier = lambda ident: capabilities.casefold_identifier( - (str if capabilities.has_case_sensitive_identifiers else str.casefold)(ident) # type: ignore + (str if capabilities.has_case_sensitive_identifiers else str.casefold)(ident) ) table_name_lookup: DictStrStr = {} # name collision explanation diff --git a/dlt/common/libs/deltalake.py b/dlt/common/libs/deltalake.py index 4047bc3a1a..0f938e7102 100644 --- a/dlt/common/libs/deltalake.py +++ b/dlt/common/libs/deltalake.py @@ -10,6 +10,7 @@ from dlt.common.exceptions import MissingDependencyException from dlt.common.storages import FilesystemConfiguration from dlt.common.utils import assert_min_pkg_version +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials from dlt.destinations.impl.filesystem.filesystem import FilesystemClient try: @@ -191,10 +192,9 @@ def get_delta_tables( def _deltalake_storage_options(config: FilesystemConfiguration) -> Dict[str, str]: """Returns dict that can be passed as `storage_options` in `deltalake` library.""" - creds = {} # type: ignore + creds = {} extra_options = {} - # TODO: create a mixin with to_object_store_rs_credentials for a proper discovery - if hasattr(config.credentials, "to_object_store_rs_credentials"): + if isinstance(config.credentials, WithObjectStoreRsCredentials): creds = config.credentials.to_object_store_rs_credentials() if config.deltalake_storage_options is not None: extra_options = config.deltalake_storage_options diff --git a/dlt/common/libs/pyiceberg.py b/dlt/common/libs/pyiceberg.py new file mode 100644 index 0000000000..19ce9abbf2 --- /dev/null +++ b/dlt/common/libs/pyiceberg.py @@ -0,0 +1,192 @@ +from typing import Dict, Any, List, Optional + +from dlt import version, Pipeline +from dlt.common.libs.pyarrow import cast_arrow_schema_types +from dlt.common.schema.typing import TWriteDisposition +from dlt.common.utils import assert_min_pkg_version +from dlt.common.exceptions import MissingDependencyException +from dlt.common.storages.configuration import FileSystemCredentials +from dlt.common.configuration.specs import CredentialsConfiguration +from dlt.common.configuration.specs.mixins import WithPyicebergConfig +from dlt.destinations.impl.filesystem.filesystem import FilesystemClient + + +try: + from pyiceberg.table import Table as IcebergTable + from pyiceberg.catalog import MetastoreCatalog + import pyarrow as pa +except ModuleNotFoundError: + raise MissingDependencyException( 
+ "dlt pyiceberg helpers", + [f"{version.DLT_PKG_NAME}[pyiceberg]"], + "Install `pyiceberg` so dlt can create Iceberg tables in the `filesystem` destination.", + ) + + +def ensure_iceberg_compatible_arrow_schema(schema: pa.Schema) -> pa.Schema: + ARROW_TO_ICEBERG_COMPATIBLE_ARROW_TYPE_MAP = { + pa.types.is_time: pa.string(), + pa.types.is_decimal256: pa.string(), # pyarrow does not allow downcasting to decimal128 + } + return cast_arrow_schema_types(schema, ARROW_TO_ICEBERG_COMPATIBLE_ARROW_TYPE_MAP) + + +def ensure_iceberg_compatible_arrow_data(data: pa.Table) -> pa.Table: + schema = ensure_iceberg_compatible_arrow_schema(data.schema) + return data.cast(schema) + + +def write_iceberg_table( + table: IcebergTable, + data: pa.Table, + write_disposition: TWriteDisposition, +) -> None: + if write_disposition == "append": + table.append(ensure_iceberg_compatible_arrow_data(data)) + elif write_disposition == "replace": + table.overwrite(ensure_iceberg_compatible_arrow_data(data)) + + +def get_sql_catalog(credentials: FileSystemCredentials) -> "SqlCatalog": # type: ignore[name-defined] # noqa: F821 + assert_min_pkg_version( + pkg_name="sqlalchemy", + version="2.0.18", + msg=( + "`sqlalchemy>=2.0.18` is needed for `iceberg` table format on `filesystem` destination." + ), + ) + + from pyiceberg.catalog.sql import SqlCatalog + + return SqlCatalog( + "default", + uri="sqlite:///:memory:", + **_get_fileio_config(credentials), + ) + + +def create_or_evolve_table( + catalog: MetastoreCatalog, + client: FilesystemClient, + table_name: str, + namespace_name: Optional[str] = None, + schema: Optional[pa.Schema] = None, + partition_columns: Optional[List[str]] = None, +) -> MetastoreCatalog: + # add table to catalog + table_id = f"{namespace_name}.{table_name}" + table_path = f"{client.dataset_path}/{table_name}" + metadata_path = f"{table_path}/metadata" + if client.fs_client.exists(metadata_path): + # found metadata; register existing table + table = _register_table(table_id, metadata_path, catalog, client) + + # evolve schema + if schema is not None: + with table.update_schema() as update: + update.union_by_name(ensure_iceberg_compatible_arrow_schema(schema)) + else: + # found no metadata; create new table + assert schema is not None + with catalog.create_table_transaction( + table_id, + schema=ensure_iceberg_compatible_arrow_schema(schema), + location=_make_path(table_path, client), + ) as txn: + # add partitioning + with txn.update_spec() as update_spec: + for col in partition_columns: + update_spec.add_identity(col) + + return catalog + + +def get_catalog( + client: FilesystemClient, + table_name: str, + namespace_name: Optional[str] = None, + schema: Optional[pa.Schema] = None, + partition_columns: Optional[List[str]] = None, +) -> MetastoreCatalog: + """Returns single-table, ephemeral, in-memory Iceberg catalog.""" + + # create in-memory catalog + catalog: MetastoreCatalog = get_sql_catalog(client.config.credentials) + + # create namespace + if namespace_name is None: + namespace_name = client.dataset_name + catalog.create_namespace(namespace_name) + + # add table to catalog + catalog = create_or_evolve_table( + catalog=catalog, + client=client, + table_name=table_name, + namespace_name=namespace_name, + schema=schema, + partition_columns=partition_columns, + ) + + return catalog + + +def get_iceberg_tables( + pipeline: Pipeline, *tables: str, schema_name: Optional[str] = None +) -> Dict[str, IcebergTable]: + from dlt.common.schema.utils import get_table_format + + with 
pipeline.destination_client(schema_name=schema_name) as client: + assert isinstance( + client, FilesystemClient + ), "The `get_iceberg_tables` function requires a `filesystem` destination." + + schema_iceberg_tables = [ + t["name"] + for t in client.schema.tables.values() + if get_table_format(client.schema.tables, t["name"]) == "iceberg" + ] + if len(tables) > 0: + invalid_tables = set(tables) - set(schema_iceberg_tables) + if len(invalid_tables) > 0: + available_schemas = "" + if len(pipeline.schema_names) > 1: + available_schemas = f" Available schemas are {pipeline.schema_names}" + raise ValueError( + f"Schema {client.schema.name} does not contain Iceberg tables with these names:" + f" {', '.join(invalid_tables)}.{available_schemas}" + ) + schema_iceberg_tables = [t for t in schema_iceberg_tables if t in tables] + + return { + name: get_catalog(client, name).load_table(f"{pipeline.dataset_name}.{name}") + for name in schema_iceberg_tables + } + + +def _get_fileio_config(credentials: CredentialsConfiguration) -> Dict[str, Any]: + if isinstance(credentials, WithPyicebergConfig): + return credentials.to_pyiceberg_fileio_config() + return {} + + +def _get_last_metadata_file(metadata_path: str, client: FilesystemClient) -> str: + # TODO: implement faster way to obtain `last_metadata_file` (listing is slow) + metadata_files = [f for f in client.fs_client.ls(metadata_path) if f.endswith(".json")] + return _make_path(sorted(metadata_files)[-1], client) + + +def _register_table( + identifier: str, + metadata_path: str, + catalog: MetastoreCatalog, + client: FilesystemClient, +) -> IcebergTable: + last_metadata_file = _get_last_metadata_file(metadata_path, client) + return catalog.register_table(identifier, last_metadata_file) + + +def _make_path(path: str, client: FilesystemClient) -> str: + # don't use file protocol for local files because duckdb does not support it + # https://github.com/duckdb/duckdb/issues/13669 + return path if client.is_local_filesystem else client.config.make_url(path) diff --git a/dlt/common/logger.py b/dlt/common/logger.py index b163c15672..634e305805 100644 --- a/dlt/common/logger.py +++ b/dlt/common/logger.py @@ -47,7 +47,7 @@ def is_logging() -> bool: def log_level() -> str: if not LOGGER: raise RuntimeError("Logger not initialized") - return logging.getLevelName(LOGGER.level) # type: ignore + return logging.getLevelName(LOGGER.level) def is_json_logging(log_format: str) -> bool: diff --git a/dlt/common/metrics.py b/dlt/common/metrics.py index d6acf19d0d..2f9f574dd0 100644 --- a/dlt/common/metrics.py +++ b/dlt/common/metrics.py @@ -9,7 +9,7 @@ class DataWriterMetrics(NamedTuple): created: float last_modified: float - def __add__(self, other: Tuple[object, ...], /) -> Tuple[object, ...]: + def __add__(self, other: Tuple[object, ...], /) -> Tuple[object, ...]: # type: ignore[override] if isinstance(other, DataWriterMetrics): return DataWriterMetrics( self.file_path if self.file_path == other.file_path else "", diff --git a/dlt/common/reflection/utils.py b/dlt/common/reflection/utils.py index c612c5a4f1..27c7bd8758 100644 --- a/dlt/common/reflection/utils.py +++ b/dlt/common/reflection/utils.py @@ -90,24 +90,24 @@ def rewrite_python_script( last_line = -1 last_offset = -1 # sort transformed nodes by line and offset - for node, t_value in sorted(transformed_nodes, key=lambda n: (n[0].lineno, n[0].col_offset)): + for node, t_value in sorted(transformed_nodes, key=lambda n: (n[0].lineno, n[0].col_offset)): # type: ignore[attr-defined] # do we have a line changed - if 
last_line != node.lineno - 1: + if last_line != node.lineno - 1: # type: ignore[attr-defined] # add remainder from the previous line if last_offset >= 0: script_lines.append(source_script_lines[last_line][last_offset:]) # add all new lines from previous line to current - script_lines.extend(source_script_lines[last_line + 1 : node.lineno - 1]) + script_lines.extend(source_script_lines[last_line + 1 : node.lineno - 1]) # type: ignore[attr-defined] # add trailing characters until node in current line starts - script_lines.append(source_script_lines[node.lineno - 1][: node.col_offset]) + script_lines.append(source_script_lines[node.lineno - 1][: node.col_offset]) # type: ignore[attr-defined] elif last_offset >= 0: # no line change, add the characters from the end of previous node to the current - script_lines.append(source_script_lines[last_line][last_offset : node.col_offset]) + script_lines.append(source_script_lines[last_line][last_offset : node.col_offset]) # type: ignore[attr-defined] # replace node value script_lines.append(ast_unparse(t_value).strip()) - last_line = node.end_lineno - 1 - last_offset = node.end_col_offset + last_line = node.end_lineno - 1 # type: ignore[attr-defined] + last_offset = node.end_col_offset # type: ignore[attr-defined] # add all that was missing if last_offset >= 0: diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index d6031a08fa..276bbe9c09 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -525,7 +525,7 @@ def get_new_table_columns( Typically they come from the destination schema. Columns that are in `existing_columns` and not in `table_name` columns are ignored. Optionally includes incomplete columns (without data type)""" - casefold_f: Callable[[str], str] = str.casefold if not case_sensitive else str # type: ignore[assignment] + casefold_f: Callable[[str], str] = str.casefold if not case_sensitive else str casefold_existing = { casefold_f(col_name): col for col_name, col in existing_columns.items() } diff --git a/dlt/common/typing.py b/dlt/common/typing.py index a3364d1b07..8986d753f3 100644 --- a/dlt/common/typing.py +++ b/dlt/common/typing.py @@ -446,7 +446,7 @@ def get_generic_type_argument_from_instance( if cls_: orig_param_type = get_args(cls_)[0] if orig_param_type in (Any, CallableAny) and sample_value is not None: - orig_param_type = type(sample_value) + orig_param_type = type(sample_value) # type: ignore[assignment] return orig_param_type # type: ignore diff --git a/dlt/destinations/impl/filesystem/factory.py b/dlt/destinations/impl/filesystem/factory.py index 2463da58fa..906bd157e4 100644 --- a/dlt/destinations/impl/filesystem/factory.py +++ b/dlt/destinations/impl/filesystem/factory.py @@ -19,7 +19,7 @@ def filesystem_loader_file_format_selector( *, table_schema: TTableSchema, ) -> t.Tuple[TLoaderFileFormat, t.Sequence[TLoaderFileFormat]]: - if table_schema.get("table_format") == "delta": + if table_schema.get("table_format") in ("delta", "iceberg"): return ("parquet", ["parquet"]) return (preferred_loader_file_format, supported_loader_file_formats) @@ -43,7 +43,7 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps = DestinationCapabilitiesContext.generic_capabilities( preferred_loader_file_format="jsonl", loader_file_format_selector=filesystem_loader_file_format_selector, - supported_table_formats=["delta"], + supported_table_formats=["delta", "iceberg"], supported_merge_strategies=["upsert"], merge_strategies_selector=filesystem_merge_strategies_selector, ) diff --git 
a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 1739c87fb3..ccf764811b 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -119,16 +119,27 @@ def metrics(self) -> Optional[LoadJobMetrics]: return m._replace(remote_url=self.make_remote_url()) -class DeltaLoadFilesystemJob(FilesystemLoadJob): +class TableFormatLoadFilesystemJob(FilesystemLoadJob): def __init__(self, file_path: str) -> None: super().__init__(file_path=file_path) self.file_paths = ReferenceFollowupJobRequest.resolve_references(self._file_path) def make_remote_path(self) -> str: - # remote path is table dir - delta will create its file structure inside it return self._job_client.get_table_dir(self.load_table_name) + @property + def arrow_dataset(self) -> Any: + from dlt.common.libs.pyarrow import pyarrow + + return pyarrow.dataset.dataset(self.file_paths) + + @property + def _partition_columns(self) -> List[str]: + return get_columns_names_with_prop(self._load_table, "partition") + + +class DeltaLoadFilesystemJob(TableFormatLoadFilesystemJob): def run(self) -> None: # create Arrow dataset from Parquet files from dlt.common.libs.pyarrow import pyarrow as pa @@ -138,7 +149,7 @@ def run(self) -> None: f"Will copy file(s) {self.file_paths} to delta table {self.make_remote_url()} [arrow" f" buffer: {pa.total_allocated_bytes()}]" ) - source_ds = pa.dataset.dataset(self.file_paths) + source_ds = self.arrow_dataset delta_table = self._delta_table() # explicitly check if there is data @@ -148,9 +159,6 @@ def run(self) -> None: else: with source_ds.scanner().to_reader() as arrow_rbr: # RecordBatchReader if self._load_table["write_disposition"] == "merge" and delta_table is not None: - self._load_table["x-merge-strategy"] = resolve_merge_strategy( # type: ignore[typeddict-unknown-key] - self._schema.tables, self._load_table, self._job_client.capabilities - ) merge_delta_table( table=delta_table, data=arrow_rbr, @@ -188,10 +196,6 @@ def _delta_table(self) -> Optional["DeltaTable"]: # type: ignore[name-defined] else: return None - @property - def _partition_columns(self) -> List[str]: - return get_columns_names_with_prop(self._load_table, "partition") - def _create_or_evolve_delta_table(self, arrow_ds: "Dataset", delta_table: "DeltaTable") -> "DeltaTable": # type: ignore[name-defined] # noqa: F821 from dlt.common.libs.deltalake import ( DeltaTable, @@ -211,13 +215,36 @@ def _create_or_evolve_delta_table(self, arrow_ds: "Dataset", delta_table: "Delta return _evolve_delta_table_schema(delta_table, arrow_ds.schema) +class IcebergLoadFilesystemJob(TableFormatLoadFilesystemJob): + def run(self) -> None: + from dlt.common.libs.pyiceberg import write_iceberg_table + + write_iceberg_table( + table=self._iceberg_table(), + data=self.arrow_dataset.to_table(), + write_disposition=self._load_table["write_disposition"], + ) + + def _iceberg_table(self) -> "pyiceberg.table.Table": # type: ignore[name-defined] # noqa: F821 + from dlt.common.libs.pyiceberg import get_catalog + + catalog = get_catalog( + client=self._job_client, + table_name=self.load_table_name, + schema=self.arrow_dataset.schema, + partition_columns=self._partition_columns, + ) + return catalog.load_table(self.table_identifier) + + @property + def table_identifier(self) -> str: + return f"{self._job_client.dataset_name}.{self.load_table_name}" + + class FilesystemLoadJobWithFollowup(HasFollowupJobs, FilesystemLoadJob): def create_followup_jobs(self, final_state: 
TLoadJobState) -> List[FollowupJobRequest]: jobs = super().create_followup_jobs(final_state) - if self._load_table.get("table_format") == "delta": - # delta table jobs only require table chain followup jobs - pass - elif final_state == "completed": + if final_state == "completed": ref_job = ReferenceFollowupJobRequest( original_file_name=self.file_name(), remote_paths=[self._job_client.make_remote_url(self.make_remote_path())], @@ -394,6 +421,13 @@ def prepare_load_table(self, table_name: str) -> PreparedTableSchema: if table["write_disposition"] == "merge": table["write_disposition"] = "append" table.pop("table_format", None) + merge_strategy = resolve_merge_strategy(self.schema.tables, table, self.capabilities) + if table["write_disposition"] == "merge": + if merge_strategy is None: + # no supported merge strategies, fall back to append + table["write_disposition"] = "append" + else: + table["x-merge-strategy"] = merge_strategy # type: ignore[typeddict-unknown-key] return table def get_table_dir(self, table_name: str, remote: bool = False) -> str: @@ -458,12 +492,20 @@ def create_load_job( # where we want to load the state the regular way if table["name"] == self.schema.state_table_name and not self.config.as_staging_destination: return FinalizedLoadJob(file_path) - if table.get("table_format") == "delta": - import dlt.common.libs.deltalake # assert dependencies are installed + table_format = table.get("table_format") + if table_format in ("delta", "iceberg"): # a reference job for a delta table indicates a table chain followup job if ReferenceFollowupJobRequest.is_reference_job(file_path): - return DeltaLoadFilesystemJob(file_path) + if table_format == "delta": + import dlt.common.libs.deltalake + + return DeltaLoadFilesystemJob(file_path) + elif table_format == "iceberg": + import dlt.common.libs.pyiceberg + + return IcebergLoadFilesystemJob(file_path) + # otherwise just continue return FinalizedLoadJobWithFollowupJobs(file_path) @@ -494,10 +536,10 @@ def should_load_data_to_staging_dataset(self, table_name: str) -> bool: def should_truncate_table_before_load(self, table_name: str) -> bool: table = self.prepare_load_table(table_name) - return ( - table["write_disposition"] == "replace" - and not table.get("table_format") == "delta" # Delta can do a logical replace - ) + return table["write_disposition"] == "replace" and not table.get("table_format") in ( + "delta", + "iceberg", + ) # Delta/Iceberg can do a logical replace # # state stuff @@ -718,7 +760,7 @@ def create_table_chain_completed_followup_jobs( jobs = super().create_table_chain_completed_followup_jobs( table_chain, completed_table_chain_jobs ) - if table_chain[0].get("table_format") == "delta": + if table_chain[0].get("table_format") in ("delta", "iceberg"): for table in table_chain: table_job_paths = [ job.file_path diff --git a/dlt/destinations/impl/filesystem/sql_client.py b/dlt/destinations/impl/filesystem/sql_client.py index d03a00b418..d39f4c3431 100644 --- a/dlt/destinations/impl/filesystem/sql_client.py +++ b/dlt/destinations/impl/filesystem/sql_client.py @@ -13,6 +13,7 @@ from dlt.common.destination.reference import DBApiCursor +from dlt.common.storages.fsspec_filesystem import AZURE_BLOB_STORAGE_PROTOCOLS from dlt.destinations.sql_client import raise_database_error from dlt.destinations.impl.duckdb.sql_client import DuckDbSqlClient @@ -169,8 +170,9 @@ def create_authentication(self, persistent: bool = False, secret_name: str = Non # native google storage implementation is not supported.. 
elif self.fs_client.config.protocol in ["gs", "gcs"]: logger.warn( - "For gs/gcs access via duckdb please use the gs/gcs s3 compatibility layer. Falling" - " back to fsspec." + "For gs/gcs access via duckdb please use the gs/gcs s3 compatibility layer if" + " possible (not supported when using `iceberg` table format). Falling back to" + " fsspec." ) self._conn.register_filesystem(self.fs_client.fs_client) @@ -192,7 +194,7 @@ def open_connection(self) -> duckdb.DuckDBPyConnection: # the line below solves problems with certificate path lookup on linux # see duckdb docs - if self.fs_client.config.protocol in ["az", "abfss"]: + if self.fs_client.config.protocol in AZURE_BLOB_STORAGE_PROTOCOLS: self._conn.sql("SET azure_transport_option_type = 'curl';") return self._conn @@ -258,6 +260,13 @@ def create_views_for_tables(self, tables: Dict[str, str]) -> None: from_statement = "" if schema_table.get("table_format") == "delta": from_statement = f"delta_scan('{resolved_folder}')" + elif schema_table.get("table_format") == "iceberg": + from dlt.common.libs.pyiceberg import _get_last_metadata_file + + self._setup_iceberg(self._conn) + metadata_path = f"{resolved_folder}/metadata" + last_metadata_file = _get_last_metadata_file(metadata_path, self.fs_client) + from_statement = f"iceberg_scan('{last_metadata_file}')" elif first_file_type == "parquet": from_statement = f"read_parquet([{resolved_files_string}])" elif first_file_type == "jsonl": @@ -267,7 +276,7 @@ def create_views_for_tables(self, tables: Dict[str, str]) -> None: else: raise NotImplementedError( f"Unknown filetype {first_file_type} for table {table_name}. Currently only" - " jsonl and parquet files as well as delta tables are supported." + " jsonl and parquet files as well as delta and iceberg tables are supported." 
) # create table @@ -299,6 +308,16 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DB with super().execute_query(query, *args, **kwargs) as cursor: yield cursor + @staticmethod + def _setup_iceberg(conn: duckdb.DuckDBPyConnection) -> None: + # needed to make persistent secrets work in new connection + # https://github.com/duckdb/duckdb_iceberg/issues/83 + conn.execute("FROM duckdb_secrets();") + + # `duckdb_iceberg` extension does not support autoloading + # https://github.com/duckdb/duckdb_iceberg/issues/71 + conn.execute("INSTALL iceberg; LOAD iceberg;") + def __del__(self) -> None: if self.memory_db: self.memory_db.close() diff --git a/dlt/destinations/impl/sqlalchemy/factory.py b/dlt/destinations/impl/sqlalchemy/factory.py index edd827ed00..e61ac1fb6a 100644 --- a/dlt/destinations/impl/sqlalchemy/factory.py +++ b/dlt/destinations/impl/sqlalchemy/factory.py @@ -81,6 +81,9 @@ def adjust_capabilities( caps.max_column_identifier_length = dialect.max_identifier_length caps.supports_native_boolean = dialect.supports_native_boolean if dialect.name == "mysql": + # correct max identifier length + # dialect uses 255 (max length for aliases) instead of 64 (max length of identifiers) + caps.max_identifier_length = 64 caps.format_datetime_literal = _format_mysql_datetime_literal return caps diff --git a/dlt/extract/incremental/lag.py b/dlt/extract/incremental/lag.py index ee102a9961..dfafa2cd11 100644 --- a/dlt/extract/incremental/lag.py +++ b/dlt/extract/incremental/lag.py @@ -20,7 +20,7 @@ def _apply_lag_to_value( parsed_value = ensure_pendulum_datetime(value) if is_str else value if isinstance(parsed_value, (datetime, date)): - parsed_value = _apply_lag_to_datetime(lag, parsed_value, last_value_func, is_str_date) + parsed_value = _apply_lag_to_datetime(lag, parsed_value, last_value_func, is_str_date) # type: ignore[assignment] # go back to string or pass exact type value = parsed_value.strftime(value_format) if value_format else parsed_value # type: ignore[assignment] diff --git a/dlt/helpers/airflow_helper.py b/dlt/helpers/airflow_helper.py index 99458a3949..aaa19ea97d 100644 --- a/dlt/helpers/airflow_helper.py +++ b/dlt/helpers/airflow_helper.py @@ -18,7 +18,7 @@ from airflow.configuration import conf from airflow.models import TaskInstance from airflow.utils.task_group import TaskGroup - from airflow.operators.dummy import DummyOperator # type: ignore + from airflow.operators.dummy import DummyOperator from airflow.operators.python import PythonOperator, get_current_context except ModuleNotFoundError: raise MissingDependencyException("Airflow", ["apache-airflow>=2.5"]) @@ -255,7 +255,7 @@ def _run( # use task logger if self.use_task_logger: - ti: TaskInstance = get_current_context()["ti"] # type: ignore + ti: TaskInstance = get_current_context()["ti"] # type: ignore[assignment,unused-ignore] logger.LOGGER = ti.log # set global number of buffered items diff --git a/dlt/helpers/dbt/profiles.yml b/dlt/helpers/dbt/profiles.yml index a2a0014e4e..fd114478fb 100644 --- a/dlt/helpers/dbt/profiles.yml +++ b/dlt/helpers/dbt/profiles.yml @@ -83,6 +83,7 @@ duckdb: extensions: - httpfs - parquet + - iceberg # TODO: emit the config of duck db motherduck: diff --git a/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md b/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md new file mode 100644 index 0000000000..7a056d6b40 --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md @@ -0,0 +1,168 @@ +--- +title: Delta / Iceberg 
+description: Delta / Iceberg `dlt` destination +keywords: [delta, iceberg, destination, data warehouse] +--- + +# Delta and Iceberg table formats +`dlt` supports writing [Delta](https://delta.io/) and [Iceberg](https://iceberg.apache.org/) tables when using the [filesystem](./filesystem.md) destination. + +## How it works +`dlt` uses the [deltalake](https://pypi.org/project/deltalake/) and [pyiceberg](https://pypi.org/project/pyiceberg/) libraries to write Delta and Iceberg tables, respectively. One or multiple Parquet files are prepared during the extract and normalize steps. In the load step, these Parquet files are exposed as an Arrow data structure and fed into `deltalake` or `pyiceberg`. + +## Iceberg single-user ephemeral catalog +`dlt` uses single-table, ephemeral, in-memory, sqlite-based [Iceberg catalog](https://iceberg.apache.org/concepts/catalog/)s. These catalogs are created "on demand" when a pipeline is run, and do not persist afterwards. If a table already exists in the filesystem, it gets registered into the catalog using its latest metadata file. This allows for a serverless setup. It is currently not possible to connect your own Iceberg catalog. + +:::caution +While ephemeral catalogs make it easy to get started with Iceberg, they come with limitations: +- concurrent writes are not handled and may lead to corrupt table state +- we cannot guarantee that reads concurrent with writes are clean +- the latest manifest file needs to be searched for using file listing—this can become slow with large tables, especially in cloud object stores +::: + +## Delta dependencies + +You need the `deltalake` package to use this format: + +```sh +pip install "dlt[deltalake]" +``` + +You also need `pyarrow>=17.0.0`: + +```sh +pip install 'pyarrow>=17.0.0' +``` + +## Iceberg dependencies + +You need Python version 3.9 or higher and the `pyiceberg` package to use this format: + +```sh +pip install "dlt[pyiceberg]" +``` + +You also need `sqlalchemy>=2.0.18`: + +```sh +pip install 'sqlalchemy>=2.0.18' +``` + +## Set table format + +Set the `table_format` argument to `delta` or `iceberg` when defining your resource: + +```py +@dlt.resource(table_format="delta") +def my_delta_resource(): + ... +``` + +or when calling `run` on your pipeline: + +```py +pipeline.run(my_resource, table_format="delta") +``` + +:::note +`dlt` always uses Parquet as `loader_file_format` when using the `delta` or `iceberg` table format. Any setting of `loader_file_format` is disregarded. +::: + + +## Table format partitioning +Both `delta` and `iceberg` tables can be partitioned by specifying one or more `partition` column hints. This example partitions a Delta table by the `foo` column: + +```py +@dlt.resource( + table_format="delta", + columns={"foo": {"partition": True}} +) +def my_delta_resource(): + ... +``` + +:::note +Delta uses [Hive-style partitioning](https://delta.io/blog/pros-cons-hive-style-partionining/), while Iceberg uses [hidden partitioning](https://iceberg.apache.org/docs/latest/partitioning/). +::: + +:::caution +Partition evolution (changing partition columns after a table has been created) is not supported. +::: + +## Table access helper functions +You can use the `get_delta_tables` and `get_iceberg_tables` helper functions to access native table objects. For `delta` these are `deltalake` [DeltaTable](https://delta-io.github.io/delta-rs/api/delta_table/) objects, for `iceberg` these are `pyiceberg` [Table](https://py.iceberg.apache.org/reference/pyiceberg/table/#pyiceberg.table.Table) objects.
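+
+For `iceberg`, a minimal sketch of the same pattern might look like the following. It assumes `pipeline` is a pipeline that has already loaded an Iceberg table named `my_iceberg_table` (the table name is only illustrative); `Table.scan().to_arrow()` is standard `pyiceberg` API for reading a table into Arrow:
+
+```py
+from dlt.common.libs.pyiceberg import get_iceberg_tables
+
+# `pipeline` is assumed to be a dlt pipeline that has loaded `iceberg` tables
+# get dictionary of pyiceberg Table objects
+iceberg_tables = get_iceberg_tables(pipeline)
+
+# read one table into a pyarrow.Table (the table name is hypothetical)
+arrow_table = iceberg_tables["my_iceberg_table"].scan().to_arrow()
+print(arrow_table.num_rows)
+```
+
+The `delta` counterpart is shown below: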
+ +```py +from dlt.common.libs.deltalake import get_delta_tables +# from dlt.common.libs.pyiceberg import get_iceberg_tables + +... + +# get dictionary of DeltaTable objects +delta_tables = get_delta_tables(pipeline) + +# execute operations on DeltaTable objects +delta_tables["my_delta_table"].optimize.compact() +delta_tables["another_delta_table"].optimize.z_order(["col_a", "col_b"]) +# delta_tables["my_delta_table"].vacuum() +# etc. +``` + +## Table format Google Cloud Storage authentication + +Note that not all authentication methods are supported when using table formats on Google Cloud Storage: + +| Authentication method | `delta` | `iceberg` | +| -- | -- | -- | +| [Service Account](bigquery.md#setup-guide) | ✅ | ❌ | +| [OAuth](../destinations/bigquery.md#oauth-20-authentication) | ❌ | ✅ | +| [Application Default Credentials](bigquery.md#using-default-credentials) | ✅ | ❌ | + +:::note +The [S3-compatible](#using-s3-compatible-storage) interface for Google Cloud Storage is not supported when using `iceberg`. +::: + +## Iceberg Azure scheme +The `az` [scheme](#supported-schemes) is not supported when using the `iceberg` table format. Please use the `abfss` scheme. This is because `pyiceberg`, which `dlt` uses under the hood, currently does not support `az`. + +## Table format `merge` support (**experimental**) +The [`upsert`](../../general-usage/incremental-loading.md#upsert-strategy) merge strategy is supported for `delta`. For `iceberg`, the `merge` write disposition is not supported and falls back to `append`. + +:::caution +The `upsert` merge strategy for the filesystem destination with Delta table format is **experimental**. +::: + +```py +@dlt.resource( + write_disposition={"disposition": "merge", "strategy": "upsert"}, + primary_key="my_primary_key", + table_format="delta" +) +def my_upsert_resource(): + ... +... +``` + +### Known limitations +- `hard_delete` hint not supported +- Deleting records from nested tables not supported + - This means updates to JSON columns that involve element removals are not propagated. For example, if you first load `{"key": 1, "nested": [1, 2]}` and then load `{"key": 1, "nested": [1]}`, then the record for element `2` will not be deleted from the nested table. + +## Delta table format storage options +You can pass storage options by configuring `destination.filesystem.deltalake_storage_options`: + +```toml +[destination.filesystem] +deltalake_storage_options = '{"AWS_S3_LOCKING_PROVIDER": "dynamodb", "DELTA_DYNAMO_TABLE_NAME": "custom_table_name"}' +``` + +`dlt` passes these options to the `storage_options` argument of the `write_deltalake` method in the `deltalake` library. Look at their [documentation](https://delta-io.github.io/delta-rs/api/delta_writer/#deltalake.write_deltalake) to see which options can be used. + +You don't need to specify credentials here. `dlt` merges the required credentials with the options you provided before passing it as `storage_options`. + +>❗When using `s3`, you need to specify storage options to [configure](https://delta-io.github.io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/) locking behavior. + +## Delta table format memory usage +:::caution +Beware that when loading a large amount of data for one table, the underlying Rust implementation will consume a lot of memory. This is a known issue and the maintainers are actively working on a solution. You can track the progress [here](https://github.com/delta-io/delta-rs/pull/2289).
Until the issue is resolved, you can mitigate the memory consumption by doing multiple smaller incremental pipeline runs. +::: \ No newline at end of file diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index 9b243b9429..de3d12e8e1 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -108,7 +108,8 @@ You need to create an S3 bucket and a user who can access that bucket. dlt does #### Using S3 compatible storage -To use an S3 compatible storage other than AWS S3, such as [MinIO](https://min.io/) or [Cloudflare R2](https://www.cloudflare.com/en-ca/developer-platform/r2/), you may supply an `endpoint_url` in the config. This should be set along with AWS credentials: +To use an S3 compatible storage other than AWS S3, such as [MinIO](https://min.io/), [Cloudflare R2](https://www.cloudflare.com/en-ca/developer-platform/r2/) or [Google +Cloud Storage](https://cloud.google.com/storage/docs/interoperability), you may supply an `endpoint_url` in the config. This should be set along with AWS credentials: ```toml [destination.filesystem] @@ -166,6 +167,8 @@ Run `pip install "dlt[az]"` which will install the `adlfs` package to interface Edit the credentials in `.dlt/secrets.toml`, you'll see AWS credentials by default; replace them with your Azure credentials. +#### Supported schemes + `dlt` supports both forms of the blob storage urls: ```toml [destination.filesystem] @@ -404,29 +407,6 @@ The filesystem destination handles the write dispositions as follows: - `replace` - all files that belong to such tables are deleted from the dataset folder, and then the current set of files is added. - `merge` - falls back to `append` -### Merge with Delta table format (experimental) -The [`upsert`](../../general-usage/incremental-loading.md#upsert-strategy) merge strategy is supported when using the [Delta table format](#delta-table-format). - -:::caution -The `upsert` merge strategy for the filesystem destination with Delta table format is experimental. -::: - -```py -@dlt.resource( - write_disposition={"disposition": "merge", "strategy": "upsert"}, - primary_key="my_primary_key", - table_format="delta" -) -def my_upsert_resource(): - ... -... -``` - -#### Known limitations -- `hard_delete` hint not supported -- Deleting records from nested tables not supported - - This means updates to JSON columns that involve element removals are not propagated. For example, if you first load `{"key": 1, "nested": [1, 2]}` and then load `{"key": 1, "nested": [1]}`, then the record for element `2` will not be deleted from the nested table. - ## File compression The filesystem destination in the dlt library uses `gzip` compression by default for efficiency, which may result in the files being stored in a compressed format. This format may not be easily readable as plain text or JSON Lines (`jsonl`) files. If you encounter files that seem unreadable, they may be compressed. 
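
A quick way to check is to read the file through `gzip` directly. A minimal sketch using only the standard library (the file name is just an example of what the destination may produce):

```py
import gzip
import json

# files written by the filesystem destination can be gzip-compressed
# even though they keep the plain .jsonl extension
with gzip.open("items.1234.jsonl", "rt", encoding="utf-8") as f:  # hypothetical file name
    for line in f:
        print(json.loads(line))
```
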
@@ -645,88 +625,9 @@ You can choose the following file formats: ## Supported table formats -You can choose the following table formats: -* [Delta table](../table-formats/delta.md) is supported - -### Delta table format - -You need the `deltalake` package to use this format: - -```sh -pip install "dlt[deltalake]" -``` - -You also need `pyarrow>=17.0.0`: - -```sh -pip install 'pyarrow>=17.0.0' -``` - -Set the `table_format` argument to `delta` when defining your resource: - -```py -@dlt.resource(table_format="delta") -def my_delta_resource(): - ... -``` - -:::note -`dlt` always uses Parquet as `loader_file_format` when using the `delta` table format. Any setting of `loader_file_format` is disregarded. -::: - -:::caution -Beware that when loading a large amount of data for one table, the underlying rust implementation will consume a lot of memory. This is a known issue and the maintainers are actively working on a solution. You can track the progress [here](https://github.com/delta-io/delta-rs/pull/2289). Until the issue is resolved, you can mitigate the memory consumption by doing multiple smaller incremental pipeline runs. -::: - -#### Delta table partitioning -A Delta table can be partitioned ([Hive-style partitioning](https://delta.io/blog/pros-cons-hive-style-partionining/)) by specifying one or more `partition` column hints. This example partitions the Delta table by the `foo` column: - -```py -@dlt.resource( - table_format="delta", - columns={"foo": {"partition": True}} -) -def my_delta_resource(): - ... -``` - -:::caution -It is **not** possible to change partition columns after the Delta table has been created. Trying to do so causes an error stating that the partition columns don't match. -::: - - -#### Storage options -You can pass storage options by configuring `destination.filesystem.deltalake_storage_options`: - -```toml -[destination.filesystem] -deltalake_storage_options = '{"AWS_S3_LOCKING_PROVIDER": "dynamodb", "DELTA_DYNAMO_TABLE_NAME": "custom_table_name"}' -``` - -`dlt` passes these options to the `storage_options` argument of the `write_deltalake` method in the `deltalake` library. Look at their [documentation](https://delta-io.github.io/delta-rs/api/delta_writer/#deltalake.write_deltalake) to see which options can be used. - -You don't need to specify credentials here. `dlt` merges the required credentials with the options you provided before passing it as `storage_options`. - ->❗When using `s3`, you need to specify storage options to [configure](https://delta-io.github.io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/) locking behavior. - -#### `get_delta_tables` helper -You can use the `get_delta_tables` helper function to get `deltalake` [DeltaTable](https://delta-io.github.io/delta-rs/api/delta_table/) objects for your Delta tables: - -```py -from dlt.common.libs.deltalake import get_delta_tables - -... - -# get dictionary of DeltaTable objects -delta_tables = get_delta_tables(pipeline) - -# execute operations on DeltaTable objects -delta_tables["my_delta_table"].optimize.compact() -delta_tables["another_delta_table"].optimize.z_order(["col_a", "col_b"]) -# delta_tables["my_delta_table"].vacuum() -# etc. - -``` +You can choose the following [table formats](./delta-iceberg.md): +* Delta table +* Iceberg ## Syncing of dlt state This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). 
To this end, special folders and files will be created at your destination which hold information about your pipeline state, schemas, and completed loads. These folders DO NOT respect your settings in the layout section. When using filesystem as a staging destination, not all of these folders are created, as the state and schemas are managed in the regular way by the final destination you have configured. diff --git a/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md b/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md index 233ae0ce21..edca521e52 100644 --- a/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md +++ b/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md @@ -10,5 +10,5 @@ keywords: [iceberg, table formats] ## Supported destinations -Supported by: **Athena** +Supported by: **Athena**, **filesystem** diff --git a/docs/website/docs/general-usage/dataset-access/ibis-backend.md b/docs/website/docs/general-usage/dataset-access/ibis-backend.md index 8f4b0fb6b6..9f9b65e9c0 100644 --- a/docs/website/docs/general-usage/dataset-access/ibis-backend.md +++ b/docs/website/docs/general-usage/dataset-access/ibis-backend.md @@ -6,7 +6,7 @@ keywords: [data, dataset, ibis] # Ibis -Ibis is a powerful portable Python dataframe library. Learn more about what it is and how to use it in the [official documentation](https://ibis-project.org/). +Ibis is a powerful portable Python dataframe library. Learn more about what it is and how to use it in the [official documentation](https://ibis-project.org/). `dlt` provides an easy way to hand over your loaded dataset to an Ibis backend connection. @@ -46,4 +46,3 @@ print(table.limit(10).execute()) # Visit the ibis docs to learn more about the available methods ``` - diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 274f3e82b3..8e8c11fc09 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -167,6 +167,7 @@ const sidebars = { 'dlt-ecosystem/destinations/synapse', 'dlt-ecosystem/destinations/clickhouse', 'dlt-ecosystem/destinations/filesystem', + 'dlt-ecosystem/destinations/delta-iceberg', 'dlt-ecosystem/destinations/postgres', 'dlt-ecosystem/destinations/redshift', 'dlt-ecosystem/destinations/snowflake', diff --git a/mypy.ini b/mypy.ini index 769e84b13a..fdf0ceb1e6 100644 --- a/mypy.ini +++ b/mypy.ini @@ -135,3 +135,9 @@ ignore_missing_imports = True [mypy-time_machine.*] ignore_missing_imports = True + +[mypy-pyiceberg.*] +ignore_missing_imports = True + +[mypy-airflow.*] +ignore_missing_imports = True diff --git a/poetry.lock b/poetry.lock index e5d2eafdd9..25dbf5a3b2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1543,13 +1543,13 @@ files = [ [[package]] name = "cachetools" -version = "5.3.1" +version = "5.5.0" description = "Extensible memoizing collections and decorators" optional = false python-versions = ">=3.7" files = [ - {file = "cachetools-5.3.1-py3-none-any.whl", hash = "sha256:95ef631eeaea14ba2e36f06437f36463aac3a096799e876ee55e5cdccb102590"}, - {file = "cachetools-5.3.1.tar.gz", hash = "sha256:dce83f2d9b4e1f732a8cd44af8e8fab2dbe46201467fc98b3ef8f269092bf62b"}, + {file = "cachetools-5.5.0-py3-none-any.whl", hash = "sha256:02134e8439cdc2ffb62023ce1debca2944c3f289d66bb17ead3ab3dede74b292"}, + {file = "cachetools-5.5.0.tar.gz", hash = "sha256:2cc24fb4cbe39633fb7badd9db9ca6295d766d9c2995f245725a46715d050f2a"}, ] [[package]] @@ -5892,44 +5892,49 @@ files = [ [[package]] name = "mypy" -version = "1.10.0" +version = "1.12.1" description = "Optional static typing for Python" optional = false 
python-versions = ">=3.8" files = [ - {file = "mypy-1.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:da1cbf08fb3b851ab3b9523a884c232774008267b1f83371ace57f412fe308c2"}, - {file = "mypy-1.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:12b6bfc1b1a66095ab413160a6e520e1dc076a28f3e22f7fb25ba3b000b4ef99"}, - {file = "mypy-1.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e36fb078cce9904c7989b9693e41cb9711e0600139ce3970c6ef814b6ebc2b2"}, - {file = "mypy-1.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2b0695d605ddcd3eb2f736cd8b4e388288c21e7de85001e9f85df9187f2b50f9"}, - {file = "mypy-1.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:cd777b780312ddb135bceb9bc8722a73ec95e042f911cc279e2ec3c667076051"}, - {file = "mypy-1.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3be66771aa5c97602f382230165b856c231d1277c511c9a8dd058be4784472e1"}, - {file = "mypy-1.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8b2cbaca148d0754a54d44121b5825ae71868c7592a53b7292eeb0f3fdae95ee"}, - {file = "mypy-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ec404a7cbe9fc0e92cb0e67f55ce0c025014e26d33e54d9e506a0f2d07fe5de"}, - {file = "mypy-1.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e22e1527dc3d4aa94311d246b59e47f6455b8729f4968765ac1eacf9a4760bc7"}, - {file = "mypy-1.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:a87dbfa85971e8d59c9cc1fcf534efe664d8949e4c0b6b44e8ca548e746a8d53"}, - {file = "mypy-1.10.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a781f6ad4bab20eef8b65174a57e5203f4be627b46291f4589879bf4e257b97b"}, - {file = "mypy-1.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b808e12113505b97d9023b0b5e0c0705a90571c6feefc6f215c1df9381256e30"}, - {file = "mypy-1.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f55583b12156c399dce2df7d16f8a5095291354f1e839c252ec6c0611e86e2e"}, - {file = "mypy-1.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4cf18f9d0efa1b16478c4c129eabec36148032575391095f73cae2e722fcf9d5"}, - {file = "mypy-1.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:bc6ac273b23c6b82da3bb25f4136c4fd42665f17f2cd850771cb600bdd2ebeda"}, - {file = "mypy-1.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9fd50226364cd2737351c79807775136b0abe084433b55b2e29181a4c3c878c0"}, - {file = "mypy-1.10.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f90cff89eea89273727d8783fef5d4a934be2fdca11b47def50cf5d311aff727"}, - {file = "mypy-1.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fcfc70599efde5c67862a07a1aaf50e55bce629ace26bb19dc17cece5dd31ca4"}, - {file = "mypy-1.10.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:075cbf81f3e134eadaf247de187bd604748171d6b79736fa9b6c9685b4083061"}, - {file = "mypy-1.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:3f298531bca95ff615b6e9f2fc0333aae27fa48052903a0ac90215021cdcfa4f"}, - {file = "mypy-1.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fa7ef5244615a2523b56c034becde4e9e3f9b034854c93639adb667ec9ec2976"}, - {file = "mypy-1.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3236a4c8f535a0631f85f5fcdffba71c7feeef76a6002fcba7c1a8e57c8be1ec"}, - {file = "mypy-1.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a2b5cdbb5dd35aa08ea9114436e0d79aceb2f38e32c21684dcf8e24e1e92821"}, - {file = "mypy-1.10.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:92f93b21c0fe73dc00abf91022234c79d793318b8a96faac147cd579c1671746"}, - {file = 
"mypy-1.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:28d0e038361b45f099cc086d9dd99c15ff14d0188f44ac883010e172ce86c38a"}, - {file = "mypy-1.10.0-py3-none-any.whl", hash = "sha256:f8c083976eb530019175aabadb60921e73b4f45736760826aa1689dda8208aee"}, - {file = "mypy-1.10.0.tar.gz", hash = "sha256:3d087fcbec056c4ee34974da493a826ce316947485cef3901f511848e687c131"}, + {file = "mypy-1.12.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3d7d4371829184e22fda4015278fbfdef0327a4b955a483012bd2d423a788801"}, + {file = "mypy-1.12.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f59f1dfbf497d473201356966e353ef09d4daec48caeacc0254db8ef633a28a5"}, + {file = "mypy-1.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b947097fae68004b8328c55161ac9db7d3566abfef72d9d41b47a021c2fba6b1"}, + {file = "mypy-1.12.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:96af62050971c5241afb4701c15189ea9507db89ad07794a4ee7b4e092dc0627"}, + {file = "mypy-1.12.1-cp310-cp310-win_amd64.whl", hash = "sha256:d90da248f4c2dba6c44ddcfea94bb361e491962f05f41990ff24dbd09969ce20"}, + {file = "mypy-1.12.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1230048fec1380faf240be6385e709c8570604d2d27ec6ca7e573e3bc09c3735"}, + {file = "mypy-1.12.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:02dcfe270c6ea13338210908f8cadc8d31af0f04cee8ca996438fe6a97b4ec66"}, + {file = "mypy-1.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a5a437c9102a6a252d9e3a63edc191a3aed5f2fcb786d614722ee3f4472e33f6"}, + {file = "mypy-1.12.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:186e0c8346efc027ee1f9acf5ca734425fc4f7dc2b60144f0fbe27cc19dc7931"}, + {file = "mypy-1.12.1-cp311-cp311-win_amd64.whl", hash = "sha256:673ba1140a478b50e6d265c03391702fa11a5c5aff3f54d69a62a48da32cb811"}, + {file = "mypy-1.12.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9fb83a7be97c498176fb7486cafbb81decccaef1ac339d837c377b0ce3743a7f"}, + {file = "mypy-1.12.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:389e307e333879c571029d5b93932cf838b811d3f5395ed1ad05086b52148fb0"}, + {file = "mypy-1.12.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:94b2048a95a21f7a9ebc9fbd075a4fcd310410d078aa0228dbbad7f71335e042"}, + {file = "mypy-1.12.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ee5932370ccf7ebf83f79d1c157a5929d7ea36313027b0d70a488493dc1b179"}, + {file = "mypy-1.12.1-cp312-cp312-win_amd64.whl", hash = "sha256:19bf51f87a295e7ab2894f1d8167622b063492d754e69c3c2fed6563268cb42a"}, + {file = "mypy-1.12.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d34167d43613ffb1d6c6cdc0cc043bb106cac0aa5d6a4171f77ab92a3c758bcc"}, + {file = "mypy-1.12.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:427878aa54f2e2c5d8db31fa9010c599ed9f994b3b49e64ae9cd9990c40bd635"}, + {file = "mypy-1.12.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5fcde63ea2c9f69d6be859a1e6dd35955e87fa81de95bc240143cf00de1f7f81"}, + {file = "mypy-1.12.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d54d840f6c052929f4a3d2aab2066af0f45a020b085fe0e40d4583db52aab4e4"}, + {file = "mypy-1.12.1-cp313-cp313-win_amd64.whl", hash = "sha256:20db6eb1ca3d1de8ece00033b12f793f1ea9da767334b7e8c626a4872090cf02"}, + {file = "mypy-1.12.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b16fe09f9c741d85a2e3b14a5257a27a4f4886c171d562bc5a5e90d8591906b8"}, + {file = 
"mypy-1.12.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0dcc1e843d58f444fce19da4cce5bd35c282d4bde232acdeca8279523087088a"}, + {file = "mypy-1.12.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e10ba7de5c616e44ad21005fa13450cd0de7caaa303a626147d45307492e4f2d"}, + {file = "mypy-1.12.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:0e6fe449223fa59fbee351db32283838a8fee8059e0028e9e6494a03802b4004"}, + {file = "mypy-1.12.1-cp38-cp38-win_amd64.whl", hash = "sha256:dc6e2a2195a290a7fd5bac3e60b586d77fc88e986eba7feced8b778c373f9afe"}, + {file = "mypy-1.12.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:de5b2a8988b4e1269a98beaf0e7cc71b510d050dce80c343b53b4955fff45f19"}, + {file = "mypy-1.12.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:843826966f1d65925e8b50d2b483065c51fc16dc5d72647e0236aae51dc8d77e"}, + {file = "mypy-1.12.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9fe20f89da41a95e14c34b1ddb09c80262edcc295ad891f22cc4b60013e8f78d"}, + {file = "mypy-1.12.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8135ffec02121a75f75dc97c81af7c14aa4ae0dda277132cfcd6abcd21551bfd"}, + {file = "mypy-1.12.1-cp39-cp39-win_amd64.whl", hash = "sha256:a7b76fa83260824300cc4834a3ab93180db19876bce59af921467fd03e692810"}, + {file = "mypy-1.12.1-py3-none-any.whl", hash = "sha256:ce561a09e3bb9863ab77edf29ae3a50e65685ad74bba1431278185b7e5d5486e"}, + {file = "mypy-1.12.1.tar.gz", hash = "sha256:f5b3936f7a6d0e8280c9bdef94c7ce4847f5cdfc258fbb2c29a8c1711e8bb96d"}, ] [package.dependencies] mypy-extensions = ">=1.0.0" tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typing-extensions = ">=4.1.0" +typing-extensions = ">=4.6.0" [package.extras] dmypy = ["psutil (>=4.0)"] @@ -7541,6 +7546,74 @@ files = [ [package.extras] plugins = ["importlib-metadata"] +[[package]] +name = "pyiceberg" +version = "0.8.1" +description = "Apache Iceberg is an open table format for huge analytic datasets" +optional = true +python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,!=3.8.*,>=3.9" +files = [ + {file = "pyiceberg-0.8.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7c121d1d3baf64510db94740ad870ae4b6eb9eb59a5ff7ecb4e96f7510666b2f"}, + {file = "pyiceberg-0.8.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2a6f14aa588a3883fc7fddc136ca75b75660b4abb0b55b4c541619953f8971e7"}, + {file = "pyiceberg-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c720c2a191ac6faf01fe4c0f4c01c64b94bf064185b0292003d42939049277c"}, + {file = "pyiceberg-0.8.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d421d6e51ac1c581cba9fce96aa6b9118cf4a02270066a7fdc9490ab5d57ece9"}, + {file = "pyiceberg-0.8.1-cp310-cp310-win_amd64.whl", hash = "sha256:ae11fb0515ea0a046370e09a7f6039a7e86622ab910360eaa732f0106b8f00c7"}, + {file = "pyiceberg-0.8.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9488954c9eb5ce42ca6b816fc61873f219414cfdb9e9928d1c4a302702be1d89"}, + {file = "pyiceberg-0.8.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:44179e0fb844887b440c162279ba526dfe0e0f72d32945236528838518b55af0"}, + {file = "pyiceberg-0.8.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e121c6f5505d8ec711a1dd1690e07156cd54fb3d0844d5d991e02f1593f2708"}, + {file = "pyiceberg-0.8.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:5961a288f2d4bbb2ab300c803da1bf0e70cea837e3f14b14108827cc821af252"}, + {file = 
"pyiceberg-0.8.1-cp311-cp311-win_amd64.whl", hash = "sha256:dbe192324a6fb552c2fd29cab51086e21fa248ea2a0b95fbab921dede49e5a69"}, + {file = "pyiceberg-0.8.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:60430f0d8f6d650ed7d1893d038b847565a8e9ac135a1cc812e57d24f0482f6c"}, + {file = "pyiceberg-0.8.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f0f697977dac672d8b00e125836423585a97ebf59a28b865b1296a2b6ee81c51"}, + {file = "pyiceberg-0.8.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:370de7c230970ff858f713d150164d492ba8450e771e59a0c520520b13ea6226"}, + {file = "pyiceberg-0.8.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3036ed226020d50e30648a71f968cf78bde5d6b609294508e60754e100e5ef36"}, + {file = "pyiceberg-0.8.1-cp312-cp312-win_amd64.whl", hash = "sha256:9ac9555f3bd25a31059229089ae639cf738a8e8286a175cea128561ac1ed9452"}, + {file = "pyiceberg-0.8.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:51da3a553d3a881042bf436e66a91cc2b6c4a3fea0e174cd73af2eb6ed255323"}, + {file = "pyiceberg-0.8.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:863f1dce7340e6ed870706a3fa4a73457178dae8529725bb80522ddcd4253afb"}, + {file = "pyiceberg-0.8.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4dbf52b39080a6a2cda6a5126a74e3a88d5b206f609c128d001a728b36b81075"}, + {file = "pyiceberg-0.8.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:eb77d65e8efbb883c163817e4a9c373d907110ab6343c1b816b48f336955d4d7"}, + {file = "pyiceberg-0.8.1-cp39-cp39-win_amd64.whl", hash = "sha256:1fcd35b7de0eddc3fd8fd0c38b98741217ef6de4eeb0e72b798b4007692aa76c"}, + {file = "pyiceberg-0.8.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:6f0f56f8fc61bcd795f6a3d03e8ce6bee09ebaa64425eb08327e975f906d98be"}, + {file = "pyiceberg-0.8.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d7099c6631743ad29c451de2bebd9ed3c96c42bcb1fe5d5d5c93aec895858e3f"}, + {file = "pyiceberg-0.8.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d6436f5a782491115f64131882a737d77c9dc0040493e1b7f9b3081ea8cf6a26"}, + {file = "pyiceberg-0.8.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c1d75b40a98a327f7436eb0d6187c51834c44b79adf61c6945b33645f4afbf17"}, + {file = "pyiceberg-0.8.1-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:8de988fa2363e6a51b40b85b5ff1e8261cda5bfc14ac54dd4ebe58391b95acae"}, + {file = "pyiceberg-0.8.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:dd06c5b606011155aa0b76e7b001e30f1c40ab2fb3eeb8a0652b88629259c2bb"}, + {file = "pyiceberg-0.8.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8142f0dbc12dda0e6d7aaf564a3fbb0f17fc934630e7cf866773c8caaebf666"}, + {file = "pyiceberg-0.8.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:6126ee3a46ff975f15abf2085f184591d21643bffb96330907e003eea0b63005"}, + {file = "pyiceberg-0.8.1.tar.gz", hash = "sha256:4502f0cfddf6f7cd48b9cd54016bce0ab94052b0ab01efcfa515879074f4c8e3"}, +] + +[package.dependencies] +cachetools = ">=5.5.0,<6.0.0" +click = ">=7.1.1,<9.0.0" +fsspec = ">=2023.1.0" +mmh3 = ">=4.0.0,<6.0.0" +pydantic = ">=2.0,<2.4.0 || >2.4.0,<2.4.1 || >2.4.1,<3.0" +pyparsing = ">=3.1.0,<4.0.0" +requests = ">=2.20.0,<3.0.0" +rich = ">=10.11.0,<14.0.0" +sortedcontainers = "2.4.0" +strictyaml = ">=1.7.0,<2.0.0" +tenacity = ">=8.2.3,<10.0.0" + +[package.extras] +adlfs = ["adlfs (>=2023.1.0)"] +daft = ["getdaft (>=0.2.12)"] +duckdb = ["duckdb (>=0.5.0,<2.0.0)", "pyarrow (>=14.0.0,<19.0.0)"] +dynamodb = ["boto3 (>=1.24.59)"] 
+gcsfs = ["gcsfs (>=2023.1.0)"] +glue = ["boto3 (>=1.24.59)", "mypy-boto3-glue (>=1.28.18)"] +hive = ["thrift (>=0.13.0,<1.0.0)"] +pandas = ["pandas (>=1.0.0,<3.0.0)", "pyarrow (>=14.0.0,<19.0.0)"] +pyarrow = ["pyarrow (>=14.0.0,<19.0.0)"] +ray = ["pandas (>=1.0.0,<3.0.0)", "pyarrow (>=14.0.0,<19.0.0)", "ray (==2.10.0)", "ray (>=2.10.0,<3.0.0)"] +s3fs = ["s3fs (>=2023.1.0)"] +snappy = ["python-snappy (>=0.6.0,<1.0.0)"] +sql-postgres = ["psycopg2-binary (>=2.9.6)", "sqlalchemy (>=2.0.18,<3.0.0)"] +sql-sqlite = ["sqlalchemy (>=2.0.18,<3.0.0)"] +zstandard = ["zstandard (>=0.13.0,<1.0.0)"] + [[package]] name = "pyjwt" version = "2.8.0" @@ -9347,6 +9420,20 @@ files = [ [package.dependencies] pbr = ">=2.0.0,<2.1.0 || >2.1.0" +[[package]] +name = "strictyaml" +version = "1.7.3" +description = "Strict, typed YAML parser" +optional = true +python-versions = ">=3.7.0" +files = [ + {file = "strictyaml-1.7.3-py3-none-any.whl", hash = "sha256:fb5c8a4edb43bebb765959e420f9b3978d7f1af88c80606c03fb420888f5d1c7"}, + {file = "strictyaml-1.7.3.tar.gz", hash = "sha256:22f854a5fcab42b5ddba8030a0e4be51ca89af0267961c8d6cfa86395586c407"}, +] + +[package.dependencies] +python-dateutil = ">=2.6.0" + [[package]] name = "sympy" version = "1.12" @@ -10626,6 +10713,7 @@ mssql = ["pyodbc"] parquet = ["pyarrow"] postgis = ["psycopg2-binary", "psycopg2cffi"] postgres = ["psycopg2-binary", "psycopg2cffi"] +pyiceberg = ["pyarrow", "pyiceberg", "sqlalchemy"] qdrant = ["qdrant-client"] redshift = ["psycopg2-binary", "psycopg2cffi"] s3 = ["botocore", "s3fs"] @@ -10639,4 +10727,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "a7cd6b599326d80b5beb8d4a3d3e3b4074eda6dc53daa5c296ef8d54002c5f78" +content-hash = "84e8b8eccd9b8ee104a2dc08f5b83987aeb06540d61330390ce849cc1ad6acb4" diff --git a/pyproject.toml b/pyproject.toml index 4e10e96b93..4de92014a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,7 +75,7 @@ cron-descriptor = {version = ">=1.2.32", optional = true} pipdeptree = {version = ">=2.9.0,<2.10", optional = true} pyathena = {version = ">=2.9.6", optional = true} weaviate-client = {version = ">=3.22", optional = true} -adlfs = {version = ">=2022.4.0", optional = true} +adlfs = {version = ">=2024.7.0", optional = true} pyodbc = {version = ">=4.0.39", optional = true} qdrant-client = {version = ">=1.8", optional = true, extras = ["fastembed"]} databricks-sql-connector = {version = ">=2.9.3", optional = true} @@ -91,6 +91,12 @@ sqlglot = {version = ">=20.0.0", optional = true} db-dtypes = { version = ">=1.2.0", optional = true } aiohttp = { version = ">=3.9", optional = true } databricks-sdk = {version = ">=0.38.0", optional = true} +# `sql-sqlite` extra leads to dependency conflict with `apache-airflow` because `apache-airflow` +# requires `sqlalchemy<2.0.0` while the extra requires `sqlalchemy>=2.0.18` +# https://github.com/apache/airflow/issues/28723 +# pyiceberg = { version = ">=0.7.1", optional = true, extras = ["sql-sqlite"] } +# we will rely on manual installation of `sqlalchemy>=2.0.18` instead +pyiceberg = { version = ">=0.8.1", python = ">=3.9", optional = true } [tool.poetry.extras] gcp = ["grpcio", "google-cloud-bigquery", "db-dtypes", "gcsfs"] @@ -120,6 +126,7 @@ lancedb = ["lancedb", "pyarrow", "tantivy"] deltalake = ["deltalake", "pyarrow"] sql_database = ["sqlalchemy"] sqlalchemy = ["sqlalchemy", "alembic"] +pyiceberg = ["pyiceberg", "pyarrow", "sqlalchemy"] postgis = ["psycopg2-binary", "psycopg2cffi"] [tool.poetry.scripts] @@ -136,7 
+143,7 @@ sqlfluff = "^2.3.2" types-deprecated = "^1.2.9.2" pytest-console-scripts = "^1.4.1" pytest = "^7.0.0" -mypy = "^1.10.0" +mypy = ">=1.11.0,<1.13.0" flake8 = "^5.0.0" bandit = "^1.7.0" black = "^23.7.0" diff --git a/tests/conftest.py b/tests/conftest.py index 6088fa976c..a5a349f8d9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -120,6 +120,9 @@ def _create_pipeline_instance_id(self) -> str: # disable googleapiclient logging logging.getLogger("googleapiclient.discovery_cache").setLevel("WARNING") + # disable pyiceberg logging + logging.getLogger("pyiceberg").setLevel("WARNING") + # reset and init airflow db import warnings diff --git a/tests/libs/test_csv_writer.py b/tests/libs/test_csv_writer.py index 3c30123e1c..a120cd048e 100644 --- a/tests/libs/test_csv_writer.py +++ b/tests/libs/test_csv_writer.py @@ -178,7 +178,7 @@ def test_non_utf8_binary(item_type: TestDataItemFormat) -> None: table = pq.read_table(f) else: table = data - writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter # type: ignore + writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter with pytest.raises(InvalidDataItem) as inv_ex: with get_writer(writer_type, disable_compression=True) as writer: @@ -195,7 +195,7 @@ def test_arrow_struct() -> None: @pytest.mark.parametrize("item_type", ["object", "arrow-table"]) def test_csv_writer_empty(item_type: TestDataItemFormat) -> None: - writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter # type: ignore + writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter with get_writer(writer_type, disable_compression=True) as writer: writer.write_empty_file(TABLE_UPDATE_COLUMNS_SCHEMA) diff --git a/tests/load/filesystem/test_object_store_rs_credentials.py b/tests/load/filesystem/test_credentials_mixins.py similarity index 50% rename from tests/load/filesystem/test_object_store_rs_credentials.py rename to tests/load/filesystem/test_credentials_mixins.py index f23187a269..c1fb02c152 100644 --- a/tests/load/filesystem/test_object_store_rs_credentials.py +++ b/tests/load/filesystem/test_credentials_mixins.py @@ -1,12 +1,8 @@ -"""Tests translation of `dlt` credentials into `object_store` Rust crate credentials.""" - -from typing import Any, Dict +from typing import Any, Dict, Union, Type, get_args, cast import os import json # noqa: I251 import pytest -from deltalake import DeltaTable -from deltalake.exceptions import TableNotFoundError import dlt from dlt.common.configuration import resolve_configuration @@ -23,10 +19,15 @@ from dlt.common.utils import custom_environ from dlt.common.configuration.resolve import resolve_configuration from dlt.common.configuration.specs.gcp_credentials import GcpDefaultCredentials -from dlt.common.configuration.specs.exceptions import ObjectStoreRsCredentialsException +from dlt.common.configuration.specs.exceptions import ( + ObjectStoreRsCredentialsException, + UnsupportedAuthenticationMethodException, +) +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials, WithPyicebergConfig from tests.load.utils import ( AZ_BUCKET, + ABFS_BUCKET, AWS_BUCKET, GCS_BUCKET, R2_BUCKET_CONFIG, @@ -34,6 +35,9 @@ ) +TCredentialsMixin = Union[WithObjectStoreRsCredentials, WithPyicebergConfig] +ALL_CREDENTIALS_MIXINS = get_args(TCredentialsMixin) + pytestmark = pytest.mark.essential if all(driver not in ALL_FILESYSTEM_DRIVERS for driver in ("az", "s3", "gs", "r2")): @@ -53,11 +57,27 
@@ def fs_creds() -> Dict[str, Any]: return creds -def can_connect(bucket_url: str, object_store_rs_credentials: Dict[str, str]) -> bool: - """Returns True if client can connect to object store, False otherwise. +def can_connect(bucket_url: str, credentials: TCredentialsMixin, mixin: Type[TCredentialsMixin]) -> bool: # type: ignore[return] + """Returns True if client can connect to object store, False otherwise.""" + if mixin == WithObjectStoreRsCredentials: + credentials = cast(WithObjectStoreRsCredentials, credentials) + return can_connect_object_store_rs_credentials( + bucket_url, credentials.to_object_store_rs_credentials() + ) + elif mixin == WithPyicebergConfig: + credentials = cast(WithPyicebergConfig, credentials) + return can_connect_pyiceberg_fileio_config( + bucket_url, credentials.to_pyiceberg_fileio_config() + ) + + +def can_connect_object_store_rs_credentials( + bucket_url: str, object_store_rs_credentials: Dict[str, str] +) -> bool: + # uses `deltatable` library as Python interface to `object_store` Rust crate + from deltalake import DeltaTable + from deltalake.exceptions import TableNotFoundError - Uses `deltatable` library as Python interface to `object_store` Rust crate. - """ try: DeltaTable( bucket_url, @@ -70,16 +90,40 @@ def can_connect(bucket_url: str, object_store_rs_credentials: Dict[str, str]) -> return False +def can_connect_pyiceberg_fileio_config( + bucket_url: str, pyiceberg_fileio_config: Dict[str, str] +) -> bool: + from pyiceberg.table import StaticTable + + try: + StaticTable.from_metadata( + f"{bucket_url}/non_existing_metadata_file.json", + properties=pyiceberg_fileio_config, + ) + except FileNotFoundError: + # this error implies the connection was successful + # there is no Iceberg metadata file at the specified path + return True + return False + + @pytest.mark.parametrize( - "driver", [driver for driver in ALL_FILESYSTEM_DRIVERS if driver in ("az")] + "driver", [driver for driver in ALL_FILESYSTEM_DRIVERS if driver in ("az", "abfss")] ) -def test_azure_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) -> None: +@pytest.mark.parametrize("mixin", ALL_CREDENTIALS_MIXINS) +def test_azure_credentials_mixins( + driver: str, fs_creds: Dict[str, Any], mixin: Type[TCredentialsMixin] +) -> None: + if mixin == WithPyicebergConfig and driver == "az": + pytest.skip("`pyiceberg` does not support `az` scheme") + + buckets = {"az": AZ_BUCKET, "abfss": ABFS_BUCKET} creds: AnyAzureCredentials creds = AzureServicePrincipalCredentialsWithoutDefaults( **dlt.secrets.get("destination.fsazureprincipal.credentials") ) - assert can_connect(AZ_BUCKET, creds.to_object_store_rs_credentials()) + assert can_connect(buckets[driver], creds, mixin) # without SAS token creds = AzureCredentialsWithoutDefaults( @@ -87,18 +131,21 @@ def test_azure_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any] azure_storage_account_key=fs_creds["azure_storage_account_key"], ) assert creds.azure_storage_sas_token is None - assert can_connect(AZ_BUCKET, creds.to_object_store_rs_credentials()) + assert can_connect(buckets[driver], creds, mixin) # with SAS token creds = resolve_configuration(creds) assert creds.azure_storage_sas_token is not None - assert can_connect(AZ_BUCKET, creds.to_object_store_rs_credentials()) + assert can_connect(buckets[driver], creds, mixin) @pytest.mark.parametrize( "driver", [driver for driver in ALL_FILESYSTEM_DRIVERS if driver in ("s3", "r2")] ) -def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) -> None: 
+@pytest.mark.parametrize("mixin", ALL_CREDENTIALS_MIXINS) +def test_aws_credentials_mixins( + driver: str, fs_creds: Dict[str, Any], mixin: Type[TCredentialsMixin] +) -> None: creds: AwsCredentialsWithoutDefaults if driver == "r2": @@ -112,9 +159,11 @@ def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) endpoint_url=fs_creds.get("endpoint_url"), ) assert creds.aws_session_token is None - object_store_rs_creds = creds.to_object_store_rs_credentials() - assert "aws_session_token" not in object_store_rs_creds # no auto-generated token - assert can_connect(AWS_BUCKET, object_store_rs_creds) + if mixin == WithObjectStoreRsCredentials: + assert ( + "aws_session_token" not in creds.to_object_store_rs_credentials() + ) # no auto-generated token + assert can_connect(AWS_BUCKET, creds, mixin) # AwsCredentials: no user-provided session token creds = AwsCredentials( @@ -124,24 +173,27 @@ def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) endpoint_url=fs_creds.get("endpoint_url"), ) assert creds.aws_session_token is None - object_store_rs_creds = creds.to_object_store_rs_credentials() - assert "aws_session_token" not in object_store_rs_creds # no auto-generated token - assert can_connect(AWS_BUCKET, object_store_rs_creds) - - # exception should be raised if both `endpoint_url` and `region_name` are - # not provided - with pytest.raises(ObjectStoreRsCredentialsException): - AwsCredentials( - aws_access_key_id=fs_creds["aws_access_key_id"], - aws_secret_access_key=fs_creds["aws_secret_access_key"], - ).to_object_store_rs_credentials() - - if "endpoint_url" in object_store_rs_creds: - # TODO: make sure this case is tested on GitHub CI, e.g. by adding - # a local MinIO bucket to the set of tested buckets - if object_store_rs_creds["endpoint_url"].startswith("http://"): + assert can_connect(AWS_BUCKET, creds, mixin) + if mixin == WithObjectStoreRsCredentials: + object_store_rs_creds = creds.to_object_store_rs_credentials() + assert "aws_session_token" not in object_store_rs_creds # no auto-generated token + + # exception should be raised if both `endpoint_url` and `region_name` are + # not provided + with pytest.raises(ObjectStoreRsCredentialsException): + AwsCredentials( + aws_access_key_id=fs_creds["aws_access_key_id"], + aws_secret_access_key=fs_creds["aws_secret_access_key"], + ).to_object_store_rs_credentials() + + if "endpoint_url" in object_store_rs_creds and object_store_rs_creds[ + "endpoint_url" + ].startswith("http://"): + # TODO: make sure this case is tested on GitHub CI, e.g. 
by adding + # a local MinIO bucket to the set of tested buckets assert object_store_rs_creds["aws_allow_http"] == "true" + if creds.endpoint_url is not None: # remainder of tests use session tokens # we don't run them on S3 compatible storage because session tokens # may not be available @@ -158,9 +210,10 @@ def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) region_name=fs_creds["region_name"], ) assert creds.aws_session_token is not None - object_store_rs_creds = creds.to_object_store_rs_credentials() - assert object_store_rs_creds["aws_session_token"] is not None - assert can_connect(AWS_BUCKET, object_store_rs_creds) + assert can_connect(AWS_BUCKET, creds, mixin) + if mixin == WithObjectStoreRsCredentials: + object_store_rs_creds = creds.to_object_store_rs_credentials() + assert object_store_rs_creds["aws_session_token"] is not None # AwsCredentialsWithoutDefaults: user-provided session token creds = AwsCredentialsWithoutDefaults( @@ -170,15 +223,19 @@ def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) region_name=fs_creds["region_name"], ) assert creds.aws_session_token is not None - object_store_rs_creds = creds.to_object_store_rs_credentials() - assert object_store_rs_creds["aws_session_token"] is not None - assert can_connect(AWS_BUCKET, object_store_rs_creds) + assert can_connect(AWS_BUCKET, creds, mixin) + if mixin == WithObjectStoreRsCredentials: + object_store_rs_creds = creds.to_object_store_rs_credentials() + assert object_store_rs_creds["aws_session_token"] is not None @pytest.mark.parametrize( "driver", [driver for driver in ALL_FILESYSTEM_DRIVERS if driver in ("gs")] ) -def test_gcp_object_store_rs_credentials(driver, fs_creds: Dict[str, Any]) -> None: +@pytest.mark.parametrize("mixin", ALL_CREDENTIALS_MIXINS) +def test_gcp_credentials_mixins( + driver, fs_creds: Dict[str, Any], mixin: Type[TCredentialsMixin] +) -> None: creds: GcpCredentials # GcpServiceAccountCredentialsWithoutDefaults @@ -189,7 +246,11 @@ def test_gcp_object_store_rs_credentials(driver, fs_creds: Dict[str, Any]) -> No private_key_id=fs_creds["private_key_id"], client_email=fs_creds["client_email"], ) - assert can_connect(GCS_BUCKET, creds.to_object_store_rs_credentials()) + if mixin == WithPyicebergConfig: + with pytest.raises(UnsupportedAuthenticationMethodException): + assert can_connect(GCS_BUCKET, creds, mixin) + elif mixin == WithObjectStoreRsCredentials: + assert can_connect(GCS_BUCKET, creds, mixin) # GcpDefaultCredentials @@ -197,7 +258,7 @@ def test_gcp_object_store_rs_credentials(driver, fs_creds: Dict[str, Any]) -> No GcpDefaultCredentials._LAST_FAILED_DEFAULT = 0 # write service account key to JSON file - service_json = json.loads(creds.to_object_store_rs_credentials()["service_account_key"]) + service_json = json.loads(creds.to_native_representation()) path = "_secrets/service.json" os.makedirs(os.path.dirname(path), exist_ok=True) with open(path, "w", encoding="utf-8") as f: @@ -206,8 +267,18 @@ def test_gcp_object_store_rs_credentials(driver, fs_creds: Dict[str, Any]) -> No with custom_environ({"GOOGLE_APPLICATION_CREDENTIALS": path}): creds = GcpDefaultCredentials() resolve_configuration(creds) - can_connect(GCS_BUCKET, creds.to_object_store_rs_credentials()) - - # GcpOAuthCredentialsWithoutDefaults is currently not supported - with pytest.raises(NotImplementedError): - GcpOAuthCredentialsWithoutDefaults().to_object_store_rs_credentials() + if mixin == WithPyicebergConfig: + with 
pytest.raises(UnsupportedAuthenticationMethodException): + assert can_connect(GCS_BUCKET, creds, mixin) + elif mixin == WithObjectStoreRsCredentials: + assert can_connect(GCS_BUCKET, creds, mixin) + + # GcpOAuthCredentialsWithoutDefaults + creds = resolve_configuration( + GcpOAuthCredentialsWithoutDefaults(), sections=("destination", "fsgcpoauth") + ) + if mixin == WithPyicebergConfig: + assert can_connect(GCS_BUCKET, creds, mixin) + elif mixin == WithObjectStoreRsCredentials: + with pytest.raises(UnsupportedAuthenticationMethodException): + assert can_connect(GCS_BUCKET, creds, mixin) diff --git a/tests/load/filesystem/test_sql_client.py b/tests/load/filesystem/test_sql_client.py index ac2ada2551..a73b0f7e31 100644 --- a/tests/load/filesystem/test_sql_client.py +++ b/tests/load/filesystem/test_sql_client.py @@ -1,17 +1,17 @@ """Test the duckdb supported sql client for special internal features""" -from typing import Any +from typing import Optional import pytest import dlt import os import shutil -import logging from dlt import Pipeline from dlt.common.utils import uniq_id +from dlt.common.schema.typing import TTableFormat from tests.load.utils import ( destinations_configs, @@ -19,7 +19,6 @@ GCS_BUCKET, SFTP_BUCKET, MEMORY_BUCKET, - AWS_BUCKET, ) from dlt.destinations import filesystem from tests.utils import TEST_STORAGE_ROOT @@ -37,7 +36,7 @@ def _run_dataset_checks( pipeline: Pipeline, destination_config: DestinationTestConfiguration, secret_directory: str, - table_format: Any = None, + table_format: Optional[TTableFormat] = None, alternate_access_pipeline: Pipeline = None, ) -> None: total_records = 200 @@ -144,6 +143,8 @@ def _external_duckdb_connection() -> duckdb.DuckDBPyConnection: # the line below solves problems with certificate path lookup on linux, see duckdb docs external_db.sql("SET azure_transport_option_type = 'curl';") external_db.sql(f"SET secret_directory = '{secret_directory}';") + if table_format == "iceberg": + FilesystemSqlClient._setup_iceberg(external_db) return external_db def _fs_sql_client_for_external_db( @@ -283,13 +284,13 @@ def test_read_interfaces_filesystem( "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_exclude=[SFTP_BUCKET, MEMORY_BUCKET], # NOTE: delta does not work on memory buckets ), ids=lambda x: x.name, ) -def test_delta_tables( +def test_table_formats( destination_config: DestinationTestConfiguration, secret_directory: str ) -> None: os.environ["DATA_WRITER__FILE_MAX_ITEMS"] = "700" @@ -302,8 +303,9 @@ def test_delta_tables( # in case of gcs we use the s3 compat layer for reading # for writing we still need to use the gc authentication, as delta_rs seems to use # methods on the s3 interface that are not implemented by gcs + # s3 compat layer does not work with `iceberg` table format access_pipeline = pipeline - if destination_config.bucket_url == GCS_BUCKET: + if destination_config.bucket_url == GCS_BUCKET and destination_config.table_format != "iceberg": gcp_bucket = filesystem( GCS_BUCKET.replace("gs://", "s3://"), destination_name="filesystem_s3_gcs_comp" ) @@ -315,7 +317,7 @@ def test_delta_tables( pipeline, destination_config, secret_directory=secret_directory, - table_format="delta", + table_format=destination_config.table_format, alternate_access_pipeline=access_pipeline, ) diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 8d890642ee..c70fa5ab5d 100644 --- 
a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -2,7 +2,7 @@ import os import posixpath from pathlib import Path -from typing import Any, Callable, List, Dict, cast +from typing import Any, Callable, List, Dict, cast, Tuple from importlib.metadata import version as pkg_version from packaging.version import Version @@ -15,7 +15,7 @@ from dlt.common.storages.configuration import FilesystemConfiguration from dlt.common.storages.load_package import ParsedLoadJobFileName from dlt.common.utils import uniq_id -from dlt.common.schema.typing import TWriteDisposition +from dlt.common.schema.typing import TWriteDisposition, TTableFormat from dlt.common.configuration.exceptions import ConfigurationValueError from dlt.destinations import filesystem from dlt.destinations.impl.filesystem.filesystem import FilesystemClient @@ -223,6 +223,48 @@ def some_source(): assert table.column("value").to_pylist() == [1, 2, 3, 4, 5] +# here start the `table_format` tests + + +def get_expected_actual( + pipeline: dlt.Pipeline, + table_name: str, + table_format: TTableFormat, + arrow_table: "pyarrow.Table", # type: ignore[name-defined] # noqa: F821 +) -> Tuple["pyarrow.Table", "pyarrow.Table"]: # type: ignore[name-defined] # noqa: F821 + from dlt.common.libs.pyarrow import pyarrow, cast_arrow_schema_types + + if table_format == "delta": + from dlt.common.libs.deltalake import ( + get_delta_tables, + ensure_delta_compatible_arrow_data, + ) + + dt = get_delta_tables(pipeline, table_name)[table_name] + expected = ensure_delta_compatible_arrow_data(arrow_table) + actual = dt.to_pyarrow_table() + elif table_format == "iceberg": + from dlt.common.libs.pyiceberg import ( + get_iceberg_tables, + ensure_iceberg_compatible_arrow_data, + ) + + it = get_iceberg_tables(pipeline, table_name)[table_name] + expected = ensure_iceberg_compatible_arrow_data(arrow_table) + actual = it.scan().to_arrow() + + # work around pyiceberg bug https://github.com/apache/iceberg-python/issues/1128 + schema = cast_arrow_schema_types( + actual.schema, + { + pyarrow.types.is_large_string: pyarrow.string(), + pyarrow.types.is_large_binary: pyarrow.binary(), + }, + ) + actual = actual.cast(schema) + return (expected, actual) + + @pytest.mark.skip( reason="pyarrow version check not needed anymore, since we have 17 as a dependency" ) @@ -258,44 +300,44 @@ def foo(): "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_exclude=(MEMORY_BUCKET, SFTP_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_core( +def test_table_format_core( destination_config: DestinationTestConfiguration, ) -> None: - """Tests core functionality for `delta` table format. + """Tests core functionality for `delta` and `iceberg` table formats. Tests all data types, all filesystems. Tests `append` and `replace` write dispositions (`merge` is tested elsewhere). 
""" - - from dlt.common.libs.deltalake import get_delta_tables + if destination_config.table_format == "delta": + from dlt.common.libs.deltalake import get_delta_tables # create resource that yields rows with all data types column_schemas, row = table_update_and_row() - @dlt.resource(columns=column_schemas, table_format="delta") + @dlt.resource(columns=column_schemas, table_format=destination_config.table_format) def data_types(): nonlocal row yield [row] * 10 pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) - # run pipeline, this should create Delta table + # run pipeline, this should create table info = pipeline.run(data_types()) assert_load_info(info) - # `delta` table format should use `parquet` file format + # table formats should use `parquet` file format completed_jobs = info.load_packages[0].jobs["completed_jobs"] data_types_jobs = [ job for job in completed_jobs if job.job_file_info.table_name == "data_types" ] assert all([job.file_path.endswith((".parquet", ".reference")) for job in data_types_jobs]) - # 10 rows should be loaded to the Delta table and the content of the first + # 10 rows should be loaded to the table and the content of the first # row should match expected values rows = load_tables_to_dicts(pipeline, "data_types", exclude_system_cols=True)["data_types"] assert len(rows) == 10 @@ -322,7 +364,8 @@ def data_types(): # should do logical replace, increasing the table version info = pipeline.run(data_types(), write_disposition="replace") assert_load_info(info) - assert get_delta_tables(pipeline, "data_types")["data_types"].version() == 2 + if destination_config.table_format == "delta": + assert get_delta_tables(pipeline, "data_types")["data_types"].version() == 2 rows = load_tables_to_dicts(pipeline, "data_types", exclude_system_cols=True)["data_types"] assert len(rows) == 10 @@ -331,15 +374,16 @@ def data_types(): "destination_config", destinations_configs( table_format_filesystem_configs=True, + # job orchestration is same across table formats—no need to test all formats with_table_format="delta", bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_does_not_contain_job_files( +def test_table_format_does_not_contain_job_files( destination_config: DestinationTestConfiguration, ) -> None: - """Asserts Parquet job files do not end up in Delta table.""" + """Asserts Parquet job files do not end up in table.""" pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) @@ -376,17 +420,18 @@ def delta_table(): "destination_config", destinations_configs( table_format_filesystem_configs=True, + # job orchestration is same across table formats—no need to test all formats with_table_format="delta", bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_multiple_files( +def test_table_format_multiple_files( destination_config: DestinationTestConfiguration, ) -> None: - """Tests loading multiple files into a Delta table. + """Tests loading multiple files into a table. - Files should be loaded into the Delta table in a single commit. + Files should be loaded into the table in a single commit. 
""" from dlt.common.libs.deltalake import get_delta_tables @@ -422,17 +467,17 @@ def delta_table(): "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_child_tables( +def test_table_format_child_tables( destination_config: DestinationTestConfiguration, ) -> None: - """Tests child table handling for `delta` table format.""" + """Tests child table handling for `delta` and `iceberg` table formats.""" - @dlt.resource(table_format="delta") + @dlt.resource(table_format=destination_config.table_format) def nested_table(): yield [ { @@ -494,49 +539,63 @@ def nested_table(): assert len(rows_dict["nested_table__child"]) == 3 assert len(rows_dict["nested_table__child__grandchild"]) == 5 - # now drop children and grandchildren, use merge write disposition to create and pass full table chain - # also for tables that do not have jobs - info = pipeline.run( - [{"foo": 3}] * 10000, - table_name="nested_table", - primary_key="foo", - write_disposition="merge", - ) - assert_load_info(info) + if destination_config.supports_merge: + # now drop children and grandchildren, use merge write disposition to create and pass full table chain + # also for tables that do not have jobs + info = pipeline.run( + [{"foo": 3}] * 10000, + table_name="nested_table", + primary_key="foo", + write_disposition="merge", + ) + assert_load_info(info) @pytest.mark.parametrize( "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_partitioning( +def test_table_format_partitioning( destination_config: DestinationTestConfiguration, ) -> None: - """Tests partitioning for `delta` table format.""" + """Tests partitioning for `delta` and `iceberg` table formats.""" - from dlt.common.libs.deltalake import get_delta_tables from tests.pipeline.utils import users_materialize_table_schema + def assert_partition_columns( + table_name: str, table_format: TTableFormat, expected_partition_columns: List[str] + ) -> None: + if table_format == "delta": + from dlt.common.libs.deltalake import get_delta_tables + + dt = get_delta_tables(pipeline, table_name)[table_name] + actual_partition_columns = dt.metadata().partition_columns + elif table_format == "iceberg": + from dlt.common.libs.pyiceberg import get_iceberg_tables + + it = get_iceberg_tables(pipeline, table_name)[table_name] + actual_partition_columns = [f.name for f in it.metadata.specs_struct().fields] + assert actual_partition_columns == expected_partition_columns + pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) # zero partition columns - @dlt.resource(table_format="delta") + @dlt.resource(table_format=destination_config.table_format) def zero_part(): yield {"foo": 1, "bar": 1} info = pipeline.run(zero_part()) assert_load_info(info) - dt = get_delta_tables(pipeline, "zero_part")["zero_part"] - assert dt.metadata().partition_columns == [] + assert_partition_columns("zero_part", destination_config.table_format, []) assert load_table_counts(pipeline, "zero_part")["zero_part"] == 1 # one partition column - @dlt.resource(table_format="delta", columns={"c1": {"partition": True}}) + @dlt.resource(table_format=destination_config.table_format, columns={"c1": {"partition": True}}) def one_part(): yield [ {"c1": "foo", "c2": 1}, @@ 
-547,13 +606,13 @@ def one_part(): info = pipeline.run(one_part()) assert_load_info(info) - dt = get_delta_tables(pipeline, "one_part")["one_part"] - assert dt.metadata().partition_columns == ["c1"] + assert_partition_columns("one_part", destination_config.table_format, ["c1"]) assert load_table_counts(pipeline, "one_part")["one_part"] == 4 # two partition columns @dlt.resource( - table_format="delta", columns={"c1": {"partition": True}, "c2": {"partition": True}} + table_format=destination_config.table_format, + columns={"c1": {"partition": True}, "c2": {"partition": True}}, ) def two_part(): yield [ @@ -565,29 +624,31 @@ def two_part(): info = pipeline.run(two_part()) assert_load_info(info) - dt = get_delta_tables(pipeline, "two_part")["two_part"] - assert dt.metadata().partition_columns == ["c1", "c2"] + assert_partition_columns("two_part", destination_config.table_format, ["c1", "c2"]) assert load_table_counts(pipeline, "two_part")["two_part"] == 4 # test partitioning with empty source users_materialize_table_schema.apply_hints( - table_format="delta", + table_format=destination_config.table_format, columns={"id": {"partition": True}}, ) info = pipeline.run(users_materialize_table_schema()) assert_load_info(info) - dt = get_delta_tables(pipeline, "users")["users"] - assert dt.metadata().partition_columns == ["id"] + assert_partition_columns("users", destination_config.table_format, ["id"]) assert load_table_counts(pipeline, "users")["users"] == 0 # changing partitioning after initial table creation is not supported zero_part.apply_hints(columns={"foo": {"partition": True}}) - with pytest.raises(PipelineStepFailed) as pip_ex: + if destination_config.table_format == "delta": + # Delta raises error when trying to change partitioning + with pytest.raises(PipelineStepFailed) as pip_ex: + pipeline.run(zero_part()) + assert isinstance(pip_ex.value.__context__, LoadClientJobRetry) + assert "partitioning" in pip_ex.value.__context__.retry_message + elif destination_config.table_format == "iceberg": + # while Iceberg supports partition evolution, we don't apply it pipeline.run(zero_part()) - assert isinstance(pip_ex.value.__context__, LoadClientJobRetry) - assert "partitioning" in pip_ex.value.__context__.retry_message - dt = get_delta_tables(pipeline, "zero_part")["zero_part"] - assert dt.metadata().partition_columns == [] + assert_partition_columns("zero_part", destination_config.table_format, []) @pytest.mark.parametrize( @@ -646,7 +707,7 @@ def test_delta_table_partitioning_arrow_load_id( "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, @@ -659,20 +720,25 @@ def test_delta_table_partitioning_arrow_load_id( pytest.param({"disposition": "merge", "strategy": "upsert"}, id="upsert"), ), ) -def test_delta_table_schema_evolution( +def test_table_format_schema_evolution( destination_config: DestinationTestConfiguration, write_disposition: TWriteDisposition, ) -> None: - """Tests schema evolution (adding new columns) for `delta` table format.""" - from dlt.common.libs.deltalake import get_delta_tables, ensure_delta_compatible_arrow_data + """Tests schema evolution (adding new columns) for `delta` and `iceberg` table formats.""" + if destination_config.table_format == "iceberg" and write_disposition == { + "disposition": "merge", + "strategy": "upsert", + }: + pytest.skip("`upsert` currently not implemented for `iceberg`") + from 
dlt.common.libs.pyarrow import pyarrow @dlt.resource( write_disposition=write_disposition, primary_key="pk", - table_format="delta", + table_format=destination_config.table_format, ) - def delta_table(data): + def evolving_table(data): yield data pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) @@ -684,11 +750,11 @@ def delta_table(data): assert arrow_table.shape == (1, 1) # initial load - info = pipeline.run(delta_table(arrow_table)) + info = pipeline.run(evolving_table(arrow_table)) assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - expected = ensure_delta_compatible_arrow_data(arrow_table) - actual = dt.to_pyarrow_table() + expected, actual = get_expected_actual( + pipeline, "evolving_table", destination_config.table_format, arrow_table + ) assert actual.equals(expected) # create Arrow table with many columns, two rows @@ -703,11 +769,11 @@ def delta_table(data): arrow_table = arrow_table.add_column(0, pk_field, [[1, 2]]) # second load — this should evolve the schema (i.e. add the new columns) - info = pipeline.run(delta_table(arrow_table)) + info = pipeline.run(evolving_table(arrow_table)) assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - actual = dt.to_pyarrow_table() - expected = ensure_delta_compatible_arrow_data(arrow_table) + expected, actual = get_expected_actual( + pipeline, "evolving_table", destination_config.table_format, arrow_table + ) if write_disposition == "append": # just check shape and schema for `append`, because table comparison is # more involved than with the other dispositions @@ -724,13 +790,21 @@ def delta_table(data): empty_arrow_table = arrow_table.schema.empty_table() # load 3 — this should evolve the schema without changing data - info = pipeline.run(delta_table(empty_arrow_table)) + info = pipeline.run(evolving_table(empty_arrow_table)) assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - actual = dt.to_pyarrow_table() - expected_schema = ensure_delta_compatible_arrow_data(arrow_table).schema - assert actual.schema.equals(expected_schema) - expected_num_rows = 3 if write_disposition == "append" else 2 + expected, actual = get_expected_actual( + pipeline, "evolving_table", destination_config.table_format, arrow_table + ) + assert actual.schema.equals(expected.schema) + if write_disposition == "append": + expected_num_rows = 3 + elif write_disposition == "replace": + expected_num_rows = 0 + if destination_config.table_format == "delta": + # TODO: fix https://github.com/dlt-hub/dlt/issues/2092 and remove this if-clause + expected_num_rows = 2 + elif write_disposition == {"disposition": "merge", "strategy": "upsert"}: + expected_num_rows = 2 assert actual.num_rows == expected_num_rows # new column should have NULLs only assert ( @@ -743,23 +817,38 @@ def delta_table(data): "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_subset=(FILE_BUCKET, AZ_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_empty_source( +def test_table_format_empty_source( destination_config: DestinationTestConfiguration, ) -> None: - """Tests empty source handling for `delta` table format. + """Tests empty source handling for `delta` and `iceberg` table formats. Tests both empty Arrow table and `dlt.mark.materialize_table_schema()`. 
""" - from dlt.common.libs.deltalake import ensure_delta_compatible_arrow_data, get_delta_tables from tests.pipeline.utils import users_materialize_table_schema - @dlt.resource(table_format="delta") - def delta_table(data): + def get_table_version( # type: ignore[return] + pipeline: dlt.Pipeline, + table_name: str, + table_format: TTableFormat, + ) -> int: + if table_format == "delta": + from dlt.common.libs.deltalake import get_delta_tables + + dt = get_delta_tables(pipeline, table_name)[table_name] + return dt.version() + elif table_format == "iceberg": + from dlt.common.libs.pyiceberg import get_iceberg_tables + + it = get_iceberg_tables(pipeline, table_name)[table_name] + return it.last_sequence_number - 1 # subtract 1 to match `delta` + + @dlt.resource(table_format=destination_config.table_format) + def a_table(data): yield data # create empty Arrow table with schema @@ -779,61 +868,62 @@ def delta_table(data): # run 1: empty Arrow table with schema # this should create empty Delta table with same schema as Arrow table - info = pipeline.run(delta_table(empty_arrow_table)) + info = pipeline.run(a_table(empty_arrow_table)) assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - assert dt.version() == 0 - dt_arrow_table = dt.to_pyarrow_table() - assert dt_arrow_table.shape == (0, empty_arrow_table.num_columns) - assert dt_arrow_table.schema.equals( - ensure_delta_compatible_arrow_data(empty_arrow_table).schema + assert get_table_version(pipeline, "a_table", destination_config.table_format) == 0 + expected, actual = get_expected_actual( + pipeline, "a_table", destination_config.table_format, empty_arrow_table ) + assert actual.shape == (0, expected.num_columns) + assert actual.schema.equals(expected.schema) # run 2: non-empty Arrow table with same schema as run 1 # this should load records into Delta table - info = pipeline.run(delta_table(arrow_table)) + info = pipeline.run(a_table(arrow_table)) assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - assert dt.version() == 1 - dt_arrow_table = dt.to_pyarrow_table() - assert dt_arrow_table.shape == (2, empty_arrow_table.num_columns) - assert dt_arrow_table.schema.equals( - ensure_delta_compatible_arrow_data(empty_arrow_table).schema + assert get_table_version(pipeline, "a_table", destination_config.table_format) == 1 + expected, actual = get_expected_actual( + pipeline, "a_table", destination_config.table_format, empty_arrow_table ) + assert actual.shape == (2, expected.num_columns) + assert actual.schema.equals(expected.schema) # now run the empty frame again - info = pipeline.run(delta_table(empty_arrow_table)) + info = pipeline.run(a_table(empty_arrow_table)) assert_load_info(info) - # use materialized list - # NOTE: this will create an empty parquet file with a schema takes from dlt schema. - # the original parquet file had a nested (struct) type in `json` field that is now - # in the delta table schema. the empty parquet file lost this information and had - # string type (converted from dlt `json`) - info = pipeline.run([dlt.mark.materialize_table_schema()], table_name="delta_table") - assert_load_info(info) + if destination_config.table_format == "delta": + # use materialized list + # NOTE: this will create an empty parquet file with a schema takes from dlt schema. + # the original parquet file had a nested (struct) type in `json` field that is now + # in the delta table schema. 
the empty parquet file lost this information and had + # string type (converted from dlt `json`) + info = pipeline.run([dlt.mark.materialize_table_schema()], table_name="a_table") + assert_load_info(info) # test `dlt.mark.materialize_table_schema()` - users_materialize_table_schema.apply_hints(table_format="delta") + users_materialize_table_schema.apply_hints(table_format=destination_config.table_format) info = pipeline.run(users_materialize_table_schema(), loader_file_format="parquet") assert_load_info(info) - dt = get_delta_tables(pipeline, "users")["users"] - assert dt.version() == 0 - dt_arrow_table = dt.to_pyarrow_table() - assert dt_arrow_table.num_rows == 0 - assert "id", "name" == dt_arrow_table.schema.names[:2] + assert get_table_version(pipeline, "users", destination_config.table_format) == 0 + _, actual = get_expected_actual( + pipeline, "users", destination_config.table_format, empty_arrow_table + ) + assert actual.num_rows == 0 + assert "id", "name" == actual.schema.names[:2] @pytest.mark.parametrize( "destination_config", destinations_configs( table_format_filesystem_configs=True, + # job orchestration is same across table formats—no need to test all formats with_table_format="delta", bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_mixed_source( +def test_table_format_mixed_source( destination_config: DestinationTestConfiguration, ) -> None: """Tests file format handling in mixed source. @@ -877,12 +967,13 @@ def s(): "destination_config", destinations_configs( table_format_filesystem_configs=True, + # job orchestration is same across table formats—no need to test all formats with_table_format="delta", bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_dynamic_dispatch( +def test_table_format_dynamic_dispatch( destination_config: DestinationTestConfiguration, ) -> None: @dlt.resource(primary_key="id", table_name=lambda i: i["type"], table_format="delta") @@ -905,80 +996,96 @@ def github_events(): "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_subset=(FILE_BUCKET, AZ_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_get_delta_tables_helper( +def test_table_format_get_tables_helper( destination_config: DestinationTestConfiguration, ) -> None: - """Tests `get_delta_tables` helper function.""" - from dlt.common.libs.deltalake import DeltaTable, get_delta_tables + """Tests `get_delta_tables` / `get_iceberg_tables` helper functions.""" + get_tables: Any + if destination_config.table_format == "delta": + from dlt.common.libs.deltalake import DeltaTable, get_delta_tables - @dlt.resource(table_format="delta") - def foo_delta(): + get_tables = get_delta_tables + get_num_rows = lambda table: table.to_pyarrow_table().num_rows + elif destination_config.table_format == "iceberg": + from dlt.common.libs.pyiceberg import IcebergTable, get_iceberg_tables + + get_tables = get_iceberg_tables + get_num_rows = lambda table: table.scan().to_arrow().num_rows + + @dlt.resource(table_format=destination_config.table_format) + def foo_table_format(): yield [{"foo": 1}, {"foo": 2}] - @dlt.resource(table_format="delta") - def bar_delta(): + @dlt.resource(table_format=destination_config.table_format) + def bar_table_format(): yield [{"bar": 1}] @dlt.resource - def baz_not_delta(): + def baz_not_table_format(): yield [{"baz": 1}] pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) - info = 
pipeline.run(foo_delta()) + info = pipeline.run(foo_table_format()) assert_load_info(info) - delta_tables = get_delta_tables(pipeline) - assert delta_tables.keys() == {"foo_delta"} - assert isinstance(delta_tables["foo_delta"], DeltaTable) - assert delta_tables["foo_delta"].to_pyarrow_table().num_rows == 2 - - info = pipeline.run([foo_delta(), bar_delta(), baz_not_delta()]) + tables = get_tables(pipeline) + assert tables.keys() == {"foo_table_format"} + if destination_config.table_format == "delta": + assert isinstance(tables["foo_table_format"], DeltaTable) + elif destination_config.table_format == "iceberg": + assert isinstance(tables["foo_table_format"], IcebergTable) + assert get_num_rows(tables["foo_table_format"]) == 2 + + info = pipeline.run([foo_table_format(), bar_table_format(), baz_not_table_format()]) assert_load_info(info) - delta_tables = get_delta_tables(pipeline) - assert delta_tables.keys() == {"foo_delta", "bar_delta"} - assert delta_tables["bar_delta"].to_pyarrow_table().num_rows == 1 - assert get_delta_tables(pipeline, "foo_delta").keys() == {"foo_delta"} - assert get_delta_tables(pipeline, "bar_delta").keys() == {"bar_delta"} - assert get_delta_tables(pipeline, "foo_delta", "bar_delta").keys() == {"foo_delta", "bar_delta"} + tables = get_tables(pipeline) + assert tables.keys() == {"foo_table_format", "bar_table_format"} + assert get_num_rows(tables["bar_table_format"]) == 1 + assert get_tables(pipeline, "foo_table_format").keys() == {"foo_table_format"} + assert get_tables(pipeline, "bar_table_format").keys() == {"bar_table_format"} + assert get_tables(pipeline, "foo_table_format", "bar_table_format").keys() == { + "foo_table_format", + "bar_table_format", + } # test with child table - @dlt.resource(table_format="delta") - def parent_delta(): + @dlt.resource(table_format=destination_config.table_format) + def parent_table_format(): yield [{"foo": 1, "child": [1, 2, 3]}] - info = pipeline.run(parent_delta()) + info = pipeline.run(parent_table_format()) assert_load_info(info) - delta_tables = get_delta_tables(pipeline) - assert "parent_delta__child" in delta_tables.keys() - assert delta_tables["parent_delta__child"].to_pyarrow_table().num_rows == 3 + tables = get_tables(pipeline) + assert "parent_table_format__child" in tables.keys() + assert get_num_rows(tables["parent_table_format__child"]) == 3 # test invalid input with pytest.raises(ValueError): - get_delta_tables(pipeline, "baz_not_delta") + get_tables(pipeline, "baz_not_table_format") with pytest.raises(ValueError): - get_delta_tables(pipeline, "non_existing_table") + get_tables(pipeline, "non_existing_table") # test unknown schema with pytest.raises(FileNotFoundError): - get_delta_tables(pipeline, "non_existing_table", schema_name="aux_2") + get_tables(pipeline, "non_existing_table", schema_name="aux_2") # load to a new schema and under new name aux_schema = dlt.Schema("aux_2") # NOTE: you cannot have a file with name - info = pipeline.run(parent_delta().with_name("aux_delta"), schema=aux_schema) + info = pipeline.run(parent_table_format().with_name("aux_table"), schema=aux_schema) # also state in seprate package assert_load_info(info, expected_load_packages=2) - delta_tables = get_delta_tables(pipeline, schema_name="aux_2") - assert "aux_delta__child" in delta_tables.keys() - get_delta_tables(pipeline, "aux_delta", schema_name="aux_2") + tables = get_tables(pipeline, schema_name="aux_2") + assert "aux_table__child" in tables.keys() + get_tables(pipeline, "aux_table", schema_name="aux_2") with 
pytest.raises(ValueError): - get_delta_tables(pipeline, "aux_delta") + get_tables(pipeline, "aux_table") @pytest.mark.parametrize( diff --git a/tests/load/sources/sql_database/test_sql_database_source.py b/tests/load/sources/sql_database/test_sql_database_source.py index 00257471e0..2de923fe38 100644 --- a/tests/load/sources/sql_database/test_sql_database_source.py +++ b/tests/load/sources/sql_database/test_sql_database_source.py @@ -1286,10 +1286,7 @@ def assert_no_precision_columns( ) -> None: actual = list(columns.values()) # we always infer and emit nullability - expected = cast( - List[TColumnSchema], - deepcopy(NULL_NO_PRECISION_COLUMNS if nullable else NOT_NULL_NO_PRECISION_COLUMNS), - ) + expected = deepcopy(NULL_NO_PRECISION_COLUMNS if nullable else NOT_NULL_NO_PRECISION_COLUMNS) if backend == "pyarrow": expected = cast( List[TColumnSchema], diff --git a/tests/load/utils.py b/tests/load/utils.py index 5c24b2d1dc..5660202ec3 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -26,7 +26,10 @@ from dlt.common.configuration import resolve_configuration from dlt.common.configuration.container import Container from dlt.common.configuration.specs.config_section_context import ConfigSectionContext -from dlt.common.configuration.specs import CredentialsConfiguration +from dlt.common.configuration.specs import ( + CredentialsConfiguration, + GcpOAuthCredentialsWithoutDefaults, +) from dlt.common.destination.reference import ( DestinationClientDwhConfiguration, JobClientBase, @@ -57,6 +60,7 @@ from dlt.pipeline.exceptions import SqlClientNotAvailable from tests.utils import ( ACTIVE_DESTINATIONS, + ACTIVE_TABLE_FORMATS, IMPLEMENTED_DESTINATIONS, SQL_DESTINATIONS, EXCLUDED_DESTINATION_CONFIGURATIONS, @@ -171,7 +175,9 @@ def destination_factory(self, **kwargs) -> Destination[Any, Any]: dest_type = kwargs.pop("destination", self.destination_type) dest_name = kwargs.pop("destination_name", self.destination_name) self.setup() - return Destination.from_reference(dest_type, destination_name=dest_name, **kwargs) + return Destination.from_reference( + dest_type, self.credentials, destination_name=dest_name, **kwargs + ) def raw_capabilities(self) -> DestinationCapabilitiesContext: dest = Destination.from_reference(self.destination_type) @@ -604,7 +610,7 @@ def destinations_configs( DestinationTestConfiguration( destination_type="filesystem", bucket_url=bucket, - extra_info=bucket + "-delta", + extra_info=bucket, table_format="delta", supports_merge=True, file_format="parquet", @@ -619,12 +625,33 @@ def destinations_configs( ), ) ] + if bucket == AZ_BUCKET: + # `pyiceberg` does not support `az` scheme + continue + destination_configs += [ + DestinationTestConfiguration( + destination_type="filesystem", + bucket_url=bucket, + extra_info=bucket, + table_format="iceberg", + supports_merge=False, + file_format="parquet", + destination_name="fsgcpoauth" if bucket == GCS_BUCKET else None, + ) + ] # filter out non active destinations destination_configs = [ conf for conf in destination_configs if conf.destination_type in ACTIVE_DESTINATIONS ] + # filter out non active table formats + destination_configs = [ + conf + for conf in destination_configs + if conf.table_format is None or conf.table_format in ACTIVE_TABLE_FORMATS + ] + # filter out destinations not in subset if subset: destination_configs = [ diff --git a/tests/pipeline/utils.py b/tests/pipeline/utils.py index 0ae734f72e..e72a27c827 100644 --- a/tests/pipeline/utils.py +++ b/tests/pipeline/utils.py @@ -197,10 +197,23 @@ def 
_load_tables_to_dicts_fs( delta_tables = get_delta_tables(p, *table_names, schema_name=schema_name) + iceberg_table_names = [ + table_name + for table_name in table_names + if get_table_format(client.schema.tables, table_name) == "iceberg" + ] + if len(iceberg_table_names) > 0: + from dlt.common.libs.pyiceberg import get_iceberg_tables + + iceberg_tables = get_iceberg_tables(p, *table_names, schema_name=schema_name) + for table_name in table_names: if table_name in client.schema.data_table_names() and table_name in delta_table_names: dt = delta_tables[table_name] result[table_name] = dt.to_pyarrow_table().to_pylist() + elif table_name in client.schema.data_table_names() and table_name in iceberg_table_names: + it = iceberg_tables[table_name] + result[table_name] = it.scan().to_arrow().to_pylist() else: table_files = client.list_table_files(table_name) for file in table_files: diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py index 36fe009b93..e67ff9c70a 100644 --- a/tests/sources/helpers/rest_client/test_client.py +++ b/tests/sources/helpers/rest_client/test_client.py @@ -401,7 +401,7 @@ def test_paginate_json_body_without_params(self, rest_client) -> None: posts_skip = (DEFAULT_TOTAL_PAGES - 3) * DEFAULT_PAGE_SIZE class JSONBodyPageCursorPaginator(BaseReferencePaginator): - def update_state(self, response, data): + def update_state(self, response, data): # type: ignore[override] self._next_reference = response.json().get("next_page") def update_request(self, request): diff --git a/tests/utils.py b/tests/utils.py index 1aafa4bfe4..82d742ac65 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -32,6 +32,7 @@ from dlt.common.runtime.run_context import DOT_DLT, RunContext from dlt.common.runtime.telemetry import start_telemetry, stop_telemetry from dlt.common.schema import Schema +from dlt.common.schema.typing import TTableFormat from dlt.common.storages import FileStorage from dlt.common.storages.versioned_storage import VersionedStorage from dlt.common.typing import DictStrAny, StrAny, TDataItem @@ -88,6 +89,12 @@ ACTIVE_SQL_DESTINATIONS = SQL_DESTINATIONS.intersection(ACTIVE_DESTINATIONS) ACTIVE_NON_SQL_DESTINATIONS = NON_SQL_DESTINATIONS.intersection(ACTIVE_DESTINATIONS) +# filter out active table formats for current tests +IMPLEMENTED_TABLE_FORMATS = set(get_args(TTableFormat)) +ACTIVE_TABLE_FORMATS = set( + dlt.config.get("ACTIVE_TABLE_FORMATS", list) or IMPLEMENTED_TABLE_FORMATS +) + # sanity checks assert len(ACTIVE_DESTINATIONS) >= 0, "No active destinations selected" From 27fe53de33f732795a95fb41a87f23fc9da98740 Mon Sep 17 00:00:00 2001 From: Julian Alves <28436330+donotpush@users.noreply.github.com> Date: Wed, 11 Dec 2024 13:17:18 +0100 Subject: [PATCH 27/32] poetry lock update --- poetry.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index 25dbf5a3b2..c151dec6db 100644 --- a/poetry.lock +++ b/poetry.lock @@ -10727,4 +10727,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "84e8b8eccd9b8ee104a2dc08f5b83987aeb06540d61330390ce849cc1ad6acb4" +content-hash = "8328e4870badd073330487acdbe1f3f2f1a02fb7c5c84c3dcaf80826a322adfb" From a59ba3eb94888dce6bbb48c1edf77badd0f358d5 Mon Sep 17 00:00:00 2001 From: Julian Alves <28436330+donotpush@users.noreply.github.com> Date: Wed, 11 Dec 2024 13:22:03 +0100 Subject: [PATCH 28/32] refactor databricks auth test --- .../test_databricks_configuration.py | 10 ++++++ 
.../load/pipeline/test_databricks_pipeline.py | 32 ++++++++++++++++--- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/tests/load/databricks/test_databricks_configuration.py b/tests/load/databricks/test_databricks_configuration.py index e27da4db2a..8b3beed2b3 100644 --- a/tests/load/databricks/test_databricks_configuration.py +++ b/tests/load/databricks/test_databricks_configuration.py @@ -4,6 +4,7 @@ pytest.importorskip("databricks") from dlt.common.exceptions import TerminalValueError +from dlt.common.configuration.exceptions import ConfigurationValueError from dlt.destinations.impl.databricks.databricks import DatabricksLoadJob from dlt.common.configuration import resolve_configuration @@ -86,3 +87,12 @@ def test_databricks_abfss_converter() -> None: abfss_url == "abfss://dlt-ci-test-bucket@my_account.dfs.core.windows.net/path/to/file.parquet" ) + + +def test_databricks_auth_invalid() -> None: + with pytest.raises(ConfigurationValueError, match="No valid authentication method detected.*"): + os.environ["DESTINATION__DATABRICKS__CREDENTIALS__CLIENT_ID"] = "" + os.environ["DESTINATION__DATABRICKS__CREDENTIALS__CLIENT_SECRET"] = "" + os.environ["DESTINATION__DATABRICKS__CREDENTIALS__ACCESS_TOKEN"] = "" + bricks = databricks() + bricks.configuration(None, accept_partial=True) diff --git a/tests/load/pipeline/test_databricks_pipeline.py b/tests/load/pipeline/test_databricks_pipeline.py index 8a227fd547..86cf91caa1 100644 --- a/tests/load/pipeline/test_databricks_pipeline.py +++ b/tests/load/pipeline/test_databricks_pipeline.py @@ -2,6 +2,7 @@ import os from dlt.common.utils import uniq_id +from dlt.destinations import databricks from tests.load.utils import ( GCS_BUCKET, DestinationTestConfiguration, @@ -151,12 +152,10 @@ def test_databricks_gcs_external_location(destination_config: DestinationTestCon destinations_configs(default_sql_configs=True, subset=("databricks",)), ids=lambda x: x.name, ) -def test_databricks_oauth(destination_config: DestinationTestConfiguration) -> None: - from dlt.destinations import databricks - +def test_databricks_auth_oauth(destination_config: DestinationTestConfiguration) -> None: + os.environ["DESTINATION__DATABRICKS__CREDENTIALS__ACCESS_TOKEN"] = "" bricks = databricks() config = bricks.configuration(None, accept_partial=True) - assert config.credentials.client_id and config.credentials.client_secret dataset_name = "test_databricks_oauth" + uniq_id() @@ -170,3 +169,28 @@ def test_databricks_oauth(destination_config: DestinationTestConfiguration) -> N with pipeline.sql_client() as client: rows = client.execute_sql(f"select * from {dataset_name}.digits") assert len(rows) == 3 + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=("databricks",)), + ids=lambda x: x.name, +) +def test_databricks_auth_token(destination_config: DestinationTestConfiguration) -> None: + os.environ["DESTINATION__DATABRICKS__CREDENTIALS__CLIENT_ID"] = "" + os.environ["DESTINATION__DATABRICKS__CREDENTIALS__CLIENT_SECRET"] = "" + bricks = databricks() + config = bricks.configuration(None, accept_partial=True) + assert config.credentials.access_token + + dataset_name = "test_databricks_token" + uniq_id() + pipeline = destination_config.setup_pipeline( + "test_databricks_token", dataset_name=dataset_name, destination=bricks + ) + + info = pipeline.run([1, 2, 3], table_name="digits", **destination_config.run_kwargs) + assert info.has_failed_jobs is False + + with pipeline.sql_client() as client: + rows = 
client.execute_sql(f"select * from {dataset_name}.digits") + assert len(rows) == 3 From 7d69d3e69c18c123a84944712a3a0aaa1c3ccf65 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Wed, 4 Dec 2024 17:37:15 +0100 Subject: [PATCH 29/32] Fix formatting in dlt.common.libs.pyarrow (#2102) From b14d26c864233ac3e321f1b62426c17ce8157361 Mon Sep 17 00:00:00 2001 From: David Scharf Date: Tue, 10 Dec 2024 23:44:01 +0100 Subject: [PATCH 30/32] bump semver to minimum version 3.0.0 (#2132) --- poetry.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/poetry.lock b/poetry.lock index c151dec6db..650ad153ee 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "about-time" @@ -10727,4 +10727,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "8328e4870badd073330487acdbe1f3f2f1a02fb7c5c84c3dcaf80826a322adfb" +content-hash = "1bf3deccd929c083b880c1a82be0983430ab49f7ade247b1c5573bb8c70d9ff5" From c14e744fcd30007f0b712716288de113921e2b1e Mon Sep 17 00:00:00 2001 From: David Scharf Date: Wed, 11 Dec 2024 00:43:32 +0100 Subject: [PATCH 31/32] leverage ibis expression for getting readablerelations (#2046) * add ibis dataset in own class for now * make error clearer * fix some linting and fix broken test * make most destinations work with selecting the right db and catalog, transpiling sql via postgres in some cases and selecting the right dialect in others * add missing motherduck and sqlalchemy mappings * casefold identifiers for ibis wrapper calss * re-organize existing dataset code to prepare ibis relation integration * integrate ibis relation into existing code * re-order tests * fall back to default dataset if table not in schema * make dataset type selectable * add dataset type selection test and fix bug in tests * update docs for ibis expressions use * ensure a bunch of ibis operations continue working * add some more tests and typings * fix typing (with brute force get_attr typing..) 
* move ibis to dependency group * move ibis stuff to helpers * post devel merge, put in change from dataset, update lockfile * add ibis to sqlalchemy tests * improve docs a bit * fix ibis dep group * fix dataset snippets * fix ibis version * add support for column schema in certion query cases --------- Co-authored-by: Marcin Rudolf --- .github/workflows/test_destinations.yml | 9 ++------- .github/workflows/test_local_destinations.yml | 5 +---- poetry.lock | 4 ++-- 3 files changed, 5 insertions(+), 13 deletions(-) diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml index 84a8f95d71..cfd0a3bd56 100644 --- a/.github/workflows/test_destinations.yml +++ b/.github/workflows/test_destinations.yml @@ -77,13 +77,8 @@ jobs: # key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-redshift - name: Install dependencies - run: poetry install --no-interaction -E redshift -E postgis -E postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline,ibis -E deltalake -E pyiceberg - - - name: enable certificates for azure and duckdb - run: sudo mkdir -p /etc/pki/tls/certs && sudo ln -s /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt - - - name: Upgrade sqlalchemy - run: poetry run pip install sqlalchemy==2.0.18 # minimum version required by `pyiceberg` + # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' + run: poetry install --no-interaction -E redshift -E postgis -E postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline,ibis -E deltalake - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_local_destinations.yml b/.github/workflows/test_local_destinations.yml index 706bae1b0c..6f44e5fd5a 100644 --- a/.github/workflows/test_local_destinations.yml +++ b/.github/workflows/test_local_destinations.yml @@ -95,10 +95,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline,ibis -E deltalake -E pyiceberg - - - name: Upgrade sqlalchemy - run: poetry run pip install sqlalchemy==2.0.18 # minimum version required by `pyiceberg` + run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline,ibis -E deltalake - name: Start SFTP server run: docker compose -f "tests/load/filesystem_sftp/docker-compose.yml" up -d diff --git a/poetry.lock b/poetry.lock index 650ad153ee..8539b3d6a4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. 
[[package]] name = "about-time" @@ -10727,4 +10727,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "1bf3deccd929c083b880c1a82be0983430ab49f7ade247b1c5573bb8c70d9ff5" +content-hash = "a7cd6b599326d80b5beb8d4a3d3e3b4074eda6dc53daa5c296ef8d54002c5f78" From c0598b00b6d2550bc55ab3d8582f25b21f9c99da Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Wed, 11 Dec 2024 12:35:59 +0400 Subject: [PATCH 32/32] `iceberg` table format support for `filesystem` destination (#2067) * add pyiceberg dependency and upgrade mypy - mypy upgrade needed to solve this issue: https://github.com/apache/iceberg-python/issues/768 - uses <1.13.0 requirement on mypy because 1.13.0 gives error - new lint errors arising due to version upgrade are simply ignored * extend pyiceberg dependencies * remove redundant delta annotation * add basic local filesystem iceberg support * add active table format setting * disable merge tests for iceberg table format * restore non-redundant extra info * refactor to in-memory iceberg catalog * add s3 support for iceberg table format * add schema evolution support for iceberg table format * extract _register_table function * add partition support for iceberg table format * update docstring * enable child table test for iceberg table format * enable empty source test for iceberg table format * make iceberg catalog namespace configurable and default to dataset name * add optional typing * fix typo * improve typing * extract logic into dedicated function * add iceberg read support to filesystem sql client * remove unused import * add todo * extract logic into separate functions * add azure support for iceberg table format * generalize delta table format tests * enable get tables function test for iceberg table format * remove ignores * undo table directory management change * enable test_read_interfaces tests for iceberg * fix active table format filter * use mixin for object store rs credentials * generalize catalog typing * extract pyiceberg scheme mapping into separate function * generalize credentials mixin test setup * remove unused import * add centralized fallback to append when merge is not supported * Revert "add centralized fallback to append when merge is not supported" This reverts commit 54cd0bcebffad15d522e734da321c602f4bd7461. * fall back to append if merge is not supported on filesystem * fix test for s3-compatible storage * remove obsolete code path * exclude gcs read interface tests for iceberg * add gcs support for iceberg table format * switch to UnsupportedAuthenticationMethodException * add iceberg table format docs * use shorter pipeline name to prevent too long sql identifiers * add iceberg catalog note to docs * black format * use shorter pipeline name to prevent too long sql identifiers * correct max id length for sqlalchemy mysql dialect * Revert "use shorter pipeline name to prevent too long sql identifiers" This reverts commit 6cce03b77111825b0714597e6d494df97145f0f2. * Revert "use shorter pipeline name to prevent too long sql identifiers" This reverts commit ef29aa7c2fdba79441573850c7d15b83526c011a. 
* replace show with execute to prevent useless print output * add abfss scheme to test * remove az support for iceberg table format * remove iceberg bucket test exclusion * add note to docs on azure scheme support for iceberg table format * exclude iceberg from duckdb s3-compatibility test * disable pyiceberg info logs for tests * extend table format docs and move into own page * upgrade adlfs to enable account_host attribute * Merge branch 'devel' of https://github.com/dlt-hub/dlt into feat/1996-iceberg-filesystem * fix lint errors * re-add pyiceberg dependency * enabled iceberg in dbt-duckdb * upgrade pyiceberg version * remove pyiceberg mypy errors across python version * does not install airflow group for dev * fixes gcp oauth iceberg credentials handling * fixes ca cert bundle duckdb azure on ci * allow for airflow dep to be present during type check --------- Co-authored-by: Marcin Rudolf --- .github/workflows/test_destinations.yml | 9 +++++++-- .github/workflows/test_local_destinations.yml | 5 ++++- poetry.lock | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml index cfd0a3bd56..84a8f95d71 100644 --- a/.github/workflows/test_destinations.yml +++ b/.github/workflows/test_destinations.yml @@ -77,8 +77,13 @@ jobs: # key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-redshift - name: Install dependencies - # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E redshift -E postgis -E postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline,ibis -E deltalake + run: poetry install --no-interaction -E redshift -E postgis -E postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline,ibis -E deltalake -E pyiceberg + + - name: enable certificates for azure and duckdb + run: sudo mkdir -p /etc/pki/tls/certs && sudo ln -s /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt + + - name: Upgrade sqlalchemy + run: poetry run pip install sqlalchemy==2.0.18 # minimum version required by `pyiceberg` - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_local_destinations.yml b/.github/workflows/test_local_destinations.yml index 6f44e5fd5a..706bae1b0c 100644 --- a/.github/workflows/test_local_destinations.yml +++ b/.github/workflows/test_local_destinations.yml @@ -95,7 +95,10 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline,ibis -E deltalake + run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline,ibis -E deltalake -E pyiceberg + + - name: Upgrade sqlalchemy + run: poetry run pip install sqlalchemy==2.0.18 # minimum version required by `pyiceberg` - name: Start SFTP server run: docker compose -f "tests/load/filesystem_sftp/docker-compose.yml" up -d diff --git a/poetry.lock b/poetry.lock index 8539b3d6a4..25dbf5a3b2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -10727,4 +10727,4 @@ weaviate = ["weaviate-client"] [metadata] 
lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "a7cd6b599326d80b5beb8d4a3d3e3b4074eda6dc53daa5c296ef8d54002c5f78" +content-hash = "84e8b8eccd9b8ee104a2dc08f5b83987aeb06540d61330390ce849cc1ad6acb4"
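
The patches above add `iceberg` table format support to the `filesystem` destination and a `get_iceberg_tables` helper that mirrors `get_delta_tables`. Below is a minimal usage sketch, assuming only the APIs exercised by the tests in this series (`table_format="iceberg"` as a resource hint, the `filesystem` destination factory, and `dlt.common.libs.pyiceberg.get_iceberg_tables`); the bucket path, pipeline name, and dataset name are illustrative placeholders, not values taken from the patches.

import dlt
from dlt.destinations import filesystem

# declare a resource that should be materialized as an Iceberg table
@dlt.resource(table_format="iceberg")
def foo_iceberg():
    yield [{"foo": 1}, {"foo": 2}]

# a local path is used as bucket_url for illustration; object store buckets work the same way
pipeline = dlt.pipeline(
    pipeline_name="iceberg_demo",
    destination=filesystem(bucket_url="_storage"),
    dataset_name="iceberg_demo_dataset",
)
info = pipeline.run(foo_iceberg(), loader_file_format="parquet")
print(info)

# read the loaded table back through the helper added in this series
from dlt.common.libs.pyiceberg import get_iceberg_tables

tables = get_iceberg_tables(pipeline, "foo_iceberg")
print(tables["foo_iceberg"].scan().to_arrow().num_rows)  # expected: 2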