From 346b270da077f6a43a2822794560cc6f2a6bc7dc Mon Sep 17 00:00:00 2001
From: Marcin Rudolf
Date: Tue, 10 Dec 2024 23:24:48 +0100
Subject: [PATCH] fixes ca cert bundle duckdb azure on ci

---
 .github/workflows/test_destinations.yml                      | 4 +++-
 dlt/destinations/impl/filesystem/sql_client.py                | 3 ++-
 .../website/docs/dlt-ecosystem/destinations/delta-iceberg.md  | 5 +++--
 .../docs/general-usage/dataset-access/ibis-backend.md         | 3 +--
 4 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml
index f6204ab0aa..c2c66f539e 100644
--- a/.github/workflows/test_destinations.yml
+++ b/.github/workflows/test_destinations.yml
@@ -77,9 +77,11 @@ jobs:
         # key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-redshift

       - name: Install dependencies
-        # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
         run: poetry install --no-interaction -E redshift -E postgis -E postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline -E deltalake -E pyiceberg

+      - name: enable certificates for azure and duckdb
+        run: sudo mkdir -p /etc/pki/tls/certs && sudo ln -s /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt
+
       - name: Upgrade sqlalchemy
         run: poetry run pip install sqlalchemy==2.0.18 # minimum version required by `pyiceberg`

diff --git a/dlt/destinations/impl/filesystem/sql_client.py b/dlt/destinations/impl/filesystem/sql_client.py
index 6409167e29..d39f4c3431 100644
--- a/dlt/destinations/impl/filesystem/sql_client.py
+++ b/dlt/destinations/impl/filesystem/sql_client.py
@@ -13,6 +13,7 @@
 from dlt.common.destination.reference import DBApiCursor

+from dlt.common.storages.fsspec_filesystem import AZURE_BLOB_STORAGE_PROTOCOLS
 from dlt.destinations.sql_client import raise_database_error

 from dlt.destinations.impl.duckdb.sql_client import DuckDbSqlClient
@@ -193,7 +194,7 @@ def open_connection(self) -> duckdb.DuckDBPyConnection:

         # the line below solves problems with certificate path lookup on linux
         # see duckdb docs
-        if self.fs_client.config.protocol in ["az", "abfss"]:
+        if self.fs_client.config.protocol in AZURE_BLOB_STORAGE_PROTOCOLS:
             self._conn.sql("SET azure_transport_option_type = 'curl';")

         return self._conn
diff --git a/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md b/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md
index 6c0dfa50ee..7a056d6b40 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md
@@ -10,12 +10,13 @@ keywords: [delta, iceberg, destination, data warehouse]
 ## How it works

 `dlt` uses the [deltalake](https://pypi.org/project/deltalake/) and [pyiceberg](https://pypi.org/project/pyiceberg/) libraries to write Delta and Iceberg tables, respectively. One or multiple Parquet files are prepared during the extract and normalize steps. In the load step, these Parquet files are exposed as an Arrow data structure and fed into `deltalake` or `pyiceberg`.

-## Iceberg catalog
+## Iceberg single-user ephemeral catalog
 `dlt` uses single-table, ephemeral, in-memory, sqlite-based [Iceberg catalog](https://iceberg.apache.org/concepts/catalog/)s. These catalogs are created "on demand" when a pipeline is run, and do not persist afterwards. If a table already exists in the filesystem, it gets registered into the catalog using its latest metadata file. This allows for a serverless setup.
 It is currently not possible to connect your own Iceberg catalog.

 :::caution
 While ephemeral catalogs make it easy to get started with Iceberg, they come with limitations:
 - concurrent writes are not handled and may lead to corrupt table state
+- we cannot guarantee that reads concurrent with writes are clean
 - the latest manifest file needs to be searched for using file listing—this can become slow with large tables, especially in cloud object stores
 :::

@@ -69,7 +70,7 @@ pipeline.run(my_resource, table_format="delta")

 ## Table format partitioning

-Both `delta` and `iceberg` tables can be partitioned by specifying one or more `partition` column hints. This example partitions a Delta table by the `foo` column: 
+Both `delta` and `iceberg` tables can be partitioned by specifying one or more `partition` column hints. This example partitions a Delta table by the `foo` column:

 ```py
 @dlt.resource(
diff --git a/docs/website/docs/general-usage/dataset-access/ibis-backend.md b/docs/website/docs/general-usage/dataset-access/ibis-backend.md
index 8f4b0fb6b6..9f9b65e9c0 100644
--- a/docs/website/docs/general-usage/dataset-access/ibis-backend.md
+++ b/docs/website/docs/general-usage/dataset-access/ibis-backend.md
@@ -6,7 +6,7 @@ keywords: [data, dataset, ibis]

 # Ibis

-Ibis is a powerful portable Python dataframe library. Learn more about what it is and how to use it in the [official documentation](https://ibis-project.org/). 
+Ibis is a powerful portable Python dataframe library. Learn more about what it is and how to use it in the [official documentation](https://ibis-project.org/).

 `dlt` provides an easy way to hand over your loaded dataset to an Ibis backend connection.

@@ -46,4 +46,3 @@ print(table.limit(10).execute())

 # Visit the ibis docs to learn more about the available methods
 ```
-
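
For context, a minimal sketch (not part of the patch) of what the `sql_client.py` change above does: DuckDB's Azure extension is switched to its curl transport, which on Linux resolves TLS certificates through the system CA bundle; the CI step symlinks `/etc/ssl/certs/ca-certificates.crt` to `/etc/pki/tls/certs/ca-bundle.crt`, a path some curl builds look for but which Ubuntu runners lack. The standalone connection below is illustrative only; in `dlt` the setting is applied inside `FilesystemSqlClient.open_connection` for protocols in `AZURE_BLOB_STORAGE_PROTOCOLS`.

```py
# Illustrative sketch: assumes the duckdb Python package is installed and the
# azure extension can be fetched; mirrors the setting the patch applies.
import duckdb

conn = duckdb.connect()
conn.sql("INSTALL azure;")
conn.sql("LOAD azure;")
# Use the curl transport so certificate lookup on Linux goes through the system
# CA bundle (the CI step above creates /etc/pki/tls/certs/ca-bundle.crt as a
# symlink so that lookup succeeds on Ubuntu runners).
conn.sql("SET azure_transport_option_type = 'curl';")
```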