From 40eba75ce7bdecc93a7b0d553906b8be12bb73e0 Mon Sep 17 00:00:00 2001 From: Tishj Date: Wed, 17 Sep 2025 10:56:43 +0200 Subject: [PATCH 1/9] point to v1.4-andium in the distribution pipeline --- .github/workflows/MainDistributionPipeline.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index a44f45c5..a6651294 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -17,7 +17,7 @@ jobs: uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main with: extension_name: iceberg - duckdb_version: main + duckdb_version: v1.4-andium ci_tools_version: main exclude_archs: 'windows_amd64_mingw' extra_toolchains: 'python3' @@ -29,7 +29,7 @@ jobs: secrets: inherit with: extension_name: iceberg - duckdb_version: main + duckdb_version: v1.4-andium ci_tools_version: main exclude_archs: 'windows_amd64_mingw' deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }} From a88de343894114a44aab945b5add78a617e57f85 Mon Sep 17 00:00:00 2001 From: Tishj Date: Tue, 16 Sep 2025 13:07:15 +0200 Subject: [PATCH 2/9] add test for iceberg_to_ducklake, checking all converted tables --- src/iceberg_functions/iceberg_to_ducklake.cpp | 5 +- test/sql/iceberg_to_ducklake.test | 82 +++++++++++++++++++ 2 files changed, 85 insertions(+), 2 deletions(-) create mode 100644 test/sql/iceberg_to_ducklake.test diff --git a/src/iceberg_functions/iceberg_to_ducklake.cpp b/src/iceberg_functions/iceberg_to_ducklake.cpp index ffb080c1..f95d9abe 100644 --- a/src/iceberg_functions/iceberg_to_ducklake.cpp +++ b/src/iceberg_functions/iceberg_to_ducklake.cpp @@ -1452,8 +1452,9 @@ struct IcebergToDuckLakeGlobalTableFunctionState : public GlobalTableFunctionSta "DuckLake version metadata is corrupt, the value can't be NULL and has to be of type VARCHAR"); } auto version_string = value.GetValue(); - if (version_string != "0.2") { - throw InvalidInputException("'iceberg_to_ducklake' only support version 0.2 currently"); + if (!StringUtil::StartsWith(version_string, "0.3")) { + throw InvalidInputException( + "'iceberg_to_ducklake' only support version 0.3 currently, detected '%s' instead", version_string); } } diff --git a/test/sql/iceberg_to_ducklake.test b/test/sql/iceberg_to_ducklake.test new file mode 100644 index 00000000..ed522a56 --- /dev/null +++ b/test/sql/iceberg_to_ducklake.test @@ -0,0 +1,82 @@ +# name: test/sql/iceberg_to_ducklake.test +# group: [sql] + +require-env ICEBERG_SERVER_AVAILABLE + +require avro + +require parquet + +require iceberg + +require httpfs + +require ducklake + +# Do not ignore 'HTTP' error messages! +set ignore_error_messages + +statement ok +set enable_logging=true + +statement ok +set logging_level='debug' + +statement ok +CREATE SECRET ( + TYPE S3, + KEY_ID 'admin', + SECRET 'password', + ENDPOINT '127.0.0.1:9000', + URL_STYLE 'path', + USE_SSL 0 +); + + +statement ok +ATTACH '' AS my_datalake ( + TYPE ICEBERG, + CLIENT_ID 'admin', + CLIENT_SECRET 'password', + ENDPOINT 'http://127.0.0.1:8181' +); + +statement ok +ATTACH 'ducklake:duckdb:__TEST_DIR__/ducklake.duckdb' as my_ducklake (DATA_PATH '__TEST_DIR__/data_path'); + +statement ok +call iceberg_to_ducklake( + 'my_datalake', + 'my_ducklake', + skip_tables := [ + 'pyspark_iceberg_table_v2', + 'deletion_vectors', + 'variant_column' + ] +) + +# NOTE: these produce wrong results/errors, omitted for now: +# - lineitem_partitioned_l_shipmode_deletes +# - schema_evolve_struct_in_list +# - schema_evolve_struct_in_map + +# These are empty, so they are omitted: +# - insert_all_types +# - simple_v3_table +# - test_not_null +# - tpch + +foreach table_name all_types_table day_timestamp day_timestamptz empty_insert filtering_on_bounds filtering_on_partition_bounds issue_328 lineitem_001_deletes lineitem_partitioned_l_shipmode lineitem_sf_01_1_delete lineitem_sf_01_no_deletes lineitem_sf1_deletes many_adds_deletes nested_types pyspark_iceberg_table_v1 quickstart_table schema_evolve_float_to_double schema_evolve_int_to_bigint schema_evolve_struct schema_evolve_widen_decimal table_more_deletes table_partitioned table_unpartitioned table_with_deletes year_timestamp year_timestamptz + +query I rowsort expected_res +select * from my_datalake.default.${table_name} +---- + +query I rowsort expected_res +select * from my_ducklake.default.${table_name} +---- + +reset label expected_res + +# table_name +endloop From 7a1fd41751b8e0cbd0307848a6d75106ac46e750 Mon Sep 17 00:00:00 2001 From: Tishj Date: Tue, 16 Sep 2025 14:12:59 +0200 Subject: [PATCH 3/9] attempting to fix iceberg_to_ducklake --- extension_config.cmake | 5 ++--- src/iceberg_functions/iceberg_to_ducklake.cpp | 9 ++++++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/extension_config.cmake b/extension_config.cmake index e70c9c64..6ff25eeb 100644 --- a/extension_config.cmake +++ b/extension_config.cmake @@ -17,7 +17,7 @@ duckdb_extension_load(icu) duckdb_extension_load(ducklake LOAD_TESTS GIT_URL https://github.com/duckdb/ducklake - GIT_TAG c1ebd032eb4c763910551c08f4b61bdb8168f209 + GIT_TAG e0a7b6ed5f2e6a52424d66fff0f91c87bc309022 ) @@ -27,9 +27,8 @@ if (NOT MINGW) duckdb_extension_load(aws LOAD_TESTS GIT_URL https://github.com/duckdb/duckdb-aws - GIT_TAG f855eb3dce37700bfd36fe906a683e4be17dcaf6 + GIT_TAG 880da03202acc973d6ee7f3a0423dae5a6dea83b ) endif () endif() - diff --git a/src/iceberg_functions/iceberg_to_ducklake.cpp b/src/iceberg_functions/iceberg_to_ducklake.cpp index f95d9abe..9b69338d 100644 --- a/src/iceberg_functions/iceberg_to_ducklake.cpp +++ b/src/iceberg_functions/iceberg_to_ducklake.cpp @@ -1001,7 +1001,7 @@ struct IcebergToDuckLakeBindData : public TableFunctionData { public: vector CreateSQLStatements() { //! Order to process in: - // - snapshot + // - snapshot + schema_versions // - schema // - table // - partition_info @@ -1026,6 +1026,13 @@ struct IcebergToDuckLakeBindData : public TableFunctionData { auto &snapshot = it.second; auto values = snapshot.FinalizeEntry(serializer); + if (snapshot.catalog_changes) { + auto snapshot_id = snapshot.snapshot_id; + auto schema_version = snapshot.base_schema_version; + sql.push_back( + StringUtil::Format("INSERT INTO {METADATA_CATALOG}.ducklake_schema_versions VALUES (%llu, %llu);", + snapshot_id.GetIndex(), schema_version)); + } sql.push_back(StringUtil::Format("INSERT INTO {METADATA_CATALOG}.ducklake_snapshot %s", values)); } From d9d53b297be073625bc59e1c76728909513355b8 Mon Sep 17 00:00:00 2001 From: Tishj Date: Tue, 16 Sep 2025 14:41:23 +0200 Subject: [PATCH 4/9] update for v1.4.0 DuckLake --- extension_config.cmake | 41 +++++++++++++++++++ src/iceberg_functions/iceberg_to_ducklake.cpp | 17 ++++---- ...ake.test => iceberg_to_ducklake.test_slow} | 10 +++-- 3 files changed, 57 insertions(+), 11 deletions(-) rename test/sql/{iceberg_to_ducklake.test => iceberg_to_ducklake.test_slow} (89%) diff --git a/extension_config.cmake b/extension_config.cmake index 6ff25eeb..36169117 100644 --- a/extension_config.cmake +++ b/extension_config.cmake @@ -12,6 +12,47 @@ duckdb_extension_load(iceberg LINKED_LIBS "../../vcpkg_installed/wasm32-emscripten/lib/*.a" ) +duckdb_extension_load(tpch) +duckdb_extension_load(icu) +duckdb_extension_load(ducklake + LOAD_TESTS + GIT_URL https://github.com/duckdb/ducklake + GIT_TAG 09f9b85b7ea1c5c4a14ebfb83dd8ac5c9d65a874 +) + +<<<<<<< HEAD +======= +duckdb_extension_load(avro + LOAD_TESTS + GIT_URL https://github.com/duckdb/duckdb-avro + GIT_TAG 0c97a61781f63f8c5444cf3e0c6881ecbaa9fe13 +) +>>>>>>> d96105af (update for v1.4.0 DuckLake) + +if (NOT EMSCRIPTEN) +################## AWS +if (NOT MINGW) + duckdb_extension_load(aws + LOAD_TESTS + GIT_URL https://github.com/duckdb/duckdb-aws + GIT_TAG 812ce80fde0bfa6e4641b6fd798087349a610795 + ) +endif () +endif() +# This file is included by DuckDB's build system. It specifies which extension to load +duckdb_extension_load(avro + LOAD_TESTS + GIT_URL https://github.com/duckdb/duckdb-avro + GIT_TAG 0c97a61781f63f8c5444cf3e0c6881ecbaa9fe13 +) + +# Extension from this repo +duckdb_extension_load(iceberg + SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR} + LOAD_TESTS + LINKED_LIBS "../../vcpkg_installed/wasm32-emscripten/lib/*.a" +) + duckdb_extension_load(tpch) duckdb_extension_load(icu) duckdb_extension_load(ducklake diff --git a/src/iceberg_functions/iceberg_to_ducklake.cpp b/src/iceberg_functions/iceberg_to_ducklake.cpp index 9b69338d..a4b22a75 100644 --- a/src/iceberg_functions/iceberg_to_ducklake.cpp +++ b/src/iceberg_functions/iceberg_to_ducklake.cpp @@ -439,7 +439,7 @@ struct DuckLakeDataFile { } public: - //! Contains the stats used to write the 'ducklake_file_column_statistics' + //! Contains the stats used to write the 'ducklake_file_column_stats' IcebergManifestEntry manifest_entry; DuckLakePartition &partition; @@ -1090,7 +1090,7 @@ struct IcebergToDuckLakeBindData : public TableFunctionData { auto data_file_id = data_file.data_file_id.GetIndex(); auto &start_snapshot = snapshots.at(data_file.start_snapshot); - //! ducklake_file_column_statistics + //! ducklake_file_column_stats auto columns = table.GetColumnsAtSnapshot(start_snapshot); for (auto &it : columns) { auto column_id = it.first; @@ -1137,11 +1137,11 @@ struct IcebergToDuckLakeBindData : public TableFunctionData { auto contains_nan = stats.has_nan ? "true" : "false"; auto min_value = stats.lower_bound.IsNull() ? "NULL" : "'" + stats.lower_bound.ToString() + "'"; auto max_value = stats.upper_bound.IsNull() ? "NULL" : "'" + stats.upper_bound.ToString() + "'"; - auto values = StringUtil::Format("VALUES(%d, %d, %d, %s, %s, %s, %s, %s, %s);", data_file_id, + auto values = StringUtil::Format("VALUES(%d, %d, %d, %s, %s, %s, %s, %s, %s, NULL);", data_file_id, table_id, column_id, column_size_bytes, value_count, null_count.ToString(), min_value, max_value, contains_nan); - sql.push_back(StringUtil::Format( - "INSERT INTO {METADATA_CATALOG}.ducklake_file_column_statistics %s", values)); + sql.push_back( + StringUtil::Format("INSERT INTO {METADATA_CATALOG}.ducklake_file_column_stats %s", values)); if (!data_file.has_end && !column.has_end && !column.IsNested()) { //! This data file is currently active, collect stats for it @@ -1225,8 +1225,8 @@ struct IcebergToDuckLakeBindData : public TableFunctionData { auto contains_nan = stats.contains_nan ? "true" : "false"; auto min_value = stats.min_value.IsNull() ? "NULL" : "'" + stats.min_value.ToString() + "'"; auto max_value = stats.max_value.IsNull() ? "NULL" : "'" + stats.max_value.ToString() + "'"; - auto values = StringUtil::Format("VALUES(%d, %d, %s, %s, %s, %s);", table_id, column_id, contains_null, - contains_nan, min_value, max_value); + auto values = StringUtil::Format("VALUES(%d, %d, %s, %s, %s, %s, NULL);", table_id, column_id, + contains_null, contains_nan, min_value, max_value); sql.push_back( StringUtil::Format("INSERT INTO {METADATA_CATALOG}.ducklake_table_column_stats %s", values)); } @@ -1293,7 +1293,8 @@ struct IcebergToDuckLakeBindData : public TableFunctionData { changes.push_back(StringUtil::Format("altered_table:%d", table_id)); } auto snapshot_id = snapshot.snapshot_id.GetIndex(); - auto values = StringUtil::Format("VALUES(%d, '%s');", snapshot_id, StringUtil::Join(changes, ",")); + auto values = + StringUtil::Format("VALUES(%d, '%s', NULL, NULL, NULL);", snapshot_id, StringUtil::Join(changes, ",")); sql.push_back(StringUtil::Format("INSERT INTO {METADATA_CATALOG}.ducklake_snapshot_changes %s", values)); } sql.push_back("COMMIT TRANSACTION;"); diff --git a/test/sql/iceberg_to_ducklake.test b/test/sql/iceberg_to_ducklake.test_slow similarity index 89% rename from test/sql/iceberg_to_ducklake.test rename to test/sql/iceberg_to_ducklake.test_slow index ed522a56..3e05d006 100644 --- a/test/sql/iceberg_to_ducklake.test +++ b/test/sql/iceberg_to_ducklake.test_slow @@ -1,4 +1,4 @@ -# name: test/sql/iceberg_to_ducklake.test +# name: test/sql/iceberg_to_ducklake.test_slow # group: [sql] require-env ICEBERG_SERVER_AVAILABLE @@ -57,8 +57,6 @@ call iceberg_to_ducklake( # NOTE: these produce wrong results/errors, omitted for now: # - lineitem_partitioned_l_shipmode_deletes -# - schema_evolve_struct_in_list -# - schema_evolve_struct_in_map # These are empty, so they are omitted: # - insert_all_types @@ -66,6 +64,12 @@ call iceberg_to_ducklake( # - test_not_null # - tpch +# These have an ALTER at the end, with no new snapshot added afterwards +# Without a snapshot, that means the change isn't recorded by the conversion +# So the results will not match +# - schema_evolve_struct_in_list +# - schema_evolve_struct_in_map + foreach table_name all_types_table day_timestamp day_timestamptz empty_insert filtering_on_bounds filtering_on_partition_bounds issue_328 lineitem_001_deletes lineitem_partitioned_l_shipmode lineitem_sf_01_1_delete lineitem_sf_01_no_deletes lineitem_sf1_deletes many_adds_deletes nested_types pyspark_iceberg_table_v1 quickstart_table schema_evolve_float_to_double schema_evolve_int_to_bigint schema_evolve_struct schema_evolve_widen_decimal table_more_deletes table_partitioned table_unpartitioned table_with_deletes year_timestamp year_timestamptz query I rowsort expected_res From 0c634e0b6e4fa3261f7ab3a2a7e734f8cf551b0f Mon Sep 17 00:00:00 2001 From: Tishj Date: Tue, 16 Sep 2025 17:31:31 +0200 Subject: [PATCH 5/9] fixes --- extension_config.cmake | 43 +------------------------- test/sql/iceberg_to_ducklake.test_slow | 3 +- 2 files changed, 3 insertions(+), 43 deletions(-) diff --git a/extension_config.cmake b/extension_config.cmake index 36169117..fb152869 100644 --- a/extension_config.cmake +++ b/extension_config.cmake @@ -17,48 +17,7 @@ duckdb_extension_load(icu) duckdb_extension_load(ducklake LOAD_TESTS GIT_URL https://github.com/duckdb/ducklake - GIT_TAG 09f9b85b7ea1c5c4a14ebfb83dd8ac5c9d65a874 -) - -<<<<<<< HEAD -======= -duckdb_extension_load(avro - LOAD_TESTS - GIT_URL https://github.com/duckdb/duckdb-avro - GIT_TAG 0c97a61781f63f8c5444cf3e0c6881ecbaa9fe13 -) ->>>>>>> d96105af (update for v1.4.0 DuckLake) - -if (NOT EMSCRIPTEN) -################## AWS -if (NOT MINGW) - duckdb_extension_load(aws - LOAD_TESTS - GIT_URL https://github.com/duckdb/duckdb-aws - GIT_TAG 812ce80fde0bfa6e4641b6fd798087349a610795 - ) -endif () -endif() -# This file is included by DuckDB's build system. It specifies which extension to load -duckdb_extension_load(avro - LOAD_TESTS - GIT_URL https://github.com/duckdb/duckdb-avro - GIT_TAG 0c97a61781f63f8c5444cf3e0c6881ecbaa9fe13 -) - -# Extension from this repo -duckdb_extension_load(iceberg - SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR} - LOAD_TESTS - LINKED_LIBS "../../vcpkg_installed/wasm32-emscripten/lib/*.a" -) - -duckdb_extension_load(tpch) -duckdb_extension_load(icu) -duckdb_extension_load(ducklake - LOAD_TESTS - GIT_URL https://github.com/duckdb/ducklake - GIT_TAG e0a7b6ed5f2e6a52424d66fff0f91c87bc309022 + GIT_TAG dbb022506e21c27fc4d4cd3d14995af89955401a ) diff --git a/test/sql/iceberg_to_ducklake.test_slow b/test/sql/iceberg_to_ducklake.test_slow index 3e05d006..5052cafa 100644 --- a/test/sql/iceberg_to_ducklake.test_slow +++ b/test/sql/iceberg_to_ducklake.test_slow @@ -51,7 +51,8 @@ call iceberg_to_ducklake( skip_tables := [ 'pyspark_iceberg_table_v2', 'deletion_vectors', - 'variant_column' + 'variant_column', + 'simple_v3_table' ] ) From 6f8e534875979362c084ee8874afb837f67bf248 Mon Sep 17 00:00:00 2001 From: Tishj Date: Wed, 17 Sep 2025 10:34:36 +0200 Subject: [PATCH 6/9] update ducklake, add table that errored before --- test/sql/iceberg_to_ducklake.test_slow | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/test/sql/iceberg_to_ducklake.test_slow b/test/sql/iceberg_to_ducklake.test_slow index 5052cafa..57db5d3e 100644 --- a/test/sql/iceberg_to_ducklake.test_slow +++ b/test/sql/iceberg_to_ducklake.test_slow @@ -56,9 +56,6 @@ call iceberg_to_ducklake( ] ) -# NOTE: these produce wrong results/errors, omitted for now: -# - lineitem_partitioned_l_shipmode_deletes - # These are empty, so they are omitted: # - insert_all_types # - simple_v3_table @@ -71,7 +68,7 @@ call iceberg_to_ducklake( # - schema_evolve_struct_in_list # - schema_evolve_struct_in_map -foreach table_name all_types_table day_timestamp day_timestamptz empty_insert filtering_on_bounds filtering_on_partition_bounds issue_328 lineitem_001_deletes lineitem_partitioned_l_shipmode lineitem_sf_01_1_delete lineitem_sf_01_no_deletes lineitem_sf1_deletes many_adds_deletes nested_types pyspark_iceberg_table_v1 quickstart_table schema_evolve_float_to_double schema_evolve_int_to_bigint schema_evolve_struct schema_evolve_widen_decimal table_more_deletes table_partitioned table_unpartitioned table_with_deletes year_timestamp year_timestamptz +foreach table_name lineitem_partitioned_l_shipmode_deletes all_types_table day_timestamp day_timestamptz empty_insert filtering_on_bounds filtering_on_partition_bounds issue_328 lineitem_001_deletes lineitem_partitioned_l_shipmode lineitem_sf_01_1_delete lineitem_sf_01_no_deletes lineitem_sf1_deletes many_adds_deletes nested_types pyspark_iceberg_table_v1 quickstart_table schema_evolve_float_to_double schema_evolve_int_to_bigint schema_evolve_struct schema_evolve_widen_decimal table_more_deletes table_partitioned table_unpartitioned table_with_deletes year_timestamp year_timestamptz query I rowsort expected_res select * from my_datalake.default.${table_name} From bdedfdc8d9db612ad4a811d9fd5d57769ec55c1a Mon Sep 17 00:00:00 2001 From: Tishj Date: Wed, 17 Sep 2025 11:02:27 +0200 Subject: [PATCH 7/9] undo --- extension_config.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extension_config.cmake b/extension_config.cmake index fb152869..6cc4fdb1 100644 --- a/extension_config.cmake +++ b/extension_config.cmake @@ -27,7 +27,7 @@ if (NOT MINGW) duckdb_extension_load(aws LOAD_TESTS GIT_URL https://github.com/duckdb/duckdb-aws - GIT_TAG 880da03202acc973d6ee7f3a0423dae5a6dea83b + GIT_TAG f855eb3dce37700bfd36fe906a683e4be17dcaf6 ) endif () endif() From cef50c71d9cd2ae853ef718829d69356d580d22c Mon Sep 17 00:00:00 2001 From: Tishj Date: Wed, 17 Sep 2025 11:08:30 +0200 Subject: [PATCH 8/9] v1.4.0 --- .github/workflows/MainDistributionPipeline.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index a6651294..8ce67153 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -17,7 +17,7 @@ jobs: uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main with: extension_name: iceberg - duckdb_version: v1.4-andium + duckdb_version: v1.4.0 ci_tools_version: main exclude_archs: 'windows_amd64_mingw' extra_toolchains: 'python3' @@ -29,7 +29,7 @@ jobs: secrets: inherit with: extension_name: iceberg - duckdb_version: v1.4-andium + duckdb_version: v1.4.0 ci_tools_version: main exclude_archs: 'windows_amd64_mingw' deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }} From 19893afef27f246761f09599c94b3f94905424dd Mon Sep 17 00:00:00 2001 From: Tishj Date: Wed, 17 Sep 2025 11:23:54 +0200 Subject: [PATCH 9/9] rerun CI