Commit 49d67e4

Merge pull request #482 from Tishj/iceberg_to_ducklake_tests
Update `iceberg_to_ducklake` for DuckLake v0.3, add/improve tests
2 parents: d5a55cd + 19893af

File tree

3 files changed: +105 −13 lines

extension_config.cmake

Lines changed: 1 addition & 2 deletions
@@ -17,7 +17,7 @@ duckdb_extension_load(icu)
 duckdb_extension_load(ducklake
         LOAD_TESTS
         GIT_URL https://github.com/duckdb/ducklake
-        GIT_TAG c1ebd032eb4c763910551c08f4b61bdb8168f209
+        GIT_TAG dbb022506e21c27fc4d4cd3d14995af89955401a
 )

@@ -32,4 +32,3 @@ if (NOT MINGW)
     endif ()
 endif()
-
src/iceberg_functions/iceberg_to_ducklake.cpp

Lines changed: 20 additions & 11 deletions
@@ -439,7 +439,7 @@ struct DuckLakeDataFile {
 	}

 public:
-	//! Contains the stats used to write the 'ducklake_file_column_statistics'
+	//! Contains the stats used to write the 'ducklake_file_column_stats'
 	IcebergManifestEntry manifest_entry;
 	DuckLakePartition &partition;

@@ -1001,7 +1001,7 @@ struct IcebergToDuckLakeBindData : public TableFunctionData {
 public:
 	vector<string> CreateSQLStatements() {
 		//! Order to process in:
-		// - snapshot
+		// - snapshot + schema_versions
 		// - schema
 		// - table
 		// - partition_info
@@ -1026,6 +1026,13 @@ struct IcebergToDuckLakeBindData : public TableFunctionData {
 			auto &snapshot = it.second;

 			auto values = snapshot.FinalizeEntry(serializer);
+			if (snapshot.catalog_changes) {
+				auto snapshot_id = snapshot.snapshot_id;
+				auto schema_version = snapshot.base_schema_version;
+				sql.push_back(
+				    StringUtil::Format("INSERT INTO {METADATA_CATALOG}.ducklake_schema_versions VALUES (%llu, %llu);",
+				                       snapshot_id.GetIndex(), schema_version));
+			}
 			sql.push_back(StringUtil::Format("INSERT INTO {METADATA_CATALOG}.ducklake_snapshot %s", values));
 		}
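DuckLake 0.3 tracks, per catalog-changing snapshot, the schema version it was based on. For a hypothetical snapshot 42 created against schema version 0, the statement generated by the new branch would render roughly as (the {METADATA_CATALOG} placeholder is kept as it appears in the format string; the values are illustrative):

INSERT INTO {METADATA_CATALOG}.ducklake_schema_versions VALUES (42, 0);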

@@ -1083,7 +1090,7 @@ struct IcebergToDuckLakeBindData : public TableFunctionData {
 			auto data_file_id = data_file.data_file_id.GetIndex();
 			auto &start_snapshot = snapshots.at(data_file.start_snapshot);

-			//! ducklake_file_column_statistics
+			//! ducklake_file_column_stats
 			auto columns = table.GetColumnsAtSnapshot(start_snapshot);
 			for (auto &it : columns) {
 				auto column_id = it.first;
@@ -1130,11 +1137,11 @@ struct IcebergToDuckLakeBindData : public TableFunctionData {
 				auto contains_nan = stats.has_nan ? "true" : "false";
 				auto min_value = stats.lower_bound.IsNull() ? "NULL" : "'" + stats.lower_bound.ToString() + "'";
 				auto max_value = stats.upper_bound.IsNull() ? "NULL" : "'" + stats.upper_bound.ToString() + "'";
-				auto values = StringUtil::Format("VALUES(%d, %d, %d, %s, %s, %s, %s, %s, %s);", data_file_id,
+				auto values = StringUtil::Format("VALUES(%d, %d, %d, %s, %s, %s, %s, %s, %s, NULL);", data_file_id,
 				                                 table_id, column_id, column_size_bytes, value_count,
 				                                 null_count.ToString(), min_value, max_value, contains_nan);
-				sql.push_back(StringUtil::Format(
-				    "INSERT INTO {METADATA_CATALOG}.ducklake_file_column_statistics %s", values));
+				sql.push_back(
+				    StringUtil::Format("INSERT INTO {METADATA_CATALOG}.ducklake_file_column_stats %s", values));

 				if (!data_file.has_end && !column.has_end && !column.IsNested()) {
 					//! This data file is currently active, collect stats for it
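Besides the rename from ducklake_file_column_statistics to ducklake_file_column_stats, the 0.3 table expects one more column, which the conversion fills with NULL. A sketch of the resulting statement, with made-up ids and stats, in the order the format string binds them (data_file_id, table_id, column_id, column_size_bytes, value_count, null_count, min, max, contains_nan):

INSERT INTO {METADATA_CATALOG}.ducklake_file_column_stats VALUES(7, 2, 1, 4096, 1000, 0, '1', '999', false, NULL);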
@@ -1218,8 +1225,8 @@ struct IcebergToDuckLakeBindData : public TableFunctionData {
 				auto contains_nan = stats.contains_nan ? "true" : "false";
 				auto min_value = stats.min_value.IsNull() ? "NULL" : "'" + stats.min_value.ToString() + "'";
 				auto max_value = stats.max_value.IsNull() ? "NULL" : "'" + stats.max_value.ToString() + "'";
-				auto values = StringUtil::Format("VALUES(%d, %d, %s, %s, %s, %s);", table_id, column_id, contains_null,
-				                                 contains_nan, min_value, max_value);
+				auto values = StringUtil::Format("VALUES(%d, %d, %s, %s, %s, %s, NULL);", table_id, column_id,
+				                                 contains_null, contains_nan, min_value, max_value);
 				sql.push_back(
 				    StringUtil::Format("INSERT INTO {METADATA_CATALOG}.ducklake_table_column_stats %s", values));
 			}
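The table-level stats insert gains the same trailing NULL for a column newly expected in 0.3. Illustrative values, bound as (table_id, column_id, contains_null, contains_nan, min, max):

INSERT INTO {METADATA_CATALOG}.ducklake_table_column_stats VALUES(2, 1, false, false, '1', '999', NULL);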
@@ -1286,7 +1293,8 @@ struct IcebergToDuckLakeBindData : public TableFunctionData {
 				changes.push_back(StringUtil::Format("altered_table:%d", table_id));
 			}
 			auto snapshot_id = snapshot.snapshot_id.GetIndex();
-			auto values = StringUtil::Format("VALUES(%d, '%s');", snapshot_id, StringUtil::Join(changes, ","));
+			auto values =
+			    StringUtil::Format("VALUES(%d, '%s', NULL, NULL, NULL);", snapshot_id, StringUtil::Join(changes, ","));
 			sql.push_back(StringUtil::Format("INSERT INTO {METADATA_CATALOG}.ducklake_snapshot_changes %s", values));
 		}
 		sql.push_back("COMMIT TRANSACTION;");
@@ -1452,8 +1460,9 @@ struct IcebergToDuckLakeGlobalTableFunctionState : public GlobalTableFunctionState {
 				    "DuckLake version metadata is corrupt, the value can't be NULL and has to be of type VARCHAR");
 			}
 			auto version_string = value.GetValue<string>();
-			if (version_string != "0.2") {
-				throw InvalidInputException("'iceberg_to_ducklake' only support version 0.2 currently");
+			if (!StringUtil::StartsWith(version_string, "0.3")) {
+				throw InvalidInputException(
+				    "'iceberg_to_ducklake' only support version 0.3 currently, detected '%s' instead", version_string);
 			}
 		}
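The version string being checked presumably comes from the target DuckLake's key/value metadata table; a sketch of the equivalent lookup, assuming the standard ducklake_metadata layout:

SELECT value FROM {METADATA_CATALOG}.ducklake_metadata WHERE key = 'version';
-- accepted when the result starts with '0.3', e.g. '0.3' or '0.3.1'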

test/sql/iceberg_to_ducklake.test_slow

Lines changed: 84 additions & 0 deletions

@@ -0,0 +1,84 @@
# name: test/sql/iceberg_to_ducklake.test_slow
# group: [sql]

require-env ICEBERG_SERVER_AVAILABLE

require avro

require parquet

require iceberg

require httpfs

require ducklake

# Do not ignore 'HTTP' error messages!
set ignore_error_messages

statement ok
set enable_logging=true

statement ok
set logging_level='debug'

statement ok
CREATE SECRET (
	TYPE S3,
	KEY_ID 'admin',
	SECRET 'password',
	ENDPOINT '127.0.0.1:9000',
	URL_STYLE 'path',
	USE_SSL 0
);

statement ok
ATTACH '' AS my_datalake (
	TYPE ICEBERG,
	CLIENT_ID 'admin',
	CLIENT_SECRET 'password',
	ENDPOINT 'http://127.0.0.1:8181'
);

statement ok
ATTACH 'ducklake:duckdb:__TEST_DIR__/ducklake.duckdb' as my_ducklake (DATA_PATH '__TEST_DIR__/data_path');

statement ok
call iceberg_to_ducklake(
	'my_datalake',
	'my_ducklake',
	skip_tables := [
		'pyspark_iceberg_table_v2',
		'deletion_vectors',
		'variant_column',
		'simple_v3_table'
	]
)

# These are empty, so they are omitted:
# - insert_all_types
# - simple_v3_table
# - test_not_null
# - tpch

# These have an ALTER at the end, with no new snapshot added afterwards.
# Without a snapshot, the change isn't recorded by the conversion,
# so the results will not match:
# - schema_evolve_struct_in_list
# - schema_evolve_struct_in_map

foreach table_name lineitem_partitioned_l_shipmode_deletes all_types_table day_timestamp day_timestamptz empty_insert filtering_on_bounds filtering_on_partition_bounds issue_328 lineitem_001_deletes lineitem_partitioned_l_shipmode lineitem_sf_01_1_delete lineitem_sf_01_no_deletes lineitem_sf1_deletes many_adds_deletes nested_types pyspark_iceberg_table_v1 quickstart_table schema_evolve_float_to_double schema_evolve_int_to_bigint schema_evolve_struct schema_evolve_widen_decimal table_more_deletes table_partitioned table_unpartitioned table_with_deletes year_timestamp year_timestamptz

query I rowsort expected_res
select * from my_datalake.default.${table_name}
----

query I rowsort expected_res
select * from my_ducklake.default.${table_name}
----

reset label expected_res

# table_name
endloop
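Both queries in each iteration share the expected_res label, so the test runner compares their row-sorted results against each other rather than against a hard-coded expectation: the Iceberg table and its DuckLake conversion must return identical rows. For one iteration, e.g. quickstart_table, the loop body expands to:

query I rowsort expected_res
select * from my_datalake.default.quickstart_table
----

query I rowsort expected_res
select * from my_ducklake.default.quickstart_table
----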
