diff --git a/src/ducklake_extension.cpp b/src/ducklake_extension.cpp
index 418a02a6a2b..e0e86a5672b 100644
--- a/src/ducklake_extension.cpp
+++ b/src/ducklake_extension.cpp
@@ -34,6 +34,8 @@ static void LoadInternal(ExtensionLoader &loader) {
 	config.AddExtensionOption("ducklake_default_data_inlining_row_limit",
 	                          "Default row limit for data inlining (0 disables inlining)", LogicalType::UBIGINT,
 	                          Value::UBIGINT(10), nullptr, SetScope::GLOBAL);
+	config.AddExtensionOption("ducklake_target_file_size", "Target file size for insertion and compaction",
+	                          LogicalType::VARCHAR, Value(), nullptr, SetScope::GLOBAL);
 	config.AddExtensionOption(
 	    "ducklake_write_deletion_vectors",
 	    "[EXPERIMENTAL] Write Iceberg V3 deletion vectors (puffin) instead of positional delete files (parquet)",
diff --git a/src/functions/ducklake_compaction_functions.cpp b/src/functions/ducklake_compaction_functions.cpp
index 92ad52f05f4..13ff3b1a7fa 100644
--- a/src/functions/ducklake_compaction_functions.cpp
+++ b/src/functions/ducklake_compaction_functions.cpp
@@ -259,11 +259,7 @@ void DuckLakeCompactor::GenerateCompactions(DuckLakeTableEntry &table,
 	auto &metadata_manager = transaction.GetMetadataManager();
 	auto snapshot = transaction.GetSnapshot();
 
-	idx_t target_file_size = DuckLakeCatalog::DEFAULT_TARGET_FILE_SIZE;
-	string target_file_size_str;
-	if (catalog.TryGetConfigOption("target_file_size", target_file_size_str, table)) {
-		target_file_size = Value(target_file_size_str).DefaultCastAs(LogicalType::UBIGINT).GetValue<idx_t>();
-	}
+	idx_t target_file_size = catalog.GetTargetFileSize(context, table);
 
 	DuckLakeFileSizeOptions filter_options;
 	filter_options.min_file_size = options.min_file_size;
diff --git a/src/include/storage/ducklake_catalog.hpp b/src/include/storage/ducklake_catalog.hpp
index f3d5cb014ab..807257c0813 100644
--- a/src/include/storage/ducklake_catalog.hpp
+++ b/src/include/storage/ducklake_catalog.hpp
@@ -116,6 +116,8 @@ class DuckLakeCatalog : public Catalog {
 	idx_t DataInliningRowLimit(ClientContext &context, SchemaIndex schema_index, TableIndex table_index) const;
 	//! Returns the inlining limit (0 if the table is not eligible)
 	idx_t GetInliningLimit(ClientContext &context, DuckLakeTableEntry &table);
+	idx_t GetTargetFileSize(ClientContext &context, SchemaIndex schema_id, TableIndex table_id) const;
+	idx_t GetTargetFileSize(ClientContext &context, DuckLakeTableEntry &table) const;
 	string &Separator() {
 		return separator;
 	}
diff --git a/src/storage/ducklake_catalog.cpp b/src/storage/ducklake_catalog.cpp
index c2c7e1f8a5a..f4cbfe26c66 100644
--- a/src/storage/ducklake_catalog.cpp
+++ b/src/storage/ducklake_catalog.cpp
@@ -5,6 +5,7 @@
 #include "duckdb/catalog/catalog_entry/schema_catalog_entry.hpp"
 #include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp"
 #include "duckdb/main/attached_database.hpp"
+#include "duckdb/main/config.hpp"
 #include "duckdb/parser/constraints/not_null_constraint.hpp"
 #include "duckdb/parser/parsed_data/create_schema_info.hpp"
 #include "duckdb/parser/parsed_data/create_table_info.hpp"
@@ -901,6 +902,20 @@ idx_t DuckLakeCatalog::DataInliningRowLimit(ClientContext &context, SchemaIndex
 	return 10;
 }
 
+idx_t DuckLakeCatalog::GetTargetFileSize(ClientContext &context, SchemaIndex schema_id, TableIndex table_id) const {
+	Value setting_val;
+	if (context.TryGetCurrentSetting("ducklake_target_file_size", setting_val) && !setting_val.IsNull() &&
+	    !setting_val.ToString().empty()) {
+		return DBConfig::ParseMemoryLimit(setting_val.ToString());
+	}
+	return GetConfigOption("target_file_size", schema_id, table_id, DEFAULT_TARGET_FILE_SIZE);
+}
+
+idx_t DuckLakeCatalog::GetTargetFileSize(ClientContext &context, DuckLakeTableEntry &table) const {
+	auto &schema = table.ParentSchema().Cast<DuckLakeSchemaEntry>();
+	return GetTargetFileSize(context, schema.GetSchemaId(), table.GetTableId());
+}
+
 idx_t DuckLakeCatalog::GetInliningLimit(ClientContext &context, DuckLakeTableEntry &table) {
 	auto &schema = table.ParentSchema().Cast<DuckLakeSchemaEntry>();
 	idx_t limit = DataInliningRowLimit(context, schema.GetSchemaId(), table.GetTableId());
diff --git a/src/storage/ducklake_insert.cpp b/src/storage/ducklake_insert.cpp
index ba3cc8bcf84..e15d40c9ccf 100644
--- a/src/storage/ducklake_insert.cpp
+++ b/src/storage/ducklake_insert.cpp
@@ -514,8 +514,7 @@ DuckLakeCopyOptions DuckLakeInsert::GetCopyOptions(ClientContext &context, DuckL
 	if (catalog.TryGetConfigOption("per_thread_output", per_thread_output_str, schema_id, table_id)) {
 		per_thread_output = per_thread_output_str == "true";
 	}
-	idx_t target_file_size = catalog.GetConfigOption("target_file_size", schema_id, table_id,
-	                                                 DuckLakeCatalog::DEFAULT_TARGET_FILE_SIZE);
+	idx_t target_file_size = catalog.GetTargetFileSize(context, schema_id, table_id);
 
 	// Always use native parquet geometry for writing
 	info->options["geoparquet_version"].emplace_back("NONE");
diff --git a/test/sql/insert/insert_session_target_file_size.test b/test/sql/insert/insert_session_target_file_size.test
new file mode 100644
index 00000000000..5ae6aab490f
--- /dev/null
+++ b/test/sql/insert/insert_session_target_file_size.test
@@ -0,0 +1,54 @@
+# name: test/sql/insert/insert_session_target_file_size.test
+# description: test that ducklake_target_file_size session setting overrides metadata target_file_size
+# group: [insert]
+
+require ducklake
+
+require parquet
+
+test-env DUCKLAKE_CONNECTION {TEST_DIR}/{UUID}.db
+
+test-env DATA_PATH {TEST_DIR}
+
+
+statement ok
+ATTACH 'ducklake:{DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '{DATA_PATH}/ducklake_session_target_file_size')
+
+statement ok
+CREATE TABLE ducklake.test(id INTEGER, s VARCHAR);
+
+# Set a large target_file_size in metadata so everything fits in one file
+statement ok
+CALL ducklake.set_option('target_file_size', '512MB')
+
+# Override with a tiny session setting - should produce multiple files
+statement ok
+SET ducklake_target_file_size = '10KB'
+
+query I
+INSERT INTO ducklake.test SELECT i, concat('thisisalongstring', i) FROM range(500000) t(i)
+----
+500000
+
+# Session setting should have taken precedence: multiple files expected
+query I
+SELECT COUNT(*) > 1 FROM glob('{DATA_PATH}/ducklake_session_target_file_size/main/test/*.parquet')
+----
+true
+
+# Reset the session setting - metadata value (512MB) should now govern: everything in one file
+statement ok
+RESET ducklake_target_file_size
+
+statement ok
+CREATE TABLE ducklake.test2(id INTEGER, s VARCHAR);
+
+query I
+INSERT INTO ducklake.test2 SELECT i, concat('thisisalongstring', i) FROM range(500000) t(i)
+----
+500000
+
+query I
+SELECT COUNT(*) = 1 FROM glob('{DATA_PATH}/ducklake_session_target_file_size/main/test2/*.parquet')
+----
+true
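
Resolution order introduced by this change: a non-empty session-level ducklake_target_file_size wins (parsed via DBConfig::ParseMemoryLimit, so human-readable sizes such as '10KB' or '512MB' are accepted), otherwise the catalog-level target_file_size option applies, falling back to DuckLakeCatalog::DEFAULT_TARGET_FILE_SIZE. A minimal SQL usage sketch, assuming a DuckLake catalog attached as "ducklake" (names and sizes mirror the test above):

-- catalog-level default, persisted in DuckLake metadata
CALL ducklake.set_option('target_file_size', '512MB');

-- session-level override; takes precedence over the metadata value while set
SET ducklake_target_file_size = '10KB';

-- clearing the session setting falls back to the 512MB metadata value
RESET ducklake_target_file_size;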