Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/ducklake_extension.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ static void LoadInternal(ExtensionLoader &loader) {
config.AddExtensionOption("ducklake_default_data_inlining_row_limit",
"Default row limit for data inlining (0 disables inlining)", LogicalType::UBIGINT,
Value::UBIGINT(10), nullptr, SetScope::GLOBAL);
config.AddExtensionOption("ducklake_target_file_size", "Target file size for insertion and compaction",
LogicalType::VARCHAR, Value(), nullptr, SetScope::GLOBAL);
config.AddExtensionOption(
"ducklake_write_deletion_vectors",
"[EXPERIMENTAL] Write Iceberg V3 deletion vectors (puffin) instead of positional delete files (parquet)",
Expand Down
6 changes: 1 addition & 5 deletions src/functions/ducklake_compaction_functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -259,11 +259,7 @@ void DuckLakeCompactor::GenerateCompactions(DuckLakeTableEntry &table,
auto &metadata_manager = transaction.GetMetadataManager();
auto snapshot = transaction.GetSnapshot();

idx_t target_file_size = DuckLakeCatalog::DEFAULT_TARGET_FILE_SIZE;
string target_file_size_str;
if (catalog.TryGetConfigOption("target_file_size", target_file_size_str, table)) {
target_file_size = Value(target_file_size_str).DefaultCastAs(LogicalType::UBIGINT).GetValue<idx_t>();
}
idx_t target_file_size = catalog.GetTargetFileSize(context, table);

DuckLakeFileSizeOptions filter_options;
filter_options.min_file_size = options.min_file_size;
Expand Down
2 changes: 2 additions & 0 deletions src/include/storage/ducklake_catalog.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ class DuckLakeCatalog : public Catalog {
idx_t DataInliningRowLimit(ClientContext &context, SchemaIndex schema_index, TableIndex table_index) const;
//! Returns the inlining limit (0 if the table is not eligible)
idx_t GetInliningLimit(ClientContext &context, DuckLakeTableEntry &table);
idx_t GetTargetFileSize(ClientContext &context, SchemaIndex schema_id, TableIndex table_id) const;
idx_t GetTargetFileSize(ClientContext &context, DuckLakeTableEntry &table) const;
string &Separator() {
return separator;
}
Expand Down
15 changes: 15 additions & 0 deletions src/storage/ducklake_catalog.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "duckdb/catalog/catalog_entry/schema_catalog_entry.hpp"
#include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp"
#include "duckdb/main/attached_database.hpp"
#include "duckdb/main/config.hpp"
#include "duckdb/parser/constraints/not_null_constraint.hpp"
#include "duckdb/parser/parsed_data/create_schema_info.hpp"
#include "duckdb/parser/parsed_data/create_table_info.hpp"
Expand Down Expand Up @@ -901,6 +902,20 @@ idx_t DuckLakeCatalog::DataInliningRowLimit(ClientContext &context, SchemaIndex
return 10;
}

//! Resolve the effective target file size for writes/compaction.
//! Precedence: the session-level `ducklake_target_file_size` setting (if set to a
//! non-empty value) overrides the metadata-level `target_file_size` option, which in
//! turn falls back to DEFAULT_TARGET_FILE_SIZE when unset at any scope.
idx_t DuckLakeCatalog::GetTargetFileSize(ClientContext &context, SchemaIndex schema_id, TableIndex table_id) const {
	Value session_setting;
	bool has_session_setting = context.TryGetCurrentSetting("ducklake_target_file_size", session_setting);
	if (has_session_setting && !session_setting.IsNull()) {
		auto size_text = session_setting.ToString();
		if (!size_text.empty()) {
			// Parse human-readable sizes such as '10KB' / '512MB' into a byte count.
			return DBConfig::ParseMemoryLimit(size_text);
		}
	}
	// No session override - consult the per-table / per-schema / global metadata option.
	return GetConfigOption<idx_t>("target_file_size", schema_id, table_id, DEFAULT_TARGET_FILE_SIZE);
}

//! Convenience overload: resolve the target file size for a concrete table entry by
//! looking up its schema/table identifiers and delegating to the id-based overload.
idx_t DuckLakeCatalog::GetTargetFileSize(ClientContext &context, DuckLakeTableEntry &table) const {
	auto &parent_schema = table.ParentSchema().Cast<DuckLakeSchemaEntry>();
	auto schema_id = parent_schema.GetSchemaId();
	auto table_id = table.GetTableId();
	return GetTargetFileSize(context, schema_id, table_id);
}

idx_t DuckLakeCatalog::GetInliningLimit(ClientContext &context, DuckLakeTableEntry &table) {
auto &schema = table.ParentSchema().Cast<DuckLakeSchemaEntry>();
idx_t limit = DataInliningRowLimit(context, schema.GetSchemaId(), table.GetTableId());
Expand Down
3 changes: 1 addition & 2 deletions src/storage/ducklake_insert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -514,8 +514,7 @@ DuckLakeCopyOptions DuckLakeInsert::GetCopyOptions(ClientContext &context, DuckL
if (catalog.TryGetConfigOption("per_thread_output", per_thread_output_str, schema_id, table_id)) {
per_thread_output = per_thread_output_str == "true";
}
idx_t target_file_size = catalog.GetConfigOption<idx_t>("target_file_size", schema_id, table_id,
DuckLakeCatalog::DEFAULT_TARGET_FILE_SIZE);
idx_t target_file_size = catalog.GetTargetFileSize(context, schema_id, table_id);

// Always use native parquet geometry for writing
info->options["geoparquet_version"].emplace_back("NONE");
Expand Down
54 changes: 54 additions & 0 deletions test/sql/insert/insert_session_target_file_size.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# name: test/sql/insert/insert_session_target_file_size.test
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a test that verifies a borked string fails at set?

e.g., 'not-a-size'

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you also cover compaction?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you also validate this option is over other scopes? (e.g., per-schema, per-table)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what happens if we set to '0'?

# description: test that ducklake_target_file_size session setting overrides metadata target_file_size
# group: [insert]

require ducklake

require parquet

test-env DUCKLAKE_CONNECTION {TEST_DIR}/{UUID}.db

test-env DATA_PATH {TEST_DIR}


# Attach a DuckLake catalog backed by a fresh metadata database and data path.
statement ok
ATTACH 'ducklake:{DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '{DATA_PATH}/ducklake_session_target_file_size')

statement ok
CREATE TABLE ducklake.test(id INTEGER, s VARCHAR);

# Set a large target_file_size in metadata so everything fits in one file
statement ok
CALL ducklake.set_option('target_file_size', '512MB')

# Override with a tiny session setting - should produce multiple files
statement ok
SET ducklake_target_file_size = '10KB'

query I
INSERT INTO ducklake.test SELECT i, concat('thisisalongstring', i) FROM range(500000) t(i)
----
500000

# Session setting should have taken precedence: multiple files expected
query I
SELECT COUNT(*) > 1 FROM glob('{DATA_PATH}/ducklake_session_target_file_size/main/test/*.parquet')
----
true

# Reset the session setting - metadata value (512MB) should now govern: everything in one file
statement ok
RESET ducklake_target_file_size
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a test where you set ducklake_target_file_size after reset?


# After RESET, the metadata-level target_file_size (512MB) governs again: the same
# volume of data should now land in a single parquet file for a fresh table.
statement ok
CREATE TABLE ducklake.test2(id INTEGER, s VARCHAR);

query I
INSERT INTO ducklake.test2 SELECT i, concat('thisisalongstring', i) FROM range(500000) t(i)
----
500000

# Exactly one file expected since the 512MB metadata target is not exceeded.
query I
SELECT COUNT(*) = 1 FROM glob('{DATA_PATH}/ducklake_session_target_file_size/main/test2/*.parquet')
----
true
Loading