Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 33 additions & 6 deletions src/common/ducklake_util.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "common/ducklake_util.hpp"
#include "duckdb/common/string_util.hpp"
#include "duckdb/common/path.hpp"
#include "duckdb/common/sql_identifier.hpp"
#include "duckdb/parser/keyword_helper.hpp"
#include "duckdb/parser/parser.hpp"
Expand Down Expand Up @@ -312,13 +313,39 @@ void DuckLakeUtil::EnsureDirectoryExists(FileSystem &fs, const string &data_path
}
}

string DuckLakeUtil::JoinPath(FileSystem &fs, const string &a, const string &b) {
auto sep = fs.PathSeparator(a);
if (StringUtil::EndsWith(a, sep)) {
return a + b;
} else {
return a + sep + b;
string DuckLakeUtil::LocalOrRemoteSeparator(FileSystem &fs, const string &path) {
	// Pick the separator appropriate for "path": remote paths (e.g. s3://bucket/...)
	// always use '/' regardless of the host OS, while local paths defer to the
	// file system's native separator.
	const bool is_remote = Path::FromString(path).IsRemote();
	return is_remote ? string("/") : fs.PathSeparator(path);
}

string DuckLakeUtil::JoinPath(FileSystem &fs, const string &a, const string &b) {
	// The separator is derived from the path contents via Path, so the file
	// system argument is intentionally unused (kept for interface stability).
	static_cast<void>(fs);
	// Joining with an empty side yields the normalized other side.
	if (a.empty()) {
		return Path::Normalize(b);
	}
	if (b.empty()) {
		return Path::Normalize(a);
	}
	// A tail that carries its own scheme (s3://...) or drive letter is already
	// absolute and replaces the base entirely.
	auto tail = Path::FromString(b);
	if (tail.HasScheme() || tail.HasDrive()) {
		return tail.ToString();
	}
	// Drop leading slashes/backslashes so the tail always joins as a relative path.
	auto first_non_sep = b.find_first_not_of("/\\");
	auto relative_tail = first_non_sep == string::npos ? string() : b.substr(first_non_sep);
	if (relative_tail.empty()) {
		// The tail consisted solely of separators - nothing to append.
		return Path::Normalize(a);
	}
	return Path::FromString(a).Join(Path::FromString(relative_tail)).ToString();
}

DynamicFilter *DuckLakeUtil::GetOptionalDynamicFilter(const TableFilter &filter) {
Expand Down
17 changes: 13 additions & 4 deletions src/functions/ducklake_flush_inlined_data.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "storage/ducklake_delete.hpp"
#include "storage/ducklake_delete_filter.hpp"
#include "duckdb/common/types/blob.hpp"
#include "common/ducklake_util.hpp"
#include "functions/ducklake_compaction_functions.hpp"
#include "storage/ducklake_sort_data.hpp"

Expand Down Expand Up @@ -420,6 +421,7 @@ static void FlushInlinedFileDeletions(ClientContext &context, DuckLakeCatalog &c
auto &metadata_manager = transaction.GetMetadataManager();
auto table_id = table.GetTableId();
auto snapshot = transaction.GetSnapshot();
auto &fs = FileSystem::GetFileSystem(context);

// Check if this table has an inlined deletion table
auto inlined_table_name = metadata_manager.GetInlinedDeletionTableName(table_id, snapshot);
Expand Down Expand Up @@ -466,7 +468,9 @@ LEFT JOIN (
if (file_info.file_path.empty()) {
auto path = chunk->GetValue(1, row_idx).GetValue<string>();
auto path_is_relative = chunk->GetValue(2, row_idx).GetValue<bool>();
file_info.file_path = path_is_relative ? table.DataPath() + path : path;
file_info.file_path = path_is_relative
? DuckLakeUtil::JoinPath(fs, table.DataPath(), path)
: path;
file_info.max_snapshot = begin_snapshot;

// Check for existing delete file
Expand Down Expand Up @@ -503,7 +507,6 @@ LEFT JOIN (
}

// Write delete files
auto &fs = FileSystem::GetFileSystem(context);
vector<DuckLakeDeleteFile> delete_files;

// Get encryption key if the catalog is encrypted
Expand All @@ -527,7 +530,10 @@ LEFT JOIN (
// Read existing deletions from the delete file
DuckLakeFileData existing_delete_file_data;
existing_delete_file_data.path = file_info.existing_delete_path_is_relative
? table.DataPath() + file_info.existing_delete_path
? DuckLakeUtil::JoinPath(
fs,
table.DataPath(),
file_info.existing_delete_path)
: file_info.existing_delete_path;
existing_delete_file_data.encryption_key = file_info.existing_delete_encryption_key;
existing_delete_file_data.format = file_info.existing_delete_format;
Expand Down Expand Up @@ -562,7 +568,10 @@ LEFT JOIN (
delete_file.overwrites_existing_delete = true;
delete_file.overwritten_delete_file.delete_file_id = file_info.existing_delete_file_id;
delete_file.overwritten_delete_file.path = file_info.existing_delete_path_is_relative
? table.DataPath() + file_info.existing_delete_path
? DuckLakeUtil::JoinPath(
fs,
table.DataPath(),
file_info.existing_delete_path)
: file_info.existing_delete_path;
}

Expand Down
1 change: 1 addition & 0 deletions src/include/common/ducklake_util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class DuckLakeUtil {
static string ValueToSQL(DuckLakeMetadataManager &metadata_manager, ClientContext &context, const Value &val);

static ParsedCatalogEntry ParseCatalogEntry(const string &input);
static string LocalOrRemoteSeparator(FileSystem &fs, const string &path);
static string JoinPath(FileSystem &fs, const string &a, const string &b);

static DynamicFilter *GetOptionalDynamicFilter(const TableFilter &filter);
Expand Down
7 changes: 6 additions & 1 deletion src/storage/ducklake_catalog.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "duckdb/common/operator/cast_operators.hpp"
#include "duckdb/common/types/uuid.hpp"
#include "common/ducklake_util.hpp"
#include "duckdb/common/file_system.hpp"

namespace duckdb {

Expand Down Expand Up @@ -147,7 +148,11 @@ optional_ptr<CatalogEntry> DuckLakeCatalog::CreateSchema(CatalogTransaction tran
//! get a local table-id
auto schema_id = SchemaIndex(duck_transaction.GetLocalCatalogId());
auto schema_uuid = duck_transaction.GenerateUUID();
auto schema_data_path = DataPath() + DuckLakeCatalog::GeneratePathFromName(schema_uuid, info.schema);
auto &fs = FileSystem::GetFileSystem(transaction.GetContext());
auto schema_data_path = DuckLakeUtil::JoinPath(
fs,
DataPath(),
DuckLakeCatalog::GeneratePathFromName(schema_uuid, info.schema));
auto schema_entry =
make_uniq<DuckLakeSchemaEntry>(*this, info, schema_id, std::move(schema_uuid), std::move(schema_data_path));
auto result = schema_entry.get();
Expand Down
3 changes: 2 additions & 1 deletion src/storage/ducklake_initializer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "storage/ducklake_transaction.hpp"
#include "storage/ducklake_schema_entry.hpp"
#include "common/ducklake_version.hpp"
#include "common/ducklake_util.hpp"
#include "metadata_manager/ducklake_metadata_manager_v1_1.hpp"
#include "metadata_manager/sqlite_metadata_manager.hpp"
#include "metadata_manager/postgres_metadata_manager.hpp"
Expand Down Expand Up @@ -130,7 +131,7 @@ void DuckLakeInitializer::InitializeDataPath() {
CheckAndAutoloadedRequiredExtension(data_path);

auto &fs = FileSystem::GetFileSystem(context);
auto separator = fs.PathSeparator(data_path);
auto separator = DuckLakeUtil::LocalOrRemoteSeparator(fs, data_path);
// pop trailing path separators
while (!data_path.empty() && (data_path.back() == '/' || data_path.back() == '\\')) {
data_path.pop_back();
Expand Down
10 changes: 7 additions & 3 deletions src/storage/ducklake_insert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "storage/ducklake_table_entry.hpp"
#include "storage/ducklake_transaction.hpp"
#include "common/ducklake_util.hpp"
#include "duckdb/common/file_system.hpp"
#include "storage/ducklake_scan.hpp"
#include "storage/ducklake_inline_data.hpp"
#include "storage/ducklake_geo_stats.hpp"
Expand Down Expand Up @@ -335,7 +336,7 @@ DuckLakeCopyOptions::DuckLakeCopyOptions(unique_ptr<CopyInfo> info_p, CopyFuncti

DuckLakeCopyInput::DuckLakeCopyInput(ClientContext &context, DuckLakeTableEntry &table, const string &hive_partition)
: catalog(table.ParentCatalog().Cast<DuckLakeCatalog>()), columns(table.GetColumns()),
data_path(table.DataPath() + hive_partition) {
data_path(DuckLakeUtil::JoinPath(FileSystem::GetFileSystem(context), table.DataPath(), hive_partition)) {
partition_data = table.GetPartitionData();
field_data = table.GetFieldData();
schema_id = table.ParentSchema().Cast<DuckLakeSchemaEntry>().GetSchemaId();
Expand Down Expand Up @@ -850,8 +851,11 @@ PhysicalOperator &DuckLakeCatalog::PlanCreateTableAs(ClientContext &context, Phy
DuckLakeTypes::CheckSupportedType(col.Type());
}
auto table_uuid = duck_transaction.GenerateUUID();
auto table_data_path =
duck_schema.DataPath() + DuckLakeCatalog::GeneratePathFromName(table_uuid, create_info.table);
auto &fs = FileSystem::GetFileSystem(context);
auto table_data_path = DuckLakeUtil::JoinPath(
fs,
duck_schema.DataPath(),
DuckLakeCatalog::GeneratePathFromName(table_uuid, create_info.table));

DuckLakeCopyInput copy_input(context, duck_schema, columns, table_data_path);
auto &physical_copy = DuckLakeInsert::PlanCopyForInsert(context, planner, copy_input, root.get());
Expand Down
12 changes: 7 additions & 5 deletions src/storage/ducklake_metadata_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "storage/ducklake_transaction.hpp"
#include "storage/ducklake_variant_stats.hpp"
#include "common/ducklake_util.hpp"
#include "duckdb/common/path.hpp"
#include "duckdb/planner/tableref/bound_at_clause.hpp"
#include "duckdb/common/types/blob.hpp"
#include "duckdb/common/sql_identifier.hpp"
Expand Down Expand Up @@ -3030,12 +3031,12 @@ DuckLakePath DuckLakeMetadataManager::GetRelativePath(const string &path, const

string DuckLakeMetadataManager::GetPathSeparator(const string &path) {
auto &catalog = transaction.GetCatalog();
if (!catalog.DataPath().empty()) {
// use the cached separator from the catalog
auto parsed = Path::FromString(path);
if (!catalog.DataPath().empty() && !parsed.IsAbsolute() && !parsed.HasDrive()) {
// use the cached separator from the catalog for relative paths
return catalog.Separator();
}
// if catalog is not loaded, use the file system
return GetFileSystem().PathSeparator(path);
return DuckLakeUtil::LocalOrRemoteSeparator(GetFileSystem(), path);
}

string DuckLakeMetadataManager::StorePath(string path) {
Expand All @@ -3058,7 +3059,8 @@ string DuckLakeMetadataManager::FromRelativePath(const DuckLakePath &path, const
if (!path.path_is_relative) {
return LoadPath(path.path);
}
return LoadPath(base_path + path.path);
auto &fs = GetFileSystem();
return LoadPath(DuckLakeUtil::JoinPath(fs, base_path, path.path));
}

string DuckLakeMetadataManager::FromRelativePath(const DuckLakePath &path) {
Expand Down
7 changes: 6 additions & 1 deletion src/storage/ducklake_schema_entry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "duckdb/catalog/catalog_entry/table_macro_catalog_entry.hpp"
#include "storage/ducklake_macro_entry.hpp"
#include "common/ducklake_util.hpp"
#include "duckdb/common/file_system.hpp"

namespace duckdb {

Expand Down Expand Up @@ -96,7 +97,11 @@ optional_ptr<CatalogEntry> DuckLakeSchemaEntry::CreateTable(CatalogTransaction t
auto &duck_catalog = catalog.Cast<DuckLakeCatalog>();
auto &base_info = info.Base();
auto table_uuid = duck_transaction.GenerateUUID();
auto table_data_path = DataPath() + duck_catalog.GeneratePathFromName(table_uuid, base_info.table);
auto &fs = FileSystem::GetFileSystem(transaction.GetContext());
auto table_data_path = DuckLakeUtil::JoinPath(
fs,
DataPath(),
duck_catalog.GeneratePathFromName(table_uuid, base_info.table));
return CreateTableExtended(transaction, info, std::move(table_uuid), std::move(table_data_path));
}

Expand Down
87 changes: 87 additions & 0 deletions test/sql/cleanup/cleanup_path_separator.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# name: test/sql/cleanup/cleanup_path_separator.test
# description: cleanup uses normalized paths when relative entries include leading separators
# group: [cleanup]

require ducklake

require parquet

test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db

test-env DATA_PATH __TEST_DIR__


statement ok
ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '${DATA_PATH}/cleanup_path_separator', METADATA_CATALOG 'ducklake_meta')

# Seed the deletion-schedule metadata table with relative paths that carry a
# leading separator, a './' segment, and a '../' segment, plus one absolute
# remote path that must be passed through untouched.
statement ok
INSERT INTO ducklake_meta.ducklake_files_scheduled_for_deletion VALUES (999, '/main/t/lead.parquet', true, NOW());

statement ok
INSERT INTO ducklake_meta.ducklake_files_scheduled_for_deletion VALUES (1000, 'main/./t/dot.parquet', true, NOW());

statement ok
INSERT INTO ducklake_meta.ducklake_files_scheduled_for_deletion VALUES (1001, 'main/../t/parent.parquet', true, NOW());

statement ok
INSERT INTO ducklake_meta.ducklake_files_scheduled_for_deletion VALUES (1002, 's3://other-bucket/abs.parquet', true, NOW());

# Normalize backslashes to '/' (chr(92)) so the assertions below are
# platform-independent.
statement ok
CREATE TEMP TABLE cleanup_paths AS
SELECT replace(path, chr(92), '/') AS path
FROM ducklake_cleanup_old_files('ducklake', dry_run => true, cleanup_all => true);

# All four scheduled entries are reported by the dry run.
query I
SELECT COUNT(*) FROM cleanup_paths;
----
4

# No doubled separators survive joining (scheme '://' excluded from the check).
query I
SELECT COUNT(*) FROM cleanup_paths
WHERE path NOT LIKE '%://%'
AND ltrim(path, '/') LIKE '%//%';
----
0

# No './' or '../' segments survive normalization.
query I
SELECT COUNT(*) FROM cleanup_paths
WHERE replace(path, '://', ':/') LIKE '%/./%'
OR replace(path, '://', ':/') LIKE '%/../%';
----
0

# The absolute remote path is reported verbatim, not re-based onto DATA_PATH.
query I
SELECT COUNT(*) FROM cleanup_paths
WHERE path = 's3://other-bucket/abs.parquet';
----
1

statement ok
DROP TABLE cleanup_paths;

# Second catalog whose DATA_PATH is the filesystem root: joined paths must stay
# rooted without growing duplicate leading separators.
test-env DUCKLAKE_CONNECTION_ROOT __TEST_DIR__/{UUID}_root.db

statement ok
ATTACH 'ducklake:${DUCKLAKE_CONNECTION_ROOT}' AS ducklake_root (DATA_PATH '/', METADATA_CATALOG 'ducklake_root_meta')

statement ok
INSERT INTO ducklake_root_meta.ducklake_files_scheduled_for_deletion VALUES (2000, 'rooted.parquet', true, NOW());

statement ok
INSERT INTO ducklake_root_meta.ducklake_files_scheduled_for_deletion VALUES (2001, '/rooted2.parquet', true, NOW());

statement ok
CREATE TEMP TABLE cleanup_paths AS
SELECT replace(path, chr(92), '/') AS path
FROM ducklake_cleanup_old_files('ducklake_root', dry_run => true, cleanup_all => true);

query I
SELECT COUNT(*) FROM cleanup_paths;
----
2

# Every path is absolute: either '/'-rooted or a Windows drive like 'C:/'.
query I
SELECT COUNT(*) FROM cleanup_paths
WHERE NOT (path LIKE '/%' OR substr(path, 2, 2) = ':/');
----
0
Loading