Skip to content

Commit f659cbe

Browse files
committed
Normalize ducklake path joins
1 parent 6541477 commit f659cbe

9 files changed

Lines changed: 162 additions & 21 deletions

src/common/ducklake_util.cpp

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include "common/ducklake_util.hpp"
22
#include "duckdb/common/string_util.hpp"
3+
#include "duckdb/common/path.hpp"
34
#include "duckdb/parser/keyword_helper.hpp"
45
#include "duckdb/parser/parser.hpp"
56
#include "duckdb/common/file_system.hpp"
@@ -310,13 +311,39 @@ void DuckLakeUtil::EnsureDirectoryExists(FileSystem &fs, const string &data_path
310311
}
311312
}
312313

313-
string DuckLakeUtil::JoinPath(FileSystem &fs, const string &a, const string &b) {
314-
auto sep = fs.PathSeparator(a);
315-
if (StringUtil::EndsWith(a, sep)) {
316-
return a + b;
317-
} else {
318-
return a + sep + b;
314+
string DuckLakeUtil::LocalOrRemoteSeparator(FileSystem &fs, const string &path) {
315+
auto parsed = Path::FromString(path);
316+
if (parsed.IsRemote()) {
317+
return "/";
319318
}
319+
return fs.PathSeparator(path);
320+
}
321+
322+
string DuckLakeUtil::JoinPath(FileSystem &fs, const string &a, const string &b) {
323+
static_cast<void>(fs);
324+
if (a.empty()) {
325+
return Path::Normalize(b);
326+
}
327+
if (b.empty()) {
328+
return Path::Normalize(a);
329+
}
330+
auto full_tail = Path::FromString(b);
331+
if (full_tail.HasScheme() || full_tail.HasDrive()) {
332+
return full_tail.ToString();
333+
}
334+
auto trimmed_tail = b;
335+
auto trim_pos = trimmed_tail.find_first_not_of("/\\");
336+
if (trim_pos == string::npos) {
337+
trimmed_tail.clear();
338+
} else if (trim_pos > 0) {
339+
trimmed_tail.erase(0, trim_pos);
340+
}
341+
if (trimmed_tail.empty()) {
342+
return Path::Normalize(a);
343+
}
344+
auto base = Path::FromString(a);
345+
auto rel = Path::FromString(trimmed_tail);
346+
return base.Join(rel).ToString();
320347
}
321348

322349
DynamicFilter *DuckLakeUtil::GetOptionalDynamicFilter(const TableFilter &filter) {

src/functions/ducklake_flush_inlined_data.cpp

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include "storage/ducklake_delete.hpp"
2727
#include "storage/ducklake_delete_filter.hpp"
2828
#include "duckdb/common/types/blob.hpp"
29+
#include "common/ducklake_util.hpp"
2930
#include "functions/ducklake_compaction_functions.hpp"
3031
#include "storage/ducklake_sort_data.hpp"
3132

@@ -420,6 +421,7 @@ static void FlushInlinedFileDeletions(ClientContext &context, DuckLakeCatalog &c
420421
auto &metadata_manager = transaction.GetMetadataManager();
421422
auto table_id = table.GetTableId();
422423
auto snapshot = transaction.GetSnapshot();
424+
auto &fs = FileSystem::GetFileSystem(context);
423425

424426
// Check if this table has an inlined deletion table
425427
auto inlined_table_name = metadata_manager.GetInlinedDeletionTableName(table_id, snapshot);
@@ -466,7 +468,9 @@ LEFT JOIN (
466468
if (file_info.file_path.empty()) {
467469
auto path = chunk->GetValue(1, row_idx).GetValue<string>();
468470
auto path_is_relative = chunk->GetValue(2, row_idx).GetValue<bool>();
469-
file_info.file_path = path_is_relative ? table.DataPath() + path : path;
471+
file_info.file_path = path_is_relative
472+
? DuckLakeUtil::JoinPath(fs, table.DataPath(), path)
473+
: path;
470474
file_info.max_snapshot = begin_snapshot;
471475

472476
// Check for existing delete file
@@ -503,7 +507,6 @@ LEFT JOIN (
503507
}
504508

505509
// Write delete files
506-
auto &fs = FileSystem::GetFileSystem(context);
507510
vector<DuckLakeDeleteFile> delete_files;
508511

509512
// Get encryption key if the catalog is encrypted
@@ -527,7 +530,10 @@ LEFT JOIN (
527530
// Read existing deletions from the delete file
528531
DuckLakeFileData existing_delete_file_data;
529532
existing_delete_file_data.path = file_info.existing_delete_path_is_relative
530-
? table.DataPath() + file_info.existing_delete_path
533+
? DuckLakeUtil::JoinPath(
534+
fs,
535+
table.DataPath(),
536+
file_info.existing_delete_path)
531537
: file_info.existing_delete_path;
532538
existing_delete_file_data.encryption_key = file_info.existing_delete_encryption_key;
533539
existing_delete_file_data.format = file_info.existing_delete_format;
@@ -562,7 +568,10 @@ LEFT JOIN (
562568
delete_file.overwrites_existing_delete = true;
563569
delete_file.overwritten_delete_file.delete_file_id = file_info.existing_delete_file_id;
564570
delete_file.overwritten_delete_file.path = file_info.existing_delete_path_is_relative
565-
? table.DataPath() + file_info.existing_delete_path
571+
? DuckLakeUtil::JoinPath(
572+
fs,
573+
table.DataPath(),
574+
file_info.existing_delete_path)
566575
: file_info.existing_delete_path;
567576
}
568577

src/include/common/ducklake_util.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ class DuckLakeUtil {
3434
static string ValueToSQL(DuckLakeMetadataManager &metadata_manager, ClientContext &context, const Value &val);
3535

3636
static ParsedCatalogEntry ParseCatalogEntry(const string &input);
37+
static string LocalOrRemoteSeparator(FileSystem &fs, const string &path);
3738
static string JoinPath(FileSystem &fs, const string &a, const string &b);
3839

3940
static DynamicFilter *GetOptionalDynamicFilter(const TableFilter &filter);

src/storage/ducklake_catalog.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "duckdb/common/operator/cast_operators.hpp"
3030
#include "duckdb/common/types/uuid.hpp"
3131
#include "common/ducklake_util.hpp"
32+
#include "duckdb/common/file_system.hpp"
3233

3334
namespace duckdb {
3435

@@ -147,7 +148,11 @@ optional_ptr<CatalogEntry> DuckLakeCatalog::CreateSchema(CatalogTransaction tran
147148
//! get a local table-id
148149
auto schema_id = SchemaIndex(duck_transaction.GetLocalCatalogId());
149150
auto schema_uuid = duck_transaction.GenerateUUID();
150-
auto schema_data_path = DataPath() + DuckLakeCatalog::GeneratePathFromName(schema_uuid, info.schema);
151+
auto &fs = FileSystem::GetFileSystem(transaction.GetContext());
152+
auto schema_data_path = DuckLakeUtil::JoinPath(
153+
fs,
154+
DataPath(),
155+
DuckLakeCatalog::GeneratePathFromName(schema_uuid, info.schema));
151156
auto schema_entry =
152157
make_uniq<DuckLakeSchemaEntry>(*this, info, schema_id, std::move(schema_uuid), std::move(schema_data_path));
153158
auto result = schema_entry.get();

src/storage/ducklake_initializer.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include "storage/ducklake_transaction.hpp"
1212
#include "storage/ducklake_schema_entry.hpp"
1313
#include "common/ducklake_version.hpp"
14+
#include "common/ducklake_util.hpp"
1415
#include "metadata_manager/ducklake_metadata_manager_v1_1.hpp"
1516
#include "metadata_manager/sqlite_metadata_manager.hpp"
1617
#include "metadata_manager/postgres_metadata_manager.hpp"
@@ -130,7 +131,7 @@ void DuckLakeInitializer::InitializeDataPath() {
130131
CheckAndAutoloadedRequiredExtension(data_path);
131132

132133
auto &fs = FileSystem::GetFileSystem(context);
133-
auto separator = fs.PathSeparator(data_path);
134+
auto separator = DuckLakeUtil::LocalOrRemoteSeparator(fs, data_path);
134135
// pop trailing path separators
135136
while (!data_path.empty() && (data_path.back() == '/' || data_path.back() == '\\')) {
136137
data_path.pop_back();

src/storage/ducklake_insert.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include "storage/ducklake_table_entry.hpp"
66
#include "storage/ducklake_transaction.hpp"
77
#include "common/ducklake_util.hpp"
8+
#include "duckdb/common/file_system.hpp"
89
#include "storage/ducklake_scan.hpp"
910
#include "storage/ducklake_inline_data.hpp"
1011
#include "storage/ducklake_geo_stats.hpp"
@@ -335,7 +336,7 @@ DuckLakeCopyOptions::DuckLakeCopyOptions(unique_ptr<CopyInfo> info_p, CopyFuncti
335336

336337
DuckLakeCopyInput::DuckLakeCopyInput(ClientContext &context, DuckLakeTableEntry &table, const string &hive_partition)
337338
: catalog(table.ParentCatalog().Cast<DuckLakeCatalog>()), columns(table.GetColumns()),
338-
data_path(table.DataPath() + hive_partition) {
339+
data_path(DuckLakeUtil::JoinPath(FileSystem::GetFileSystem(context), table.DataPath(), hive_partition)) {
339340
partition_data = table.GetPartitionData();
340341
field_data = table.GetFieldData();
341342
schema_id = table.ParentSchema().Cast<DuckLakeSchemaEntry>().GetSchemaId();
@@ -850,8 +851,11 @@ PhysicalOperator &DuckLakeCatalog::PlanCreateTableAs(ClientContext &context, Phy
850851
DuckLakeTypes::CheckSupportedType(col.Type());
851852
}
852853
auto table_uuid = duck_transaction.GenerateUUID();
853-
auto table_data_path =
854-
duck_schema.DataPath() + DuckLakeCatalog::GeneratePathFromName(table_uuid, create_info.table);
854+
auto &fs = FileSystem::GetFileSystem(context);
855+
auto table_data_path = DuckLakeUtil::JoinPath(
856+
fs,
857+
duck_schema.DataPath(),
858+
DuckLakeCatalog::GeneratePathFromName(table_uuid, create_info.table));
855859

856860
DuckLakeCopyInput copy_input(context, duck_schema, columns, table_data_path);
857861
auto &physical_copy = DuckLakeInsert::PlanCopyForInsert(context, planner, copy_input, root.get());

src/storage/ducklake_metadata_manager.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include "storage/ducklake_transaction.hpp"
33
#include "storage/ducklake_variant_stats.hpp"
44
#include "common/ducklake_util.hpp"
5+
#include "duckdb/common/path.hpp"
56
#include "duckdb/planner/tableref/bound_at_clause.hpp"
67
#include "duckdb/common/types/blob.hpp"
78
#include "duckdb/common/type_visitor.hpp"
@@ -3024,12 +3025,12 @@ DuckLakePath DuckLakeMetadataManager::GetRelativePath(const string &path, const
30243025

30253026
string DuckLakeMetadataManager::GetPathSeparator(const string &path) {
30263027
auto &catalog = transaction.GetCatalog();
3027-
if (!catalog.DataPath().empty()) {
3028-
// use the cached separator from the catalog
3028+
auto parsed = Path::FromString(path);
3029+
if (!catalog.DataPath().empty() && !parsed.IsAbsolute() && !parsed.HasDrive()) {
3030+
// use the cached separator from the catalog for relative paths
30293031
return catalog.Separator();
30303032
}
3031-
// if catalog is not loaded, use the file system
3032-
return GetFileSystem().PathSeparator(path);
3033+
return DuckLakeUtil::LocalOrRemoteSeparator(GetFileSystem(), path);
30333034
}
30343035

30353036
string DuckLakeMetadataManager::StorePath(string path) {
@@ -3052,7 +3053,8 @@ string DuckLakeMetadataManager::FromRelativePath(const DuckLakePath &path, const
30523053
if (!path.path_is_relative) {
30533054
return LoadPath(path.path);
30543055
}
3055-
return LoadPath(base_path + path.path);
3056+
auto &fs = GetFileSystem();
3057+
return LoadPath(DuckLakeUtil::JoinPath(fs, base_path, path.path));
30563058
}
30573059

30583060
string DuckLakeMetadataManager::FromRelativePath(const DuckLakePath &path) {

src/storage/ducklake_schema_entry.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "duckdb/catalog/catalog_entry/table_macro_catalog_entry.hpp"
1515
#include "storage/ducklake_macro_entry.hpp"
1616
#include "common/ducklake_util.hpp"
17+
#include "duckdb/common/file_system.hpp"
1718

1819
namespace duckdb {
1920

@@ -96,7 +97,11 @@ optional_ptr<CatalogEntry> DuckLakeSchemaEntry::CreateTable(CatalogTransaction t
9697
auto &duck_catalog = catalog.Cast<DuckLakeCatalog>();
9798
auto &base_info = info.Base();
9899
auto table_uuid = duck_transaction.GenerateUUID();
99-
auto table_data_path = DataPath() + duck_catalog.GeneratePathFromName(table_uuid, base_info.table);
100+
auto &fs = FileSystem::GetFileSystem(transaction.GetContext());
101+
auto table_data_path = DuckLakeUtil::JoinPath(
102+
fs,
103+
DataPath(),
104+
duck_catalog.GeneratePathFromName(table_uuid, base_info.table));
100105
return CreateTableExtended(transaction, info, std::move(table_uuid), std::move(table_data_path));
101106
}
102107

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# name: test/sql/cleanup/cleanup_path_separator.test
2+
# description: cleanup uses normalized paths when relative entries include leading separators
3+
# group: [cleanup]
4+
5+
require ducklake
6+
7+
require parquet
8+
9+
test-env DUCKLAKE_CONNECTION __TEST_DIR__/{UUID}.db
10+
11+
test-env DATA_PATH __TEST_DIR__
12+
13+
14+
statement ok
15+
ATTACH 'ducklake:${DUCKLAKE_CONNECTION}' AS ducklake (DATA_PATH '${DATA_PATH}/cleanup_path_separator', METADATA_CATALOG 'ducklake_meta')
16+
17+
statement ok
18+
INSERT INTO ducklake_meta.ducklake_files_scheduled_for_deletion VALUES (999, '/main/t/lead.parquet', true, NOW());
19+
20+
statement ok
21+
INSERT INTO ducklake_meta.ducklake_files_scheduled_for_deletion VALUES (1000, 'main/./t/dot.parquet', true, NOW());
22+
23+
statement ok
24+
INSERT INTO ducklake_meta.ducklake_files_scheduled_for_deletion VALUES (1001, 'main/../t/parent.parquet', true, NOW());
25+
26+
statement ok
27+
INSERT INTO ducklake_meta.ducklake_files_scheduled_for_deletion VALUES (1002, 's3://other-bucket/abs.parquet', true, NOW());
28+
29+
statement ok
30+
CREATE TEMP TABLE cleanup_paths AS
31+
SELECT replace(path, chr(92), '/') AS path
32+
FROM ducklake_cleanup_old_files('ducklake', dry_run => true, cleanup_all => true);
33+
34+
query I
35+
SELECT COUNT(*) FROM cleanup_paths;
36+
----
37+
4
38+
39+
query I
40+
SELECT COUNT(*) FROM cleanup_paths
41+
WHERE path NOT LIKE '%://%'
42+
AND ltrim(path, '/') LIKE '%//%';
43+
----
44+
0
45+
46+
query I
47+
SELECT COUNT(*) FROM cleanup_paths
48+
WHERE replace(path, '://', ':/') LIKE '%/./%'
49+
OR replace(path, '://', ':/') LIKE '%/../%';
50+
----
51+
0
52+
53+
query I
54+
SELECT COUNT(*) FROM cleanup_paths
55+
WHERE path = 's3://other-bucket/abs.parquet';
56+
----
57+
1
58+
59+
statement ok
60+
DROP TABLE cleanup_paths;
61+
62+
test-env DUCKLAKE_CONNECTION_ROOT __TEST_DIR__/{UUID}_root.db
63+
64+
statement ok
65+
ATTACH 'ducklake:${DUCKLAKE_CONNECTION_ROOT}' AS ducklake_root (DATA_PATH '/', METADATA_CATALOG 'ducklake_root_meta')
66+
67+
statement ok
68+
INSERT INTO ducklake_root_meta.ducklake_files_scheduled_for_deletion VALUES (2000, 'rooted.parquet', true, NOW());
69+
70+
statement ok
71+
INSERT INTO ducklake_root_meta.ducklake_files_scheduled_for_deletion VALUES (2001, '/rooted2.parquet', true, NOW());
72+
73+
statement ok
74+
CREATE TEMP TABLE cleanup_paths AS
75+
SELECT replace(path, chr(92), '/') AS path
76+
FROM ducklake_cleanup_old_files('ducklake_root', dry_run => true, cleanup_all => true);
77+
78+
query I
79+
SELECT COUNT(*) FROM cleanup_paths;
80+
----
81+
2
82+
83+
query I
84+
SELECT COUNT(*) FROM cleanup_paths
85+
WHERE NOT (path LIKE '/%' OR substr(path, 2, 2) = ':/');
86+
----
87+
0

0 commit comments

Comments
 (0)