Skip to content

Commit 75f880d

Browse files
authored
Merge pull request duckdb#63 from teaguesterling/issues/29
Addresses duckdb#29: Support missing version-hint.txt and provide additional options
2 parents 10e0862 + 201be6b commit 75f880d

File tree

8 files changed

+171
-39
lines changed

8 files changed

+171
-39
lines changed

src/common/iceberg.cpp

Lines changed: 37 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -167,30 +167,51 @@ IcebergSnapshot IcebergSnapshot::GetSnapshotByTimestamp(const string &path, File
167167
return ParseSnapShot(snapshot, info->iceberg_version, info->schema_id, info->schemas, metadata_compression_codec, skip_schema_inference);
168168
}
169169

170-
// Function to generate a metadata file url
171-
string GenerateMetaDataUrl(FileSystem &fs, const string &meta_path, const string &table_version, const string &metadata_compression_codec) {
172-
if (metadata_compression_codec != "gzip") {
173-
return fs.JoinPath(meta_path, "v" + table_version + ".metadata.json");
170+
// Function to generate a metadata file url from version and format string
171+
// default format is "v%s%s.metadata.json" -> "v00###-xxxxxxxxx.gz.metadata.json" (with gzip)
172+
string GenerateMetaDataUrl(FileSystem &fs, const string &meta_path, string &table_version, string &metadata_compression_codec, string &version_format = DEFAULT_TABLE_VERSION_FORMAT) {
173+
// TODO: Need to URL Encode table_version
174+
string compression_suffix = "";
175+
string url;
176+
if (metadata_compression_codec == "gzip") {
177+
compression_suffix = ".gz";
178+
}
179+
for(auto try_format : StringUtil::Split(version_format, ',')) {
180+
url = fs.JoinPath(meta_path, StringUtil::Format(try_format, table_version, compression_suffix));
181+
if(fs.FileExists(url)) {
182+
return url;
183+
}
174184
}
175-
return fs.JoinPath(meta_path, "v" + table_version + ".gz.metadata.json");
185+
186+
throw IOException(
187+
"Iceberg metadata file not found for table version '%s' using '%s' compression and format(s): '%s'", table_version, metadata_compression_codec, version_format);
176188
}
177189

178-
string IcebergSnapshot::ReadMetaData(const string &path, FileSystem &fs, string metadata_compression_codec) {
179-
string metadata_file_path;
190+
191+
string IcebergSnapshot::GetMetaDataPath(const string &path, FileSystem &fs, string metadata_compression_codec, string table_version = DEFAULT_VERSION_HINT_FILE, string version_format = DEFAULT_TABLE_VERSION_FORMAT) {
180192
if (StringUtil::EndsWith(path, ".json")) {
181-
metadata_file_path = path;
193+
return path;
194+
}
195+
196+
auto meta_path = fs.JoinPath(path, "metadata");
197+
string version_hint;
198+
if(StringUtil::EndsWith(table_version, ".text")||StringUtil::EndsWith(table_version, ".txt")) {
199+
version_hint = GetTableVersion(meta_path, fs, table_version);
182200
} else {
183-
auto table_version = GetTableVersion(path, fs);
184-
auto meta_path = fs.JoinPath(path, "metadata");
185-
metadata_file_path = GenerateMetaDataUrl(fs, meta_path, table_version, metadata_compression_codec);
201+
version_hint = table_version;
186202
}
203+
return GenerateMetaDataUrl(fs, meta_path, version_hint, metadata_compression_codec, version_format);
204+
}
205+
187206

207+
string IcebergSnapshot::ReadMetaData(const string &path, FileSystem &fs, string metadata_compression_codec) {
188208
if (metadata_compression_codec == "gzip") {
189-
return IcebergUtils::GzFileToString(metadata_file_path, fs);
209+
return IcebergUtils::GzFileToString(path, fs);
190210
}
191-
return IcebergUtils::FileToString(metadata_file_path, fs);
211+
return IcebergUtils::FileToString(path, fs);
192212
}
193213

214+
194215
IcebergSnapshot IcebergSnapshot::ParseSnapShot(yyjson_val *snapshot, idx_t iceberg_format_version, idx_t schema_id,
195216
vector<yyjson_val *> &schemas, string metadata_compression_codec,
196217
bool skip_schema_inference) {
@@ -217,9 +238,8 @@ IcebergSnapshot IcebergSnapshot::ParseSnapShot(yyjson_val *snapshot, idx_t icebe
217238
return ret;
218239
}
219240

220-
string IcebergSnapshot::GetTableVersion(const string &path, FileSystem &fs) {
221-
auto meta_path = fs.JoinPath(path, "metadata");
222-
auto version_file_path = fs.JoinPath(meta_path, "version-hint.text");
241+
string IcebergSnapshot::GetTableVersion(const string &meta_path, FileSystem &fs, string version_file = DEFAULT_VERSION_HINT_FILE) {
242+
auto version_file_path = fs.JoinPath(meta_path, version_file);
223243
auto version_file_content = IcebergUtils::FileToString(version_file_path, fs);
224244

225245
try {
@@ -288,4 +308,4 @@ yyjson_val *IcebergSnapshot::IcebergSnapshot::FindSnapshotByIdTimestampInternal(
288308
return max_snapshot;
289309
}
290310

291-
} // namespace duckdb
311+
} // namespace duckdb

src/iceberg_functions/iceberg_metadata.cpp

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ static unique_ptr<FunctionData> IcebergMetaDataBind(ClientContext &context, Tabl
5757
bool allow_moved_paths = false;
5858
string metadata_compression_codec = "none";
5959
bool skip_schema_inference = false;
60+
string table_version = DEFAULT_VERSION_HINT_FILE;
61+
string version_name_format = DEFAULT_TABLE_VERSION_FORMAT;
6062

6163
for (auto &kv : input.named_parameters) {
6264
auto loption = StringUtil::Lower(kv.first);
@@ -66,20 +68,26 @@ static unique_ptr<FunctionData> IcebergMetaDataBind(ClientContext &context, Tabl
6668
metadata_compression_codec = StringValue::Get(kv.second);
6769
} else if (loption == "skip_schema_inference") {
6870
skip_schema_inference = BooleanValue::Get(kv.second);
71+
} else if (loption == "version") {
72+
table_version = StringValue::Get(kv.second);
73+
} else if (loption == "version_name_format") {
74+
version_name_format = StringValue::Get(kv.second);
6975
}
7076
}
77+
78+
auto iceberg_meta_path = IcebergSnapshot::GetMetaDataPath(iceberg_path, fs, metadata_compression_codec, table_version, version_name_format);
7179
IcebergSnapshot snapshot_to_scan;
7280
if (input.inputs.size() > 1) {
7381
if (input.inputs[1].type() == LogicalType::UBIGINT) {
74-
snapshot_to_scan = IcebergSnapshot::GetSnapshotById(iceberg_path, fs, input.inputs[1].GetValue<uint64_t>(), metadata_compression_codec, skip_schema_inference);
82+
snapshot_to_scan = IcebergSnapshot::GetSnapshotById(iceberg_meta_path, fs, input.inputs[1].GetValue<uint64_t>(), metadata_compression_codec, skip_schema_inference);
7583
} else if (input.inputs[1].type() == LogicalType::TIMESTAMP) {
7684
snapshot_to_scan =
77-
IcebergSnapshot::GetSnapshotByTimestamp(iceberg_path, fs, input.inputs[1].GetValue<timestamp_t>(), metadata_compression_codec, skip_schema_inference);
85+
IcebergSnapshot::GetSnapshotByTimestamp(iceberg_meta_path, fs, input.inputs[1].GetValue<timestamp_t>(), metadata_compression_codec, skip_schema_inference);
7886
} else {
7987
throw InvalidInputException("Unknown argument type in IcebergScanBindReplace.");
8088
}
8189
} else {
82-
snapshot_to_scan = IcebergSnapshot::GetLatestSnapshot(iceberg_path, fs, metadata_compression_codec, skip_schema_inference);
90+
snapshot_to_scan = IcebergSnapshot::GetLatestSnapshot(iceberg_meta_path, fs, metadata_compression_codec, skip_schema_inference);
8391
}
8492

8593
ret->iceberg_table =
@@ -143,23 +151,29 @@ TableFunctionSet IcebergFunctions::GetIcebergMetadataFunction() {
143151

144152
auto fun = TableFunction({LogicalType::VARCHAR}, IcebergMetaDataFunction, IcebergMetaDataBind,
145153
IcebergMetaDataGlobalTableFunctionState::Init);
146-
fun.named_parameters["skip_schema_inference"] = LogicalType::BOOLEAN;
147154
fun.named_parameters["allow_moved_paths"] = LogicalType::BOOLEAN;
155+
fun.named_parameters["skip_schema_inference"] = LogicalType::BOOLEAN;
148156
fun.named_parameters["metadata_compression_codec"] = LogicalType::VARCHAR;
157+
fun.named_parameters["version"] = LogicalType::VARCHAR;
158+
fun.named_parameters["version_name_format"] = LogicalType::VARCHAR;
149159
function_set.AddFunction(fun);
150160

151161
fun = TableFunction({LogicalType::VARCHAR, LogicalType::UBIGINT}, IcebergMetaDataFunction, IcebergMetaDataBind,
152162
IcebergMetaDataGlobalTableFunctionState::Init);
153-
fun.named_parameters["skip_schema_inference"] = LogicalType::BOOLEAN;
154163
fun.named_parameters["allow_moved_paths"] = LogicalType::BOOLEAN;
164+
fun.named_parameters["skip_schema_inference"] = LogicalType::BOOLEAN;
155165
fun.named_parameters["metadata_compression_codec"] = LogicalType::VARCHAR;
166+
fun.named_parameters["version"] = LogicalType::VARCHAR;
167+
fun.named_parameters["version_name_format"] = LogicalType::VARCHAR;
156168
function_set.AddFunction(fun);
157169

158170
fun = TableFunction({LogicalType::VARCHAR, LogicalType::TIMESTAMP}, IcebergMetaDataFunction, IcebergMetaDataBind,
159171
IcebergMetaDataGlobalTableFunctionState::Init);
160-
fun.named_parameters["skip_schema_inference"] = LogicalType::BOOLEAN;
161172
fun.named_parameters["allow_moved_paths"] = LogicalType::BOOLEAN;
173+
fun.named_parameters["skip_schema_inference"] = LogicalType::BOOLEAN;
162174
fun.named_parameters["metadata_compression_codec"] = LogicalType::VARCHAR;
175+
fun.named_parameters["version"] = LogicalType::VARCHAR;
176+
fun.named_parameters["version_name_format"] = LogicalType::VARCHAR;
163177
function_set.AddFunction(fun);
164178

165179
return function_set;

src/iceberg_functions/iceberg_scan.cpp

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,8 @@ static unique_ptr<TableRef> IcebergScanBindReplace(ClientContext &context, Table
214214
bool skip_schema_inference = false;
215215
string mode = "default";
216216
string metadata_compression_codec = "none";
217+
string table_version = DEFAULT_VERSION_HINT_FILE;
218+
string version_name_format = DEFAULT_TABLE_VERSION_FORMAT;
217219

218220
for (auto &kv : input.named_parameters) {
219221
auto loption = StringUtil::Lower(kv.first);
@@ -229,20 +231,25 @@ static unique_ptr<TableRef> IcebergScanBindReplace(ClientContext &context, Table
229231
metadata_compression_codec = StringValue::Get(kv.second);
230232
} else if (loption == "skip_schema_inference") {
231233
skip_schema_inference = BooleanValue::Get(kv.second);
234+
} else if (loption == "version") {
235+
table_version = StringValue::Get(kv.second);
236+
} else if (loption == "version_name_format") {
237+
version_name_format = StringValue::Get(kv.second);
232238
}
233239
}
240+
auto iceberg_meta_path = IcebergSnapshot::GetMetaDataPath(iceberg_path, fs, metadata_compression_codec, table_version, version_name_format);
234241
IcebergSnapshot snapshot_to_scan;
235242
if (input.inputs.size() > 1) {
236243
if (input.inputs[1].type() == LogicalType::UBIGINT) {
237-
snapshot_to_scan = IcebergSnapshot::GetSnapshotById(iceberg_path, fs, input.inputs[1].GetValue<uint64_t>(), metadata_compression_codec, skip_schema_inference);
244+
snapshot_to_scan = IcebergSnapshot::GetSnapshotById(iceberg_meta_path, fs, input.inputs[1].GetValue<uint64_t>(), metadata_compression_codec, skip_schema_inference);
238245
} else if (input.inputs[1].type() == LogicalType::TIMESTAMP) {
239246
snapshot_to_scan =
240-
IcebergSnapshot::GetSnapshotByTimestamp(iceberg_path, fs, input.inputs[1].GetValue<timestamp_t>(), metadata_compression_codec, skip_schema_inference);
247+
IcebergSnapshot::GetSnapshotByTimestamp(iceberg_meta_path, fs, input.inputs[1].GetValue<timestamp_t>(), metadata_compression_codec, skip_schema_inference);
241248
} else {
242249
throw InvalidInputException("Unknown argument type in IcebergScanBindReplace.");
243250
}
244251
} else {
245-
snapshot_to_scan = IcebergSnapshot::GetLatestSnapshot(iceberg_path, fs, metadata_compression_codec, skip_schema_inference);
252+
snapshot_to_scan = IcebergSnapshot::GetLatestSnapshot(iceberg_meta_path, fs, metadata_compression_codec, skip_schema_inference);
246253
}
247254

248255
IcebergTable iceberg_table = IcebergTable::Load(iceberg_path, snapshot_to_scan, fs, allow_moved_paths, metadata_compression_codec);
@@ -277,6 +284,8 @@ TableFunctionSet IcebergFunctions::GetIcebergScanFunction() {
277284
fun.named_parameters["allow_moved_paths"] = LogicalType::BOOLEAN;
278285
fun.named_parameters["mode"] = LogicalType::VARCHAR;
279286
fun.named_parameters["metadata_compression_codec"] = LogicalType::VARCHAR;
287+
fun.named_parameters["version"] = LogicalType::VARCHAR;
288+
fun.named_parameters["version_name_format"] = LogicalType::VARCHAR;
280289
function_set.AddFunction(fun);
281290

282291
fun = TableFunction({LogicalType::VARCHAR, LogicalType::UBIGINT}, nullptr, nullptr,
@@ -286,6 +295,8 @@ TableFunctionSet IcebergFunctions::GetIcebergScanFunction() {
286295
fun.named_parameters["allow_moved_paths"] = LogicalType::BOOLEAN;
287296
fun.named_parameters["mode"] = LogicalType::VARCHAR;
288297
fun.named_parameters["metadata_compression_codec"] = LogicalType::VARCHAR;
298+
fun.named_parameters["version"] = LogicalType::VARCHAR;
299+
fun.named_parameters["version_name_format"] = LogicalType::VARCHAR;
289300
function_set.AddFunction(fun);
290301

291302
fun = TableFunction({LogicalType::VARCHAR, LogicalType::TIMESTAMP}, nullptr, nullptr,
@@ -295,6 +306,8 @@ TableFunctionSet IcebergFunctions::GetIcebergScanFunction() {
295306
fun.named_parameters["allow_moved_paths"] = LogicalType::BOOLEAN;
296307
fun.named_parameters["mode"] = LogicalType::VARCHAR;
297308
fun.named_parameters["metadata_compression_codec"] = LogicalType::VARCHAR;
309+
fun.named_parameters["version"] = LogicalType::VARCHAR;
310+
fun.named_parameters["version_name_format"] = LogicalType::VARCHAR;
298311
function_set.AddFunction(fun);
299312

300313
return function_set;

src/iceberg_functions/iceberg_snapshots.cpp

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ struct IcebergSnaphotsBindData : public TableFunctionData {
1313
IcebergSnaphotsBindData() {};
1414
string filename;
1515
string metadata_compression_codec;
16+
string table_version;
17+
string version_name_format;
1618
bool skip_schema_inference = false;
1719
};
1820

@@ -29,7 +31,10 @@ struct IcebergSnapshotGlobalTableFunctionState : public GlobalTableFunctionState
2931
auto global_state = make_uniq<IcebergSnapshotGlobalTableFunctionState>();
3032

3133
FileSystem &fs = FileSystem::GetFileSystem(context);
32-
global_state->metadata_file = IcebergSnapshot::ReadMetaData(bind_data.filename, fs, bind_data.metadata_compression_codec);
34+
35+
auto iceberg_meta_path = IcebergSnapshot::GetMetaDataPath(
36+
bind_data.filename, fs, bind_data.metadata_compression_codec, bind_data.table_version, bind_data.version_name_format);
37+
global_state->metadata_file = IcebergSnapshot::ReadMetaData(iceberg_meta_path, fs, bind_data.metadata_compression_codec);
3338
global_state->metadata_doc =
3439
yyjson_read(global_state->metadata_file.c_str(), global_state->metadata_file.size(), 0);
3540
auto root = yyjson_doc_get_root(global_state->metadata_doc);
@@ -50,19 +55,27 @@ static unique_ptr<FunctionData> IcebergSnapshotsBind(ClientContext &context, Tab
5055
auto bind_data = make_uniq<IcebergSnaphotsBindData>();
5156

5257
string metadata_compression_codec = "none";
58+
string table_version = DEFAULT_VERSION_HINT_FILE;
59+
string version_name_format = DEFAULT_TABLE_VERSION_FORMAT;
5360
bool skip_schema_inference = false;
5461

5562
for (auto &kv : input.named_parameters) {
5663
auto loption = StringUtil::Lower(kv.first);
5764
if (loption == "metadata_compression_codec") {
5865
metadata_compression_codec = StringValue::Get(kv.second);
66+
} else if (loption == "version") {
67+
table_version = StringValue::Get(kv.second);
68+
} else if (loption == "version_name_format") {
69+
version_name_format = StringValue::Get(kv.second);
5970
} else if (loption == "skip_schema_inference") {
6071
skip_schema_inference = BooleanValue::Get(kv.second);
6172
}
6273
}
6374
bind_data->filename = input.inputs[0].ToString();
6475
bind_data->metadata_compression_codec = metadata_compression_codec;
6576
bind_data->skip_schema_inference = skip_schema_inference;
77+
bind_data->table_version = table_version;
78+
bind_data->version_name_format = version_name_format;
6679

6780
names.emplace_back("sequence_number");
6881
return_types.emplace_back(LogicalType::UBIGINT);
@@ -115,6 +128,8 @@ TableFunctionSet IcebergFunctions::GetIcebergSnapshotsFunction() {
115128
TableFunction table_function({LogicalType::VARCHAR}, IcebergSnapshotsFunction, IcebergSnapshotsBind,
116129
IcebergSnapshotGlobalTableFunctionState::Init);
117130
table_function.named_parameters["metadata_compression_codec"] = LogicalType::VARCHAR;
131+
table_function.named_parameters["version"] = LogicalType::VARCHAR;
132+
table_function.named_parameters["version_name_format"] = LogicalType::VARCHAR;
118133
table_function.named_parameters["skip_schema_inference"] = LogicalType::BOOLEAN;
119134
function_set.AddFunction(table_function);
120135
return function_set;

src/include/iceberg_metadata.hpp

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,12 @@ using namespace duckdb_yyjson;
1616

1717
namespace duckdb {
1818

19+
// First arg is the version string, second arg is either empty or ".gz" if gzip
20+
// Allows for both "v###.gz.metadata.json" and "###.metadata.json" styles
21+
static string DEFAULT_TABLE_VERSION_FORMAT = "v%s%s.metadata.json,%s%s.metadata.json";
22+
23+
static string DEFAULT_VERSION_HINT_FILE = "version-hint.text";
24+
1925
struct IcebergColumnDefinition {
2026
public:
2127
static IcebergColumnDefinition ParseFromJson(yyjson_val *val);
@@ -61,19 +67,20 @@ class IcebergSnapshot {
6167
vector<IcebergColumnDefinition> schema;
6268
string metadata_compression_codec = "none";
6369

64-
static IcebergSnapshot GetLatestSnapshot(const string &path, FileSystem &fs, string GetSnapshotByTimestamp, bool skip_schema_inference);
65-
static IcebergSnapshot GetSnapshotById(const string &path, FileSystem &fs, idx_t snapshot_id, string GetSnapshotByTimestamp, bool skip_schema_inference);
66-
static IcebergSnapshot GetSnapshotByTimestamp(const string &path, FileSystem &fs, timestamp_t timestamp, string GetSnapshotByTimestamp, bool skip_schema_inference);
70+
static IcebergSnapshot GetLatestSnapshot(const string &path, FileSystem &fs, string metadata_compression_codec, bool skip_schema_inference);
71+
static IcebergSnapshot GetSnapshotById(const string &path, FileSystem &fs, idx_t snapshot_id, string metadata_compression_codec, bool skip_schema_inference);
72+
static IcebergSnapshot GetSnapshotByTimestamp(const string &path, FileSystem &fs, timestamp_t timestamp, string metadata_compression_codec, bool skip_schema_inference);
6773

6874
static IcebergSnapshot ParseSnapShot(yyjson_val *snapshot, idx_t iceberg_format_version, idx_t schema_id,
6975
vector<yyjson_val *> &schemas, string metadata_compression_codec, bool skip_schema_inference);
70-
static string ReadMetaData(const string &path, FileSystem &fs, string GetSnapshotByTimestamp);
76+
static string GetMetaDataPath(const string &path, FileSystem &fs, string metadata_compression_codec, string table_version, string version_format);
77+
static string ReadMetaData(const string &path, FileSystem &fs, string metadata_compression_codec);
7178
static yyjson_val *GetSnapshots(const string &path, FileSystem &fs, string GetSnapshotByTimestamp);
7279
static unique_ptr<SnapshotParseInfo> GetParseInfo(yyjson_doc &metadata_json);
7380

7481
protected:
7582
//! Internal JSON parsing functions
76-
static string GetTableVersion(const string &path, FileSystem &fs);
83+
static string GetTableVersion(const string &path, FileSystem &fs, string version_format);
7784
static yyjson_val *FindLatestSnapshotInternal(yyjson_val *snapshots);
7885
static yyjson_val *FindSnapshotByIdInternal(yyjson_val *snapshots, idx_t target_id);
7986
static yyjson_val *FindSnapshotByIdTimestampInternal(yyjson_val *snapshots, timestamp_t timestamp);
@@ -124,4 +131,4 @@ struct IcebergTable {
124131
string path;
125132
};
126133

127-
} // namespace duckdb
134+
} // namespace duckdb

0 commit comments

Comments
 (0)