Skip to content

Commit 2f29f01

Browse files
committed
Use minmax info for removing dangling deletes from plan
1 parent 0858a19 commit 2f29f01

File tree

1 file changed

+50
-7
lines changed

1 file changed

+50
-7
lines changed

iceberg/tea_scan.cpp

Lines changed: 50 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -731,7 +731,16 @@ class ScanMetadataBuilder {
731731
for (auto& [partition_key, partition_map] : partitions) {
732732
ScanMetadata::Partition partition;
733733
for (auto& [seqnum, layer] : partition_map) {
734-
partition.emplace_back(std::move(layer));
734+
ScanMetadata::Layer result_layer;
735+
result_layer.data_entries_ = std::move(layer.data_entries_);
736+
result_layer.equality_delete_entries_ = std::move(layer.equality_delete_entries_);  // take them from the source layer; a self-move here would drop every equality delete
737+
738+
result_layer.positional_delete_entries_.reserve(layer.positional_delete_entries_.size());
739+
for (auto& pos_delete : layer.positional_delete_entries_) {
740+
result_layer.positional_delete_entries_.emplace_back(std::move(pos_delete.positional_delete_.path));
741+
}
742+
743+
partition.emplace_back(std::move(result_layer));
735744
}
736745
result.partitions.emplace_back(std::move(partition));
737746
}
@@ -779,7 +788,19 @@ class ScanMetadataBuilder {
779788
// - There is no deletion vector that must be applied to the data file (when added, such a vector must
780789
// contain
781790
// all deletes from existing position delete files)
782-
AddPositionDeletes(serialized_partition_key, sequence_number, entry.data_file.file_path);
791+
std::optional<std::pair<std::string, std::string>> min_max_referenced_path;
792+
constexpr uint32_t kFilePathId = 2147483546;
793+
if (entry.data_file.lower_bounds.contains(kFilePathId) && entry.data_file.upper_bounds.contains(kFilePathId)) {
794+
const std::vector<uint8_t>& min_bytes = entry.data_file.lower_bounds.at(kFilePathId);
795+
const std::vector<uint8_t>& max_bytes = entry.data_file.upper_bounds.at(kFilePathId);
796+
797+
std::string min_path(min_bytes.begin(), min_bytes.end());
798+
std::string max_path(max_bytes.begin(), max_bytes.end());
799+
800+
min_max_referenced_path.emplace(std::move(min_path), std::move(max_path));
801+
}
802+
AddPositionDeletes(serialized_partition_key, sequence_number, entry.data_file.file_path,
803+
min_max_referenced_path);
783804
break;
784805
}
785806

@@ -807,8 +828,10 @@ class ScanMetadataBuilder {
807828
}
808829

809830
virtual void AddPositionDeletes(const std::string& serialized_partition_key, SequenceNumber sequence_number,
810-
const std::string& path) {
811-
partitions[serialized_partition_key][sequence_number].positional_delete_entries_.emplace_back(path);
831+
const std::string& path,
832+
const std::optional<std::pair<std::string, std::string>>& min_max_referenced_path) {
833+
partitions[serialized_partition_key][sequence_number].positional_delete_entries_.emplace_back(
834+
path, min_max_referenced_path);
812835
}
813836

814837
virtual void AddGlobalEqualityDeletes(SequenceNumber sequence_number, const std::string& path,
@@ -850,7 +873,26 @@ class ScanMetadataBuilder {
850873
const TableMetadataV2& table_metadata_;
851874
std::shared_ptr<const iceberg::Schema> schema_;
852875

853-
std::map<std::string, std::map<SequenceNumber, ScanMetadata::Layer>> partitions;
876+
struct PositionalDeleteWithExtraInfo {
877+
PositionalDeleteInfo positional_delete_;
878+
std::optional<std::pair<std::string, std::string>> min_max_referenced_path_;
879+
880+
PositionalDeleteWithExtraInfo(std::string path,
881+
std::optional<std::pair<std::string, std::string>> min_max_referenced_path)
882+
: positional_delete_(std::move(path)), min_max_referenced_path_(std::move(min_max_referenced_path)) {}
883+
};
884+
885+
struct LayerWithExtraInfo {
886+
std::vector<DataEntry> data_entries_;
887+
std::vector<PositionalDeleteWithExtraInfo> positional_delete_entries_;
888+
std::vector<EqualityDeleteInfo> equality_delete_entries_;
889+
890+
bool operator==(const LayerWithExtraInfo& layer) const = default;
891+
892+
bool Empty() const;
893+
};
894+
895+
std::map<std::string, std::map<SequenceNumber, LayerWithExtraInfo>> partitions;
854896
// if there are k partitions and t global equality delete entries, k * t entries will be created
855897
// TODO(gmusya): improve
856898
std::map<SequenceNumber, std::vector<EqualityDeleteInfo>> global_equality_deletes;
@@ -868,9 +910,10 @@ class ScanMetadataBuilderMT : public ScanMetadataBuilder {
868910
}
869911

870912
void AddPositionDeletes(const std::string& serialized_partition_key, SequenceNumber sequence_number,
871-
const std::string& path) override {
913+
const std::string& path,
914+
const std::optional<std::pair<std::string, std::string>>& min_max_referenced_path) override {
872915
std::lock_guard<std::mutex> guard(mutex_);
873-
ScanMetadataBuilder::AddPositionDeletes(serialized_partition_key, sequence_number, path);
916+
ScanMetadataBuilder::AddPositionDeletes(serialized_partition_key, sequence_number, path, min_max_referenced_path);
874917
}
875918

876919
void AddGlobalEqualityDeletes(SequenceNumber sequence_number, const std::string& path,

0 commit comments

Comments
 (0)