Skip to content

Commit

Permalink
Add punch hole GC
Browse files Browse the repository at this point in the history
Signed-off-by: v01dstar <[email protected]>
  • Loading branch information
v01dstar committed Aug 10, 2024
1 parent 3ef3c19 commit a7ec858
Show file tree
Hide file tree
Showing 29 changed files with 940 additions and 49 deletions.
3 changes: 3 additions & 0 deletions include/titan/db.h
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,9 @@ class TitanDB : public StackableDB {
// "rocksdb.titandb.discardable_ratio_le100_file_num" - returns count of
// file whose discardable ratio is less or equal to 100%.
static const std::string kNumDiscardableRatioLE100File;
// "rocksdb.titandb.kHolePunchableBlobSize" - returns the size of hole
// punchable blobs (no longer referenced in SSTs) in the database.
static const std::string kHolePunchableBlobSize;
};

bool GetProperty(ColumnFamilyHandle* column_family, const Slice& property,
Expand Down
5 changes: 4 additions & 1 deletion include/titan/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ struct TitanCFOptions : public ColumnFamilyOptions {
// data's 0s and 0s created by punch holes).
uint64_t block_size{4096};
bool enable_punch_hole_gc{false};
uint64_t punch_hole_threshold{4 * 1024 * 1024};

TitanCFOptions() = default;
explicit TitanCFOptions(const ColumnFamilyOptions& options)
Expand Down Expand Up @@ -229,12 +230,14 @@ struct MutableTitanCFOptions {
: blob_run_mode(opts.blob_run_mode),
min_blob_size(opts.min_blob_size),
blob_file_compression(opts.blob_file_compression),
blob_file_discardable_ratio(opts.blob_file_discardable_ratio) {}
blob_file_discardable_ratio(opts.blob_file_discardable_ratio),
punch_hole_threshold(opts.punch_hole_threshold) {}

TitanBlobRunMode blob_run_mode;
uint64_t min_blob_size;
CompressionType blob_file_compression;
double blob_file_discardable_ratio;
uint64_t punch_hole_threshold;
};

struct TitanOptions : public TitanDBOptions, public TitanCFOptions {
Expand Down
7 changes: 6 additions & 1 deletion src/blob_file_builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,12 @@ void BlobFileBuilder::FlushSampleRecords(OutContexts* out_ctx) {
void BlobFileBuilder::WriteEncoderData(BlobHandle* handle) {
handle->offset = file_->GetFileSize();
handle->size = encoder_.GetEncodedSize();
live_data_size_ += handle->size;
if (block_size_ > 0) {
live_data_size_ +=
(handle->size + block_size_ - 1) / block_size_ * block_size_;
} else {
live_data_size_ += handle->size;
}

status_ = file_->Append(encoder_.GetHeader());
if (ok()) {
Expand Down
2 changes: 2 additions & 0 deletions src/blob_file_builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ class BlobFileBuilder {
const std::string& GetSmallestKey() { return smallest_key_; }
const std::string& GetLargestKey() { return largest_key_; }

uint64_t GetBlockSize() { return block_size_; }

uint64_t live_data_size() const { return live_data_size_; }

private:
Expand Down
9 changes: 9 additions & 0 deletions src/blob_file_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,15 @@ class BlobFileManager {
(void)handles;
return Status::OK();
}

// Updates the metadata of a batch of blob files belonging to column family
// `cf_id`. This is used to update the metadata of the files after they have
// been punched with holes, so bookkeeping reflects the reclaimed space.
// The default implementation is a no-op that reports success; concrete
// managers are expected to override it to persist the updated metadata.
virtual Status BatchUpdateFiles(
    uint32_t cf_id, const std::vector<std::shared_ptr<BlobFileMeta>>& files) {
  (void)cf_id;
  (void)files;
  return Status::OK();
}
};

} // namespace titandb
Expand Down
16 changes: 16 additions & 0 deletions src/blob_file_set.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,22 @@ class BlobFileSet {

bool IsOpened() { return opened_.load(std::memory_order_acquire); }

// Returns the Titan block size configured for column family `cf_id`, or 0
// when punch hole GC is disabled for it (or its blob storage is gone).
// A zero result tells callers to skip block alignment entirely.
uint64_t GetBlockSize(uint32_t cf_id) {
  MutexLock l(mutex_);
  auto storage = GetBlobStorage(cf_id).lock();
  if (storage == nullptr) {
    return 0;
  }
  const auto& opts = storage->cf_options();
  return opts.enable_punch_hole_gc ? opts.block_size : 0;
}

// Returns a snapshot of the per-blob-file block sizes for column family
// `cf_id`. Yields an empty map when the blob storage no longer exists.
std::unordered_map<uint64_t, uint64_t> GetFileBlockSizes(uint32_t cf_id) {
  MutexLock l(mutex_);
  if (auto storage = GetBlobStorage(cf_id).lock()) {
    return storage->GetFileBlockSizes();
  }
  return {};
}

private:
struct ManifestWriter;

Expand Down
34 changes: 30 additions & 4 deletions src/blob_file_size_collector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,13 @@ namespace titandb {

// Creates a BlobFileSizeCollector for the column family in `context`.
// The scraped diff interleaved the pre-change signature/body with the new
// one, leaving invalid C++; this is the resolved post-change version.
TablePropertiesCollector*
BlobFileSizeCollectorFactory::CreateTablePropertiesCollector(
    rocksdb::TablePropertiesCollectorFactory::Context context) {
  // With a blob file set we can resolve per-CF block sizes, which the
  // collector needs to align blob sizes for punch hole GC accounting.
  if (blob_file_set_ != nullptr) {
    return new BlobFileSizeCollector(
        blob_file_set_->GetBlockSize(context.column_family_id),
        blob_file_set_->GetFileBlockSizes(context.column_family_id));
  }
  // No blob file set: punch hole GC is treated as disabled, no alignment.
  return new BlobFileSizeCollector(0, {});
}

const std::string BlobFileSizeCollector::kPropertiesName =
Expand Down Expand Up @@ -57,11 +62,32 @@ Status BlobFileSizeCollector::AddUserKey(const Slice& /* key */,
return s;
}

auto size = index.blob_handle.size;
if (default_block_size_ > 0 && !file_block_sizes_.empty()) {
// If the blob file cannot be found in the block size map, it must be a
// newly created file that has not yet been added to blob_file_set; in this
// case, we know the block size of the file is default_block_size_.
// If the blob file can be found in the block size map, it implies we are
// moving the reference only, while keeping the blob in the original file;
// in this case, we should use the block size of the original file.
uint64_t block_size = default_block_size_;
if (!file_block_sizes_.empty()) {
auto iter = file_block_sizes_.find(index.file_number);
if (iter != file_block_sizes_.end()) {
block_size = iter->second;
}
}
if (block_size > 0) {
// Align blob size with block size.
size = (size + block_size - 1) / block_size * block_size;
}
}

auto iter = blob_files_size_.find(index.file_number);
if (iter == blob_files_size_.end()) {
blob_files_size_[index.file_number] = index.blob_handle.size;
blob_files_size_[index.file_number] = size;
} else {
iter->second += index.blob_handle.size;
iter->second += size;
}

return Status::OK();
Expand Down
17 changes: 17 additions & 0 deletions src/blob_file_size_collector.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,31 @@ namespace titandb {
// Factory producing BlobFileSizeCollector instances wired with the per-CF
// block size information needed for punch hole GC accounting.
class BlobFileSizeCollectorFactory final
    : public TablePropertiesCollectorFactory {
 public:
  // If punch_hole_gc is enabled, then blob_file_set must be provided.
  // If blob_file_set is not provided, then punch_hole_gc will be considered
  // disabled, blob size will not align with block size.
  // `explicit` prevents accidental implicit conversion from a BlobFileSet*
  // to a factory. The pointer is non-owning and must outlive this factory.
  explicit BlobFileSizeCollectorFactory(BlobFileSet* blob_file_set = nullptr)
      : blob_file_set_(blob_file_set) {}
  BlobFileSizeCollectorFactory(const BlobFileSizeCollectorFactory&) = delete;
  void operator=(const BlobFileSizeCollectorFactory&) = delete;
  TablePropertiesCollector* CreateTablePropertiesCollector(
      TablePropertiesCollectorFactory::Context context) override;

  const char* Name() const override { return "BlobFileSizeCollector"; }

 private:
  BlobFileSet* blob_file_set_;  // not owned
};

class BlobFileSizeCollector final : public TablePropertiesCollector {
public:
const static std::string kPropertiesName;

// `default_block_size`: alignment block size for blob files absent from
// `file_block_sizes` (i.e. files newly created by this flush/compaction);
// 0 disables alignment.
// `file_block_sizes`: per-file block sizes of existing blob files. Taken by
// value and moved into the member to avoid copying the whole map.
BlobFileSizeCollector(uint64_t default_block_size,
                      std::unordered_map<uint64_t, uint64_t> file_block_sizes)
    : default_block_size_(default_block_size),
      file_block_sizes_(std::move(file_block_sizes)) {}

static bool Encode(const std::map<uint64_t, uint64_t>& blob_files_size,
std::string* result);
static bool Decode(Slice* slice,
Expand All @@ -38,6 +53,8 @@ class BlobFileSizeCollector final : public TablePropertiesCollector {

private:
std::map<uint64_t, uint64_t> blob_files_size_;
uint64_t default_block_size_;
std::unordered_map<uint64_t, uint64_t> file_block_sizes_;
};

} // namespace titandb
Expand Down
7 changes: 7 additions & 0 deletions src/blob_format.cc
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ void BlobFileMeta::EncodeTo(std::string* dst) const {
PutVarint64(dst, block_size_);
PutLengthPrefixedSlice(dst, smallest_key_);
PutLengthPrefixedSlice(dst, largest_key_);
PutVarint64(dst, effective_file_size_);
}

Status BlobFileMeta::DecodeFromV1(Slice* src) {
Expand Down Expand Up @@ -189,6 +190,12 @@ Status BlobFileMeta::DecodeFrom(Slice* src) {
} else {
return Status::Corruption("BlobLargestKey decode failed");
}
uint64_t effective_file_size;
if (!GetVarint64(src, &effective_file_size)) {
return Status::Corruption(
"BlobFileMeta hole_punchable_size_ decode failed");
}
effective_file_size_ = effective_file_size;
return Status::OK();
}

Expand Down
24 changes: 24 additions & 0 deletions src/blob_format.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,10 +248,14 @@ class BlobFileMeta {
uint64_t file_size() const { return file_size_; }
uint64_t live_data_size() const { return live_data_size_; }
uint32_t file_level() const { return file_level_; }
uint64_t block_size() const { return block_size_; }
const std::string& smallest_key() const { return smallest_key_; }
const std::string& largest_key() const { return largest_key_; }
int64_t effective_file_size() const { return effective_file_size_; }

void set_live_data_size(int64_t size) { live_data_size_ = size; }
// This should be called with db mutex held.
void set_effective_file_size(int64_t size) { effective_file_size_ = size; }
uint64_t file_entries() const { return file_entries_; }
FileState file_state() const { return state_; }
bool is_obsolete() const { return state_ == FileState::kObsolete; }
Expand All @@ -275,6 +279,10 @@ class BlobFileMeta {
(file_size_ - kBlobMaxHeaderSize - kBlobFooterSize));
}
TitanInternalStats::StatsType GetDiscardableRatioLevel() const;
// This should be called with db mutex held.
// Returns how many bytes punch hole GC could reclaim from this file, i.e.
// effective_file_size_ - live_data_size_, clamped at zero. Without the
// clamp a stale or unset effective_file_size_ (it defaults to 0, e.g. for
// metas decoded from older versions) would make the int64 difference
// negative and wrap to a huge uint64, making the file look fully punchable.
uint64_t GetHolePunchableSize() const {
  int64_t punchable = effective_file_size_ - live_data_size_;
  return punchable > 0 ? static_cast<uint64_t>(punchable) : 0;
}
void Dump(bool with_keys) const;

private:
Expand All @@ -291,6 +299,18 @@ class BlobFileMeta {
std::string smallest_key_;
std::string largest_key_;

// The effective size of current file. This is different from `file_size_`, as
// `file_size_` is the original size of the file, and does not consider space
// reclaimed by punch hole GC.
// We can't use file system's `st_blocks` to get the logical size, because
// the file system's block size may be different from Titan's block size.
// This is used to calculate the size of the punchable hole. i.e.
// effective_file_size_ - live_data_size_.
// This might be bigger than the actual size of the file, when Titan crashes
// before updating the `effective_file_size_` during punch hole GC. This is
// fine, as it will be corrected when the file is chosen for GC next time.
int64_t effective_file_size_{0};

// Not persistent field

// Size of data with reference from SST files.
Expand All @@ -303,7 +323,11 @@ class BlobFileMeta {
// So when state_ == kPendingLSM, it uses this to record the delta as a
// positive number if any later compaction is trigger before previous
// `OnCompactionCompleted()` is called.
// The size is aligned with block size, when punch hole GC is enabled.
std::atomic<int64_t> live_data_size_{0};
// This is different from `file_size_`, as `file_size_` is the original size
// of the file, and does not consider space reclaimed by punch hole GC.
std::atomic<int64_t> disk_usage_{0};
std::atomic<FileState> state_{FileState::kNone};
};

Expand Down
6 changes: 4 additions & 2 deletions src/blob_gc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@ namespace rocksdb {
namespace titandb {

// Constructs a GC job over `blob_files`.
// `need_trigger_next`: whether another GC round should be scheduled after
// this one. `punch_hole_gc`: whether this job reclaims space by punching
// holes instead of rewriting files.
// Note: the scraped diff interleaved the old 3-arg signature with the new
// 4-arg one; this is the resolved post-change version. Also, `blob_files`
// arrives by rvalue reference, so move it into `inputs_` instead of copying
// the whole vector of shared_ptrs (each copy bumps a refcount atomically).
BlobGC::BlobGC(std::vector<std::shared_ptr<BlobFileMeta>>&& blob_files,
               TitanCFOptions&& _titan_cf_options, bool need_trigger_next,
               bool punch_hole_gc)
    : inputs_(std::move(blob_files)),
      titan_cf_options_(std::move(_titan_cf_options)),
      trigger_next_(need_trigger_next),
      punch_hole_gc_(punch_hole_gc) {
  MarkFilesBeingGC();
}

Expand Down
6 changes: 5 additions & 1 deletion src/blob_gc.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ namespace titandb {
class BlobGC {
public:
BlobGC(std::vector<std::shared_ptr<BlobFileMeta>>&& blob_files,
TitanCFOptions&& _titan_cf_options, bool need_trigger_next);
TitanCFOptions&& _titan_cf_options, bool need_trigger_next,
bool punch_hole_gc = false);

// No copying allowed
BlobGC(const BlobGC&) = delete;
Expand All @@ -40,13 +41,16 @@ class BlobGC {

bool trigger_next() { return trigger_next_; }

bool punch_hole_gc() { return punch_hole_gc_; }

private:
std::vector<std::shared_ptr<BlobFileMeta>> inputs_;
std::vector<BlobFileMeta*> outputs_;
TitanCFOptions titan_cf_options_;
ColumnFamilyHandle* cfh_{nullptr};
// Whether need to trigger gc after this gc or not
const bool trigger_next_;
const bool punch_hole_gc_;
};

struct GCScore {
Expand Down
55 changes: 54 additions & 1 deletion src/blob_gc_picker.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,19 @@ BasicBlobGCPicker::BasicBlobGCPicker(TitanDBOptions db_options,

BasicBlobGCPicker::~BasicBlobGCPicker() {}

std::unique_ptr<BlobGC> BasicBlobGCPicker::PickBlobGC(
// Picks files for the next GC round. Regular (rewrite-based) GC takes
// priority; punch hole GC is attempted only when no regular candidate
// exists and the caller allows it. Returns nullptr when there is nothing
// to collect.
std::unique_ptr<BlobGC> BasicBlobGCPicker::PickBlobGC(BlobStorage* blob_storage,
                                                      bool allow_punch_hole) {
  if (auto gc = PickRegularBlobGC(blob_storage)) {
    return gc;
  }
  return allow_punch_hole ? PickPunchHoleGC(blob_storage) : nullptr;
}

std::unique_ptr<BlobGC> BasicBlobGCPicker::PickRegularBlobGC(
BlobStorage* blob_storage) {
Status s;
std::vector<std::shared_ptr<BlobFileMeta>> blob_files;
Expand Down Expand Up @@ -103,6 +115,47 @@ std::unique_ptr<BlobGC> BasicBlobGCPicker::PickBlobGC(
std::move(blob_files), std::move(cf_options_), maybe_continue_next_time));
}

// Picks blob files for a punch hole GC job, walking files in descending
// punch-hole score order until the batch budget is reached. Returns nullptr
// when no candidate file is found.
std::unique_ptr<BlobGC> BasicBlobGCPicker::PickPunchHoleGC(
    BlobStorage* blob_storage) {
  std::vector<std::shared_ptr<BlobFileMeta>> blob_files;

  uint64_t batch_size = 0;
  bool stop_picking = false;
  bool maybe_continue_next_time = false;

  for (auto& gc_score : blob_storage->punch_hole_score()) {
    auto blob_file = blob_storage->FindFile(gc_score.file_number).lock();
    if (blob_file == nullptr) {
      // The file disappeared after the score snapshot was taken; the
      // original code would have dereferenced a null pointer in the log
      // statement below, so bail out of this entry first.
      continue;
    }
    if (!CheckBlobFile(blob_file.get())) {
      // Skip this file if it is being GCed or had already been GCed.
      TITAN_LOG_INFO(db_options_.info_log,
                     "Blob file %" PRIu64 " no need punch hole gc",
                     blob_file->file_number());
      continue;
    }
    if (!stop_picking) {
      blob_files.emplace_back(blob_file);
      batch_size += blob_file->file_size();
      if (batch_size >= cf_options_.max_gc_batch_size) {
        // Stop picking files for this gc, but keep scanning to decide
        // whether another punch hole gc should be triggered after this one.
        stop_picking = true;
      }
    } else {
      // TODO: add a batch threshold for punch hole gc.
      maybe_continue_next_time = true;
      break;
    }
  }
  if (blob_files.empty()) return nullptr;
  return std::unique_ptr<BlobGC>(new BlobGC(
      std::move(blob_files), std::move(cf_options_), maybe_continue_next_time,
      /*punch_hole_gc=*/true));
}

bool BasicBlobGCPicker::CheckBlobFile(BlobFileMeta* blob_file) const {
assert(blob_file == nullptr ||
blob_file->file_state() != BlobFileMeta::FileState::kNone);
Expand Down
Loading

0 comments on commit a7ec858

Please sign in to comment.