Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add punch hole GC #326

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions include/titan/db.h
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,9 @@ class TitanDB : public StackableDB {
// "rocksdb.titandb.discardable_ratio_le100_file_num" - returns count of
// file whose discardable ratio is less or equal to 100%.
static const std::string kNumDiscardableRatioLE100File;
// "rocksdb.titandb.kNumHolePunchableBlobSize" - returns the size of hole
// punchable blobs (no longer referenced in SSTs) in the database.
static const std::string kHolePunchableBlobSize;
};

bool GetProperty(ColumnFamilyHandle* column_family, const Slice& property,
Expand Down
5 changes: 4 additions & 1 deletion include/titan/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ struct TitanCFOptions : public ColumnFamilyOptions {
// data's 0s and 0s created by punch holes).
uint64_t block_size{4096};
bool enable_punch_hole_gc{false};
uint64_t punch_hole_threshold{4 * 1024 * 1024};
v01dstar marked this conversation as resolved.
Show resolved Hide resolved

TitanCFOptions() = default;
explicit TitanCFOptions(const ColumnFamilyOptions& options)
Expand Down Expand Up @@ -229,12 +230,14 @@ struct MutableTitanCFOptions {
: blob_run_mode(opts.blob_run_mode),
min_blob_size(opts.min_blob_size),
blob_file_compression(opts.blob_file_compression),
blob_file_discardable_ratio(opts.blob_file_discardable_ratio) {}
blob_file_discardable_ratio(opts.blob_file_discardable_ratio),
punch_hole_threshold(opts.punch_hole_threshold) {}

TitanBlobRunMode blob_run_mode;
uint64_t min_blob_size;
CompressionType blob_file_compression;
double blob_file_discardable_ratio;
uint64_t punch_hole_threshold;
};

struct TitanOptions : public TitanDBOptions, public TitanCFOptions {
Expand Down
7 changes: 6 additions & 1 deletion src/blob_file_builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,12 @@ void BlobFileBuilder::FlushSampleRecords(OutContexts* out_ctx) {
void BlobFileBuilder::WriteEncoderData(BlobHandle* handle) {
handle->offset = file_->GetFileSize();
handle->size = encoder_.GetEncodedSize();
live_data_size_ += handle->size;
if (block_size_ > 0) {
live_data_size_ +=
(handle->size + block_size_ - 1) / block_size_ * block_size_;
} else {
live_data_size_ += handle->size;
}
v01dstar marked this conversation as resolved.
Show resolved Hide resolved

status_ = file_->Append(encoder_.GetHeader());
if (ok()) {
Expand Down
2 changes: 2 additions & 0 deletions src/blob_file_builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ class BlobFileBuilder {
const std::string& GetSmallestKey() { return smallest_key_; }
const std::string& GetLargestKey() { return largest_key_; }

uint64_t GetBlockSize() { return block_size_; }

uint64_t live_data_size() const { return live_data_size_; }

private:
Expand Down
9 changes: 9 additions & 0 deletions src/blob_file_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,15 @@ class BlobFileManager {
(void)handles;
return Status::OK();
}

// Updates the metadata of `files`, which belong to column family `cf_id`.
// Intended to be called after holes have been punched into the files.
// This default implementation ignores both arguments and returns OK.
virtual Status BatchUpdateFiles(
    uint32_t cf_id, const std::vector<std::shared_ptr<BlobFileMeta>>& files) {
  static_cast<void>(files);
  static_cast<void>(cf_id);
  return Status::OK();
}
};

} // namespace titandb
Expand Down
16 changes: 16 additions & 0 deletions src/blob_file_set.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,22 @@ class BlobFileSet {

bool IsOpened() { return opened_.load(std::memory_order_acquire); }

uint64_t GetBlockSize(uint32_t cf_id) {
MutexLock l(mutex_);
auto storage = GetBlobStorage(cf_id).lock();
if (storage != nullptr && storage->cf_options().enable_punch_hole_gc) {
return storage->cf_options().block_size;
}
return 0;
}

std::unordered_map<uint64_t, uint64_t> GetFileBlockSizes(uint32_t cf_id) {
MutexLock l(mutex_);
auto storage = GetBlobStorage(cf_id).lock();
return storage ? storage->GetFileBlockSizes()
: std::unordered_map<uint64_t, uint64_t>();
}

private:
struct ManifestWriter;

Expand Down
34 changes: 30 additions & 4 deletions src/blob_file_size_collector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,13 @@ namespace titandb {

// Creates a new BlobFileSizeCollector. The caller receives a raw pointer
// allocated with `new`.
TablePropertiesCollector*
BlobFileSizeCollectorFactory::CreateTablePropertiesCollector(
rocksdb::TablePropertiesCollectorFactory::Context /* context */) {
return new BlobFileSizeCollector();
rocksdb::TablePropertiesCollectorFactory::Context context) {
if (blob_file_set_ != nullptr) {
// Seed the collector with the column family's block size and a copy of the
// per-file block sizes.
// NOTE(review): the two lookups are separate calls, so they are not
// guaranteed to observe the same state of the blob file set.
return new BlobFileSizeCollector(
blob_file_set_->GetBlockSize(context.column_family_id),
blob_file_set_->GetFileBlockSizes(context.column_family_id));
}
// No blob file set: block size 0 and an empty map, so sizes are not aligned.
return new BlobFileSizeCollector(0, {});
}

const std::string BlobFileSizeCollector::kPropertiesName =
Expand Down Expand Up @@ -57,11 +62,32 @@ Status BlobFileSizeCollector::AddUserKey(const Slice& /* key */,
return s;
}

auto size = index.blob_handle.size;
if (default_block_size_ > 0 && !file_block_sizes_.empty()) {
// If the blob file cannot be found in the block size map, it must be a
// newly created file that has not been added blob_file_set, in this case,
// we know the block size of the file is default_block_size_.
// If the blob file can be found in the block size map, it implies we are
// moving the reference only, while keeping the blob at the original file,
// in this case, we should use the block size of the original file.
uint64_t block_size = default_block_size_;
if (!file_block_sizes_.empty()) {
auto iter = file_block_sizes_.find(index.file_number);
if (iter != file_block_sizes_.end()) {
block_size = iter->second;
}
}
if (block_size > 0) {
// Align blob size with block size.
size = (size + block_size - 1) / block_size * block_size;
v01dstar marked this conversation as resolved.
Show resolved Hide resolved
}
}

auto iter = blob_files_size_.find(index.file_number);
if (iter == blob_files_size_.end()) {
blob_files_size_[index.file_number] = index.blob_handle.size;
blob_files_size_[index.file_number] = size;
} else {
iter->second += index.blob_handle.size;
iter->second += size;
}

return Status::OK();
Expand Down
17 changes: 17 additions & 0 deletions src/blob_file_size_collector.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,31 @@ namespace titandb {
// Creates BlobFileSizeCollector instances through the
// TablePropertiesCollectorFactory interface. The factory is neither copyable
// nor copy-assignable.
class BlobFileSizeCollectorFactory final
    : public TablePropertiesCollectorFactory {
 public:
  // If punch_hole_gc is enabled, then blob_file_set must be provided.
  // If blob_file_set is not provided, then punch_hole_gc will be considered
  // disabled, blob size will not align with block size.
  //
  // Marked explicit so that a BlobFileSet* (or nullptr) is never implicitly
  // converted into a factory.
  explicit BlobFileSizeCollectorFactory(BlobFileSet* blob_file_set = nullptr)
      : blob_file_set_(blob_file_set) {}
  BlobFileSizeCollectorFactory(const BlobFileSizeCollectorFactory&) = delete;
  void operator=(const BlobFileSizeCollectorFactory&) = delete;

  // Returns a newly allocated collector for the table described by `context`.
  TablePropertiesCollector* CreateTablePropertiesCollector(
      TablePropertiesCollectorFactory::Context context) override;

  // Name under which this factory identifies itself.
  const char* Name() const override { return "BlobFileSizeCollector"; }

 private:
  // May be null (see the constructor comment). This class stores the pointer
  // only; nothing visible here deletes it.
  BlobFileSet* blob_file_set_;
};

class BlobFileSizeCollector final : public TablePropertiesCollector {
public:
const static std::string kPropertiesName;

// default_block_size: stored in default_block_size_.
// file_block_sizes: map of uint64_t to uint64_t stored in file_block_sizes_.
// It is taken by value and moved into the member, so callers passing a
// temporary pay for no extra copy of the map.
BlobFileSizeCollector(uint64_t default_block_size,
                      std::unordered_map<uint64_t, uint64_t> file_block_sizes)
    : default_block_size_(default_block_size),
      file_block_sizes_(std::move(file_block_sizes)) {}

static bool Encode(const std::map<uint64_t, uint64_t>& blob_files_size,
std::string* result);
static bool Decode(Slice* slice,
Expand All @@ -38,6 +53,8 @@ class BlobFileSizeCollector final : public TablePropertiesCollector {

private:
std::map<uint64_t, uint64_t> blob_files_size_;
uint64_t default_block_size_;
std::unordered_map<uint64_t, uint64_t> file_block_sizes_;
};

} // namespace titandb
Expand Down
7 changes: 7 additions & 0 deletions src/blob_format.cc
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ void BlobFileMeta::EncodeTo(std::string* dst) const {
PutVarint64(dst, block_size_);
PutLengthPrefixedSlice(dst, smallest_key_);
PutLengthPrefixedSlice(dst, largest_key_);
PutVarint64(dst, effective_file_size_);
}

Status BlobFileMeta::DecodeFromV1(Slice* src) {
Expand Down Expand Up @@ -189,6 +190,12 @@ Status BlobFileMeta::DecodeFrom(Slice* src) {
} else {
return Status::Corruption("BlobLargestKey decode failed");
}
uint64_t effective_file_size;
if (!GetVarint64(src, &effective_file_size)) {
return Status::Corruption(
"BlobFileMeta hole_punchable_size_ decode failed");
}
effective_file_size_ = effective_file_size;
return Status::OK();
}

Expand Down
24 changes: 24 additions & 0 deletions src/blob_format.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,10 +248,14 @@ class BlobFileMeta {
// Read-only accessors; each returns the member of the same name.
uint64_t file_size() const { return file_size_; }
// NOTE(review): the return type here is unsigned; if live_data_size_ is a
// signed value that can go negative, the result would wrap - confirm callers.
uint64_t live_data_size() const { return live_data_size_; }
uint32_t file_level() const { return file_level_; }
uint64_t block_size() const { return block_size_; }
const std::string& smallest_key() const { return smallest_key_; }
const std::string& largest_key() const { return largest_key_; }
int64_t effective_file_size() const { return effective_file_size_; }

// Overwrites live_data_size_ with `size`.
void set_live_data_size(int64_t size) { live_data_size_ = size; }
// This should be called with db mutex held.
void set_effective_file_size(int64_t size) { effective_file_size_ = size; }
uint64_t file_entries() const { return file_entries_; }
FileState file_state() const { return state_; }
// True when the file's state is FileState::kObsolete.
bool is_obsolete() const { return state_ == FileState::kObsolete; }
Expand All @@ -275,6 +279,10 @@ class BlobFileMeta {
(file_size_ - kBlobMaxHeaderSize - kBlobFooterSize));
}
TitanInternalStats::StatsType GetDiscardableRatioLevel() const;
// This should be called with db mutex held.
v01dstar marked this conversation as resolved.
Show resolved Hide resolved
// Returns effective_file_size_ - live_data_size_, i.e. the number of bytes
// that are no longer live and could be reclaimed by punching holes.
//
// Both operands are signed. If the live data size exceeds the effective file
// size (for example while effective_file_size_ is still 0), the difference
// is negative; it is clamped to 0 here instead of being converted to a huge
// unsigned value.
uint64_t GetHolePunchableSize() const {
  const int64_t live_size = live_data_size_.load();
  const int64_t punchable = effective_file_size_ - live_size;
  return punchable > 0 ? static_cast<uint64_t>(punchable) : 0;
}
void Dump(bool with_keys) const;

private:
Expand All @@ -291,6 +299,18 @@ class BlobFileMeta {
std::string smallest_key_;
std::string largest_key_;

// The effective size of current file. This is different from `file_size_`, as
// `file_size_` is the original size of the file, and does not consider space
// reclaimed by punch hole GC.
// We can't use file system's `st_blocks` to get the logical size, because
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's okay to get the size as effective_file_size after restart. The size doesn't have to be so precise. Indeed, it may have false positive for triggering punch hole GC, but it would be updated to the accurate number after the gc scan.
Then, we can get rid of updating manifest for effective_file_size

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

// the file system's block size may be different from Titan's block size.
// This is used to calculate the size of the punchable hole. i.e.
// effective_file_size_ - live_data_size_.
// This might be bigger than the actual size of the file, when Titan crashes
// before updating the `effective_file_size_` during punch hole GC. This is
// fine, as it will be corrected when the file is chosen for GC next time.
int64_t effective_file_size_{0};

// Not persistent field

// Size of data with reference from SST files.
Expand All @@ -303,7 +323,11 @@ class BlobFileMeta {
// So when state_ == kPendingLSM, it uses this to record the delta as a
// positive number if any later compaction is trigger before previous
// `OnCompactionCompleted()` is called.
// The size is aligned with block size, when punch hole GC is enabled.
std::atomic<int64_t> live_data_size_{0};
// This is different from `file_size_`, as `file_size_` is the original size
// of the file, and does not consider space reclaimed by punch hole GC.
std::atomic<int64_t> disk_usage_{0};
std::atomic<FileState> state_{FileState::kNone};
};

Expand Down
6 changes: 4 additions & 2 deletions src/blob_gc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@ namespace rocksdb {
namespace titandb {

// Builds a GC job over `blob_files` using `_titan_cf_options`, records
// whether a follow-up GC should be triggered and whether this is a punch
// hole GC, then calls MarkFilesBeingGC().
// NOTE(review): `blob_files` is a named rvalue reference, so
// `inputs_(blob_files)` copies the vector; it is not moved.
BlobGC::BlobGC(std::vector<std::shared_ptr<BlobFileMeta>>&& blob_files,
TitanCFOptions&& _titan_cf_options, bool need_trigger_next)
TitanCFOptions&& _titan_cf_options, bool need_trigger_next,
bool punch_hole_gc)
: inputs_(blob_files),
titan_cf_options_(std::move(_titan_cf_options)),
trigger_next_(need_trigger_next) {
trigger_next_(need_trigger_next),
punch_hole_gc_(punch_hole_gc) {
MarkFilesBeingGC();
}

Expand Down
6 changes: 5 additions & 1 deletion src/blob_gc.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ namespace titandb {
class BlobGC {
public:
BlobGC(std::vector<std::shared_ptr<BlobFileMeta>>&& blob_files,
TitanCFOptions&& _titan_cf_options, bool need_trigger_next);
TitanCFOptions&& _titan_cf_options, bool need_trigger_next,
bool punch_hole_gc = false);

// No copying allowed
BlobGC(const BlobGC&) = delete;
Expand All @@ -40,13 +41,16 @@ class BlobGC {

// Returns trigger_next_: whether a GC needs to be triggered after this one.
bool trigger_next() { return trigger_next_; }

// Returns punch_hole_gc_, the flag passed to the constructor (false by
// default).
bool punch_hole_gc() { return punch_hole_gc_; }

private:
std::vector<std::shared_ptr<BlobFileMeta>> inputs_;
std::vector<BlobFileMeta*> outputs_;
TitanCFOptions titan_cf_options_;
ColumnFamilyHandle* cfh_{nullptr};
// Whether need to trigger gc after this gc or not
const bool trigger_next_;
const bool punch_hole_gc_;
};

struct GCScore {
Expand Down
55 changes: 54 additions & 1 deletion src/blob_gc_picker.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,19 @@ BasicBlobGCPicker::BasicBlobGCPicker(TitanDBOptions db_options,

BasicBlobGCPicker::~BasicBlobGCPicker() {}

std::unique_ptr<BlobGC> BasicBlobGCPicker::PickBlobGC(
// Picks the next GC job for `blob_storage`. A regular GC job is preferred;
// a punch hole GC job is attempted only when no regular job was picked and
// `allow_punch_hole` is true. Returns nullptr when nothing was picked.
std::unique_ptr<BlobGC> BasicBlobGCPicker::PickBlobGC(BlobStorage* blob_storage,
                                                      bool allow_punch_hole) {
  std::unique_ptr<BlobGC> picked = PickRegularBlobGC(blob_storage);
  if (!picked && allow_punch_hole) {
    picked = PickPunchHoleGC(blob_storage);
  }
  return picked;
}

std::unique_ptr<BlobGC> BasicBlobGCPicker::PickRegularBlobGC(
BlobStorage* blob_storage) {
Status s;
std::vector<std::shared_ptr<BlobFileMeta>> blob_files;
Expand Down Expand Up @@ -103,6 +115,47 @@ std::unique_ptr<BlobGC> BasicBlobGCPicker::PickBlobGC(
std::move(blob_files), std::move(cf_options_), maybe_continue_next_time));
}

// Picks blob files for a punch hole GC job.
//
// Iterates blob_storage->punch_hole_score() in order. Files rejected by
// CheckBlobFile() are skipped. Accepted files are added to the batch until
// their accumulated file_size() reaches cf_options_.max_gc_batch_size; if a
// further accepted file is found after that point, the job is flagged so
// that another GC is triggered afterwards.
//
// Returns nullptr when no file was picked, otherwise a BlobGC created with
// punch_hole_gc set to true.
std::unique_ptr<BlobGC> BasicBlobGCPicker::PickPunchHoleGC(
    BlobStorage* blob_storage) {
  std::vector<std::shared_ptr<BlobFileMeta>> blob_files;
  uint64_t batch_size = 0;
  bool stop_picking = false;
  bool maybe_continue_next_time = false;

  for (const auto& gc_score : blob_storage->punch_hole_score()) {
    auto blob_file = blob_storage->FindFile(gc_score.file_number).lock();
    if (!CheckBlobFile(blob_file.get())) {
      // Skip this file if it is being GCed or has already been GCed.
      // `blob_file` may be null at this point, so the file number is taken
      // from the score entry rather than dereferencing the pointer.
      TITAN_LOG_INFO(db_options_.info_log,
                     "Blob file %" PRIu64 " no need punch hole gc",
                     static_cast<uint64_t>(gc_score.file_number));
      continue;
    }
    if (stop_picking) {
      // The batch is already full and one more eligible file exists.
      // TODO: add a batch threshold for punch hole gc.
      maybe_continue_next_time = true;
      break;
    }
    blob_files.emplace_back(blob_file);
    batch_size += blob_file->file_size();
    if (batch_size >= cf_options_.max_gc_batch_size) {
      // Stop picking files for this GC, but keep scanning to find out
      // whether another GC needs to be triggered after this one.
      stop_picking = true;
    }
  }
  if (blob_files.empty()) return nullptr;
  // NOTE(review): this moves cf_options_ out of the picker, so the picker's
  // options are in a moved-from state afterwards - confirm the picker is not
  // reused after a successful pick.
  return std::unique_ptr<BlobGC>(new BlobGC(
      std::move(blob_files), std::move(cf_options_), maybe_continue_next_time,
      /*punch_hole_gc=*/true));
}

bool BasicBlobGCPicker::CheckBlobFile(BlobFileMeta* blob_file) const {
assert(blob_file == nullptr ||
blob_file->file_state() != BlobFileMeta::FileState::kNone);
Expand Down
Loading