Skip to content

Commit 94e65a2

Browse files
Xingbo Wangfacebook-github-bot
authored andcommitted
Add option to validate key during seek in SkipList Memtable (#13902)
Summary: Add a new CF immutable option `paranoid_memory_check_key_checksum_on_seek` that allows additional data integrity validations during seek on SkipList Memtable. When this option is enabled and memtable_protection_bytes_per_key is non zero, skiplist-based memtable will validate the checksum of each key visited during seek operation. The option is opt-in due to performance overhead. This is an enhancement on top of paranoid_memory_checks option. Pull Request resolved: #13902 Test Plan: * new unit test added for paranoid_memory_check_key_checksum_on_seek=true. * existing unit test for paranoid_memory_check_key_checksum_on_seek=false. * enable in stress test. Performance Benchmark: we check for performance regression in read path where data is in memtable only. For each benchmark, the script was run at the same time for main and this PR: ### Memtable-only randomread ops/sec: * Value size = 100 Bytes ``` for B in 0 1 2 4 8; do (for I in $(seq 1 50);do ./db_bench --benchmarks=fillseq,readrandom --write_buffer_size=268435456 --writes=250000 --value_size=100 --num=250000 --reads=500000 --seed=1723056275 --paranoid_memory_check_key_checksum_on_seek=true --memtable_protection_bytes_per_key=$B 2>&1 | grep "readrandom"; done;) | awk '{ t += $5; c++; print } END { print 1.0 * t / c }'; done; ``` 1. Main: 928999 2. PR with paranoid_memory_check_key_checksum_on_seek=false: 930993 (+0.2%) 3. PR with paranoid_memory_check_key_checksum_on_seek=true: 3.1 memtable_protection_bytes_per_key=1: 464577 (-50%) 3.2 memtable_protection_bytes_per_key=2: 470319 (-49%) 3.3 memtable_protection_bytes_per_key=4: 468457 (-50%) 3.4 memtable_protection_bytes_per_key=8: 465061 (-50%) * Value size = 1000 Bytes ``` for B in 0 1 2 4 8; do (for I in $(seq 1 50);do ./db_bench --benchmarks=fillseq,readrandom --write_buffer_size=268435456 --writes=250000 --value_size=1000 --num=250000 --reads=500000 --seed=1723056275 --paranoid_memory_check_key_checksum_on_seek=true --memtable_protection_bytes_per_key=$B 2>&1 | grep "readrandom"; done;) | awk '{ t += $5; c++; print } END { print 1.0 * t / c }'; done; ``` 1. Main: 601321 2. PR with paranoid_memory_check_key_checksum_on_seek=false: 607885 (+1.1%) 3. PR with paranoid_memory_check_key_checksum_on_seek=true: 3.1 memtable_protection_bytes_per_key=1: 185742 (-69%) 3.2 memtable_protection_bytes_per_key=2: 177167 (-71%) 3.3 memtable_protection_bytes_per_key=4: 185908 (-69%) 3.4 memtable_protection_bytes_per_key=8: 183639 (-69%) Reviewed By: pdillinger Differential Revision: D81199245 Pulled By: xingbowang fbshipit-source-id: e3c29552ab92f2c5f360361366a293fa26934913
1 parent 5a1ff2c commit 94e65a2

20 files changed

+320
-68
lines changed

db/db_basic_test.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5087,6 +5087,7 @@ TEST_F(DBBasicTest, DisallowMemtableWrite) {
50875087
Options options_disallow = options_allow;
50885088
options_disallow.disallow_memtable_writes = true;
50895089
options_disallow.paranoid_memory_checks = true;
5090+
options_disallow.memtable_veirfy_per_key_checksum_on_seek = true;
50905091

50915092
DestroyAndReopen(options_allow);
50925093
// CFs allowing and disallowing memtable write

db/db_memtable_test.cc

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,135 @@ TEST_F(DBMemTableTest, ColumnFamilyId) {
339339
}
340340
}
341341

342+
class DBMemTableTestForSeek : public DBMemTableTest,
343+
virtual public ::testing::WithParamInterface<
344+
std::tuple<bool, bool, bool>> {};
345+
346+
TEST_P(DBMemTableTestForSeek, IntegrityChecks) {
347+
// Validate key corruption could be detected during seek.
348+
// We insert many keys into skiplist. Then we corrupt the each key one at a
349+
// time. With memtable_veirfy_per_key_checksum_on_seek enabled, when the
350+
// corrupted key is searched, the checksum of every key visited during the
351+
// seek is validated. It will report data corruption. Otherwise seek returns
352+
// not found.
353+
auto allow_data_in_error = std::get<0>(GetParam());
354+
Options options = CurrentOptions();
355+
options.allow_data_in_errors = allow_data_in_error;
356+
options.paranoid_memory_checks = std::get<1>(GetParam());
357+
options.memtable_veirfy_per_key_checksum_on_seek = std::get<2>(GetParam());
358+
options.memtable_protection_bytes_per_key = 8;
359+
DestroyAndReopen(options);
360+
361+
// capture the data pointer of all of the keys
362+
std::vector<char*> raw_data_pointer;
363+
364+
// Insert enough keys, so memtable would create multiple levels.
365+
auto key_count = 100;
366+
for (int i = 0; i < key_count; i++) {
367+
// The last digit of the key will be corrupted from value 0 to value 5
368+
ASSERT_OK(Put(Key(i * 10), "val0"));
369+
}
370+
371+
ReadOptions rops;
372+
373+
// Iterate all the keys to get key pointers
374+
SyncPoint::GetInstance()->DisableProcessing();
375+
SyncPoint::GetInstance()->SetCallBack("InlineSkipList::Iterator::Next::key",
376+
[&raw_data_pointer](void* key) {
377+
auto p = static_cast<char*>(key);
378+
raw_data_pointer.push_back(p);
379+
});
380+
SyncPoint::GetInstance()->EnableProcessing();
381+
382+
{
383+
std::unique_ptr<Iterator> iter{db_->NewIterator(rops)};
384+
iter->Seek(Key(0));
385+
while (iter->Valid()) {
386+
ASSERT_OK(iter->status());
387+
iter->Next();
388+
}
389+
// check status after valid returned false.
390+
auto status = iter->status();
391+
ASSERT_TRUE(status.ok());
392+
}
393+
394+
SyncPoint::GetInstance()->DisableProcessing();
395+
SyncPoint::GetInstance()->ClearAllCallBacks();
396+
397+
ASSERT_EQ(raw_data_pointer.size(), key_count);
398+
399+
bool enable_key_validation_on_seek =
400+
options.memtable_veirfy_per_key_checksum_on_seek;
401+
402+
// For each key, corrupt it, validate corruption is detected correctly, then
403+
// revert it.
404+
for (int i = 0; i < key_count; i++) {
405+
std::string key_to_corrupt = Key(i * 10);
406+
raw_data_pointer[i][key_to_corrupt.size()] = '5';
407+
408+
auto corrupted_key = key_to_corrupt;
409+
corrupted_key.data()[key_to_corrupt.size() - 1] = '5';
410+
auto corrupted_key_slice =
411+
Slice(corrupted_key.data(), corrupted_key.length());
412+
auto corrupted_key_hex = corrupted_key_slice.ToString(/*hex=*/true);
413+
414+
{
415+
// Test Get API
416+
std::string val;
417+
auto status = db_->Get(rops, key_to_corrupt, &val);
418+
if (enable_key_validation_on_seek) {
419+
ASSERT_TRUE(status.IsCorruption()) << key_to_corrupt;
420+
ASSERT_EQ(
421+
status.ToString().find(corrupted_key_hex) != std::string::npos,
422+
allow_data_in_error)
423+
<< status.ToString() << "\n"
424+
<< corrupted_key_hex;
425+
} else {
426+
ASSERT_TRUE(status.IsNotFound());
427+
}
428+
}
429+
430+
{
431+
// Test MultiGet API
432+
std::vector<std::string> vals;
433+
std::vector<Status> statuses = db_->MultiGet(
434+
rops, {db_->DefaultColumnFamily()}, {key_to_corrupt}, &vals, nullptr);
435+
if (enable_key_validation_on_seek) {
436+
ASSERT_TRUE(statuses[0].IsCorruption());
437+
ASSERT_EQ(
438+
statuses[0].ToString().find(corrupted_key_hex) != std::string::npos,
439+
allow_data_in_error);
440+
} else {
441+
ASSERT_TRUE(statuses[0].IsNotFound());
442+
}
443+
}
444+
445+
{
446+
// Test Iterator Seek API
447+
std::unique_ptr<Iterator> iter{db_->NewIterator(rops)};
448+
ASSERT_OK(iter->status());
449+
iter->Seek(key_to_corrupt);
450+
auto status = iter->status();
451+
if (enable_key_validation_on_seek) {
452+
ASSERT_TRUE(status.IsCorruption());
453+
ASSERT_EQ(
454+
status.ToString().find(corrupted_key_hex) != std::string::npos,
455+
allow_data_in_error);
456+
} else {
457+
ASSERT_FALSE(iter->Valid());
458+
ASSERT_FALSE(status.ok());
459+
}
460+
}
461+
462+
// revert the key corruption.
463+
raw_data_pointer[i][key_to_corrupt.size()] = '0';
464+
}
465+
}
466+
467+
INSTANTIATE_TEST_CASE_P(DBMemTableTestForSeek, DBMemTableTestForSeek,
468+
::testing::Combine(::testing::Bool(), ::testing::Bool(),
469+
::testing::Bool()));
470+
342471
TEST_F(DBMemTableTest, IntegrityChecks) {
343472
// We insert keys key000000, key000001 and key000002 into skiplist at fixed
344473
// height 1 (smallest height). Then we corrupt the second key to aey000001 to

db/memtable.cc

Lines changed: 38 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,9 @@ ImmutableMemTableOptions::ImmutableMemTableOptions(
7070
protection_bytes_per_key(
7171
mutable_cf_options.memtable_protection_bytes_per_key),
7272
allow_data_in_errors(ioptions.allow_data_in_errors),
73-
paranoid_memory_checks(mutable_cf_options.paranoid_memory_checks) {}
73+
paranoid_memory_checks(mutable_cf_options.paranoid_memory_checks),
74+
memtable_veirfy_per_key_checksum_on_seek(
75+
mutable_cf_options.memtable_veirfy_per_key_checksum_on_seek) {}
7476

7577
MemTable::MemTable(const InternalKeyComparator& cmp,
7678
const ImmutableOptions& ioptions,
@@ -115,7 +117,13 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
115117
oldest_key_time_(std::numeric_limits<uint64_t>::max()),
116118
approximate_memory_usage_(0),
117119
memtable_max_range_deletions_(
118-
mutable_cf_options.memtable_max_range_deletions) {
120+
mutable_cf_options.memtable_max_range_deletions),
121+
key_validation_callback_(
122+
(moptions_.protection_bytes_per_key != 0 &&
123+
moptions_.memtable_veirfy_per_key_checksum_on_seek)
124+
? std::bind(&MemTable::ValidateKey, this, std::placeholders::_1,
125+
std::placeholders::_2)
126+
: std::function<Status(const char*, bool)>(nullptr)) {
119127
UpdateFlushState();
120128
// something went wrong if we need to flush before inserting anything
121129
assert(!ShouldScheduleFlush());
@@ -394,7 +402,11 @@ class MemTableIterator : public InternalIterator {
394402
!mem.GetImmutableMemTableOptions()->inplace_update_support),
395403
arena_mode_(arena != nullptr),
396404
paranoid_memory_checks_(mem.moptions_.paranoid_memory_checks),
397-
allow_data_in_error(mem.moptions_.allow_data_in_errors) {
405+
validate_on_seek_(
406+
mem.moptions_.paranoid_memory_checks ||
407+
mem.moptions_.memtable_veirfy_per_key_checksum_on_seek),
408+
allow_data_in_error_(mem.moptions_.allow_data_in_errors),
409+
key_validation_callback_(mem.key_validation_callback_) {
398410
if (kind == kRangeDelEntries) {
399411
iter_ = mem.range_del_table_->GetIterator(arena);
400412
} else if (prefix_extractor_ != nullptr &&
@@ -463,8 +475,10 @@ class MemTableIterator : public InternalIterator {
463475
}
464476
}
465477
}
466-
if (paranoid_memory_checks_) {
467-
status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error);
478+
if (validate_on_seek_) {
479+
status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error_,
480+
paranoid_memory_checks_,
481+
key_validation_callback_);
468482
} else {
469483
iter_->Seek(k, nullptr);
470484
}
@@ -488,8 +502,10 @@ class MemTableIterator : public InternalIterator {
488502
}
489503
}
490504
}
491-
if (paranoid_memory_checks_) {
492-
status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error);
505+
if (validate_on_seek_) {
506+
status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error_,
507+
paranoid_memory_checks_,
508+
key_validation_callback_);
493509
} else {
494510
iter_->Seek(k, nullptr);
495511
}
@@ -518,7 +534,7 @@ class MemTableIterator : public InternalIterator {
518534
PERF_COUNTER_ADD(next_on_memtable_count, 1);
519535
assert(Valid());
520536
if (paranoid_memory_checks_) {
521-
status_ = iter_->NextAndValidate(allow_data_in_error);
537+
status_ = iter_->NextAndValidate(allow_data_in_error_);
522538
} else {
523539
iter_->Next();
524540
TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_);
@@ -540,7 +556,7 @@ class MemTableIterator : public InternalIterator {
540556
PERF_COUNTER_ADD(prev_on_memtable_count, 1);
541557
assert(Valid());
542558
if (paranoid_memory_checks_) {
543-
status_ = iter_->PrevAndValidate(allow_data_in_error);
559+
status_ = iter_->PrevAndValidate(allow_data_in_error_);
544560
} else {
545561
iter_->Prev();
546562
}
@@ -599,7 +615,9 @@ class MemTableIterator : public InternalIterator {
599615
bool value_pinned_;
600616
bool arena_mode_;
601617
const bool paranoid_memory_checks_;
602-
const bool allow_data_in_error;
618+
const bool validate_on_seek_;
619+
const bool allow_data_in_error_;
620+
const std::function<Status(const char*, bool)> key_validation_callback_;
603621

604622
void VerifyEntryChecksum() {
605623
if (protection_bytes_per_key_ > 0 && Valid()) {
@@ -1493,11 +1511,13 @@ void MemTable::GetFromTable(const LookupKey& key,
14931511
saver.allow_data_in_errors = moptions_.allow_data_in_errors;
14941512
saver.protection_bytes_per_key = moptions_.protection_bytes_per_key;
14951513

1496-
if (!moptions_.paranoid_memory_checks) {
1514+
if (!moptions_.paranoid_memory_checks &&
1515+
!moptions_.memtable_veirfy_per_key_checksum_on_seek) {
14971516
table_->Get(key, &saver, SaveValue);
14981517
} else {
1499-
Status check_s = table_->GetAndValidate(key, &saver, SaveValue,
1500-
moptions_.allow_data_in_errors);
1518+
Status check_s = table_->GetAndValidate(
1519+
key, &saver, SaveValue, moptions_.allow_data_in_errors,
1520+
moptions_.paranoid_memory_checks, key_validation_callback_);
15011521
if (check_s.IsCorruption()) {
15021522
*(saver.status) = check_s;
15031523
// Should stop searching the LSM.
@@ -1508,6 +1528,11 @@ void MemTable::GetFromTable(const LookupKey& key,
15081528
*seq = saver.seq;
15091529
}
15101530

1531+
Status MemTable::ValidateKey(const char* key, bool allow_data_in_errors) {
1532+
return VerifyEntryChecksum(key, moptions_.protection_bytes_per_key,
1533+
allow_data_in_errors);
1534+
}
1535+
15111536
void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
15121537
ReadCallback* callback, bool immutable_memtable) {
15131538
// The sequence number is updated synchronously in version_set.h

db/memtable.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ struct ImmutableMemTableOptions {
6464
uint32_t protection_bytes_per_key;
6565
bool allow_data_in_errors;
6666
bool paranoid_memory_checks;
67+
bool memtable_veirfy_per_key_checksum_on_seek;
6768
};
6869

6970
// Batched counters to updated when inserting keys in one write batch.
@@ -826,6 +827,9 @@ class MemTable final : public ReadOnlyMemTable {
826827
uint32_t protection_bytes_per_key,
827828
bool allow_data_in_errors = false);
828829

830+
// Validate the checksum of the key/value pair.
831+
Status ValidateKey(const char* key, bool allow_data_in_errors);
832+
829833
private:
830834
enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED };
831835

@@ -956,6 +960,8 @@ class MemTable final : public ReadOnlyMemTable {
956960
SequenceNumber s, char* checksum_ptr);
957961

958962
void MaybeUpdateNewestUDT(const Slice& user_key);
963+
964+
const std::function<Status(const char*, bool)> key_validation_callback_;
959965
};
960966

961967
const char* EncodeKey(std::string* scratch, const Slice& target);

db_stress_tool/db_stress_common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,7 @@ DECLARE_string(last_level_temperature);
275275
DECLARE_string(default_write_temperature);
276276
DECLARE_string(default_temperature);
277277
DECLARE_bool(paranoid_memory_checks);
278+
DECLARE_bool(memtable_veirfy_per_key_checksum_on_seek);
278279

279280
// Options for transaction dbs.
280281
// Use TransactionDB (a.k.a. Pessimistic Transaction DB)

db_stress_tool/db_stress_gflags.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1484,6 +1484,11 @@ DEFINE_bool(paranoid_memory_checks,
14841484
ROCKSDB_NAMESPACE::Options().paranoid_memory_checks,
14851485
"Sets CF option paranoid_memory_checks.");
14861486

1487+
DEFINE_bool(
1488+
memtable_veirfy_per_key_checksum_on_seek,
1489+
ROCKSDB_NAMESPACE::Options().memtable_veirfy_per_key_checksum_on_seek,
1490+
"Sets CF option memtable_veirfy_per_key_checksum_on_seek.");
1491+
14871492
DEFINE_uint32(commit_bypass_memtable_one_in, 0,
14881493
"If greater than zero, transaction option will set "
14891494
"commit_bypass_memtable to per every N transactions on average.");

db_stress_tool/db_stress_test_base.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4442,6 +4442,8 @@ void InitializeOptionsFromFlags(
44424442
FLAGS_memtable_protection_bytes_per_key;
44434443
options.block_protection_bytes_per_key = FLAGS_block_protection_bytes_per_key;
44444444
options.paranoid_memory_checks = FLAGS_paranoid_memory_checks;
4445+
options.memtable_veirfy_per_key_checksum_on_seek =
4446+
FLAGS_memtable_veirfy_per_key_checksum_on_seek;
44454447

44464448
// Integrated BlobDB
44474449
options.enable_blob_files = FLAGS_enable_blob_files;

include/rocksdb/advanced_options.h

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1101,12 +1101,22 @@ struct AdvancedColumnFamilyOptions {
11011101
uint32_t bottommost_file_compaction_delay = 0;
11021102

11031103
// Enables additional integrity checks during reads/scans.
1104-
// Specifically, for skiplist-based memtables, we verify that keys visited
1105-
// are in order. This is helpful to detect corrupted memtable keys during
1106-
// reads. Enabling this feature incurs a performance overhead due to an
1107-
// additional key comparison during memtable lookup.
1104+
// Specifically, for skiplist-based memtables, key ordering validation could
1105+
// be enabled optionally. This is helpful to detect corrupted memtable keys
1106+
// during reads. Enabling this feature incurs a performance overhead due to
1107+
// additional comparison during memtable lookup.
11081108
bool paranoid_memory_checks = false;
11091109

1110+
// Enables additional integrity checks during seek.
1111+
// Specifically, for skiplist-based memtables, key checksum validation could
1112+
// be enabled during seek optionally. This is helpful to detect corrupted
1113+
// memtable keys during reads. Enabling this feature incurs a performance
1114+
// overhead due to additional key checksum validation during memtable seek
1115+
// operation.
1116+
// This option depends on memtable_protection_bytes_per_key to be non zero.
1117+
// If memtable_protection_bytes_per_key is zero, no validation is performed.
1118+
bool memtable_veirfy_per_key_checksum_on_seek = false;
1119+
11101120
// When an iterator scans this number of invisible entries (tombstones or
11111121
// hidden puts) from the active memtable during a single iterator operation,
11121122
// we will attempt to flush the memtable. Currently only forward scans are

include/rocksdb/memtablerep.h

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#include <stdint.h>
3939
#include <stdlib.h>
4040

41+
#include <functional>
4142
#include <memory>
4243
#include <stdexcept>
4344
#include <unordered_set>
@@ -201,11 +202,12 @@ class MemTableRep {
201202
bool (*callback_func)(void* arg, const char* entry));
202203

203204
// Same as Get() but performs data integrity validation.
204-
virtual Status GetAndValidate(const LookupKey& /* k */,
205-
void* /* callback_args */,
206-
bool (* /* callback_func */)(void* arg,
207-
const char* entry),
208-
bool /*allow_data_in_error*/) {
205+
virtual Status GetAndValidate(
206+
const LookupKey& /* k */, void* /* callback_args */,
207+
bool (* /* callback_func */)(void* arg, const char* entry),
208+
bool /* allow_data_in_error */, bool /* detect_key_out_of_order */,
209+
const std::function<Status(const char*, bool)>&
210+
/* key_validation_callback */) {
209211
return Status::NotSupported("GetAndValidate() not implemented.");
210212
}
211213

@@ -276,9 +278,11 @@ class MemTableRep {
276278
// Seek and perform integrity validations on the skip list.
277279
// Iterator becomes invalid and Corruption is returned if a
278280
// corruption is found.
279-
virtual Status SeekAndValidate(const Slice& /* internal_key */,
280-
const char* /* memtable_key */,
281-
bool /* allow_data_in_errors */) {
281+
virtual Status SeekAndValidate(
282+
const Slice& /* internal_key */, const char* /* memtable_key */,
283+
bool /* allow_data_in_errors */, bool /* detect_key_out_of_order */,
284+
const std::function<Status(const char*, bool)>&
285+
/* key_validation_callback */) {
282286
return Status::NotSupported("SeekAndValidate() not implemented.");
283287
}
284288

include/rocksdb/options.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -623,6 +623,7 @@ struct DBOptions {
623623
// checking for corruption, including
624624
// * paranoid_file_checks
625625
// * paranoid_memory_checks
626+
// * memtable_veirfy_per_key_checksum_on_seek
626627
// * DB::VerifyChecksum()
627628
//
628629
// Default: true

0 commit comments

Comments
 (0)