diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml
index 5ba45260..ab844b16 100644
--- a/.github/workflows/c-cpp.yml
+++ b/.github/workflows/c-cpp.yml
@@ -30,3 +30,7 @@ jobs:
     - name: Test
       run: ctest --test-dir build --output-on-failure
+
+    - name: Setup tmate session
+      if: ${{ failure() }}
+      uses: mxschmitt/action-tmate@v3
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index b06af500..ed7455ce 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -1 +1 @@
-# add_subdirectory(ycsb)
+add_subdirectory(micro-benchmarks)
\ No newline at end of file
diff --git a/benchmarks/micro-benchmarks/CMakeLists.txt b/benchmarks/micro-benchmarks/CMakeLists.txt
new file mode 100644
index 00000000..725d459f
--- /dev/null
+++ b/benchmarks/micro-benchmarks/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_executable(CastBench CastBench.cpp)
+target_link_libraries(CastBench gbench leanstore)
diff --git a/benchmarks/micro-benchmarks/CastBench.cpp b/benchmarks/micro-benchmarks/CastBench.cpp
new file mode 100644
index 00000000..7395a5aa
--- /dev/null
+++ b/benchmarks/micro-benchmarks/CastBench.cpp
@@ -0,0 +1,28 @@
+#include "storage/buffer-manager/BufferFrame.hpp"
+#include "utils/Misc.hpp"
+
+#include <benchmark/benchmark.h>
+
+using namespace leanstore;
+using namespace leanstore::utils;
+using namespace leanstore::storage;
+
+static void BenchU8ToPage(benchmark::State& state) {
+  AlignedBuffer<512> alignedBuffer(FLAGS_page_size * 4);
+  auto buf = alignedBuffer.Get();
+  auto i = 1;
+  for (auto _ : state) {
+    reinterpret_cast<Page*>(&buf[i * FLAGS_page_size])->mGSN = 1;
+  }
+}
+static void BenchPageDirectly(benchmark::State& state) {
+  Page pages[4];
+  auto i = 1;
+  for (auto _ : state) {
+    pages[i].mGSN = 1;
+  }
+}
+
+BENCHMARK(BenchU8ToPage);
+BENCHMARK(BenchPageDirectly);
+BENCHMARK_MAIN();
\ No newline at end of file
diff --git a/benchmarks/ycsb/deterministic.cpp b/benchmarks/ycsb/deterministic.cpp
index 581cdbb4..030daeca 100644
--- a/benchmarks/ycsb/deterministic.cpp
+++ b/benchmarks/ycsb/deterministic.cpp
@@ -124,7 +124,7 @@ int main(int argc, char** argv) {
       cout << calculateMTPS(begin, end, n) << " M tps" << endl;
       // -------------------------------------------------------------------------------------
       const u64 written_pages = db.getBufferManager().consumedPages();
-      const u64 mib = written_pages * PAGE_SIZE / 1024 / 1024;
+      const u64 mib = written_pages * FLAGS_page_size / 1024 / 1024;
       cout << "Inserted volume: (pages, MiB) = (" << written_pages << ", "
            << mib << ")" << endl;
       cout << "------------------------------------------------------------------"
@@ -133,7 +133,7 @@ int main(int argc, char** argv) {
   }
   // -------------------------------------------------------------------------------------
   const u64 DISTANCE =
-      8 * (PAGE_SIZE / (sizeof(YCSBKey) + sizeof(YCSBPayload)));
+      8 * (FLAGS_page_size / (sizeof(YCSBKey) + sizeof(YCSBPayload)));
   cout << setprecision(4);
   // -------------------------------------------------------------------------------------
   cout << "~Transactions" << endl;
diff --git a/benchmarks/ycsb/ycsb.cpp b/benchmarks/ycsb/ycsb.cpp
index df0189ab..7831c168 100644
--- a/benchmarks/ycsb/ycsb.cpp
+++ b/benchmarks/ycsb/ycsb.cpp
@@ -160,7 +160,7 @@ int main(int argc, char** argv) {
      cout << calculateMTPS(begin, end, n) << " M tps" << endl;
       // -------------------------------------------------------------------------------------
       const u64 written_pages = db.getBufferManager().consumedPages();
-      const u64 mib = written_pages * PAGE_SIZE / 1024 / 1024;
+      const u64 mib = written_pages * FLAGS_page_size / 1024 / 1024;
       cout << "Inserted volume: (pages, MiB) = (" << written_pages << ", "
            << mib << ")" << endl;
       cout << "------------------------------------------------------------------"
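CastBench above measures whether reaching a page through `reinterpret_cast` on a raw byte buffer costs anything compared to indexing a typed `Page` array. A minimal self-contained sketch of the same access pattern, using plain `<chrono>` instead of Google Benchmark (the struct and constant names below are illustrative, not part of the patch):

```cpp
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Stand-in for the fixed-size header fields of leanstore's Page.
struct PageHeader {
  uint64_t psn, gsn, btreeId, magic;
};

int main() {
  constexpr size_t kPageSize = 4096;
  auto* buf = static_cast<uint8_t*>(std::aligned_alloc(512, kPageSize * 4));
  auto start = std::chrono::steady_clock::now();
  for (int r = 0; r < 1'000'000; r++) {
    // The "u8 to page" path: address arithmetic plus a cast on every access.
    reinterpret_cast<PageHeader*>(&buf[1 * kPageSize])->gsn = r;
  }
  auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
                std::chrono::steady_clock::now() - start)
                .count();
  std::printf("cast path: %lld ns\n", static_cast<long long>(ns));
  std::free(buf);
  return 0;
}
```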
diff --git a/source/Config.cpp b/source/Config.cpp
index 4a83cb78..f7972b19 100644
--- a/source/Config.cpp
+++ b/source/Config.cpp
@@ -1,6 +1,7 @@
 #include <gflags/gflags.h>
 
 // Buffer management
+DEFINE_uint32(page_size, 4096, "The page size (bytes)"); // 4 KiB
 DEFINE_uint64(buffer_pool_size, 1073741824,
               "The buffer pool size (bytes)"); // 1 GiB
 DEFINE_string(data_dir, "~/.leanstore",
diff --git a/source/Config.hpp b/source/Config.hpp
index 488b2f27..0f97453a 100644
--- a/source/Config.hpp
+++ b/source/Config.hpp
@@ -5,6 +5,7 @@
 #include <gflags/gflags.h>
 
 // Buffer management
+DECLARE_uint32(page_size);
 DECLARE_uint64(buffer_pool_size);
 DECLARE_string(data_dir);
 DECLARE_uint64(db_file_capacity);
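The DEFINE/DECLARE pair turns the page size from a compile-time constant into a runtime knob: the `.cpp` owns the flag, every translation unit that includes `Config.hpp` sees `FLAGS_page_size`. A minimal gflags sketch of the same mechanism (binary and flag usage below are illustrative):

```cpp
#include <gflags/gflags.h>

#include <cstdio>

// Owning definition, normally in one .cpp file; other files use DECLARE_uint32.
DEFINE_uint32(page_size, 4096, "The page size (bytes)");

int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  // e.g. ./demo --page_size=16384
  std::printf("page size = %u bytes\n", FLAGS_page_size);
  return 0;
}
```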
diff --git a/source/concurrency-recovery/HistoryTree.cpp b/source/concurrency-recovery/HistoryTree.cpp
index df645f59..93011e31 100644
--- a/source/concurrency-recovery/HistoryTree.cpp
+++ b/source/concurrency-recovery/HistoryTree.cpp
@@ -6,7 +6,6 @@
 #include "storage/btree/core/BTreeSharedIterator.hpp"
 #include "utils/Misc.hpp"
 
-#include <alloca.h>
 #include
 #include
 #include
@@ -152,10 +151,10 @@ void HistoryTree::purgeVersions(WORKERID workerId, TXID from_tx_id,
                                 TXID to_tx_id, RemoveVersionCallback cb,
                                 [[maybe_unused]] const u64 limit) {
   auto keySize = sizeof(to_tx_id);
-  auto keyBuffer = utils::ArrayOnStack<u8>(PAGE_SIZE);
+  auto keyBuffer = utils::ArrayOnStack<u8>(FLAGS_page_size);
   utils::fold(keyBuffer, from_tx_id);
   Slice key(keyBuffer, keySize);
-  auto payload = utils::ArrayOnStack<u8>(PAGE_SIZE);
+  auto payload = utils::ArrayOnStack<u8>(FLAGS_page_size);
   u16 payload_length;
   volatile u64 removed_versions = 0;
   BTreeLL* volatile btree = remove_btrees[workerId];
@@ -168,7 +167,7 @@ void HistoryTree::purgeVersions(WORKERID workerId, TXID from_tx_id,
       iterator.exitLeafCallback(
          [&](GuardedBufferFrame<BTreeNode>& guardedLeaf) {
             if (guardedLeaf->freeSpaceAfterCompaction() >=
-                BTreeNodeHeader::sUnderFullSize) {
+                BTreeNode::UnderFullSize()) {
               iterator.cleanUpCallback([&, to_find = guardedLeaf.mBf] {
                 JUMPMU_TRY() {
                   btree->tryMerge(*to_find);
@@ -228,12 +227,11 @@ void HistoryTree::purgeVersions(WORKERID workerId, TXID from_tx_id,
             if (guardedLeaf->mLowerFence.length == 0) {
               // Allocate in the stack, freed when the calling function exits.
-              u8* last_key = (u8*)alloca(
-                  guardedLeaf->getFullKeyLen(guardedLeaf->mNumSeps - 1) *
-                  sizeof(u8));
-              guardedLeaf->copyFullKey(guardedLeaf->mNumSeps - 1, last_key);
+              auto lastKey = utils::ArrayOnStack<u8>(
+                  guardedLeaf->getFullKeyLen(guardedLeaf->mNumSeps - 1));
+              guardedLeaf->copyFullKey(guardedLeaf->mNumSeps - 1, lastKey);
               TXID last_key_tx_id;
-              utils::unfold(last_key, last_key_tx_id);
+              utils::unfold(lastKey, last_key_tx_id);
               if (last_key_tx_id > to_tx_id) {
                 should_try = false;
               }
@@ -250,7 +248,7 @@ void HistoryTree::purgeVersions(WORKERID workerId, TXID from_tx_id,
       iterator.exitLeafCallback(
           [&](GuardedBufferFrame<BTreeNode>& guardedLeaf) {
             if (guardedLeaf->freeSpaceAfterCompaction() >=
-                BTreeNodeHeader::sUnderFullSize) {
+                BTreeNode::UnderFullSize()) {
               iterator.cleanUpCallback([&, to_find = guardedLeaf.mBf] {
                 JUMPMU_TRY() {
                   btree->tryMerge(*to_find);
@@ -270,19 +268,18 @@ void HistoryTree::purgeVersions(WORKERID workerId, TXID from_tx_id,
             return;
           }
           // Allocate in the stack, freed when the calling function exits.
-          auto first_key =
-              (u8*)alloca(guardedLeaf->getFullKeyLen(0) * sizeof(u8));
-          guardedLeaf->copyFullKey(0, first_key);
+          auto firstKey =
+              utils::ArrayOnStack<u8>(guardedLeaf->getFullKeyLen(0));
+          guardedLeaf->copyFullKey(0, firstKey);
           TXID first_key_tx_id;
-          utils::unfold(first_key, first_key_tx_id);
+          utils::unfold(firstKey, first_key_tx_id);
 
           // Allocate in the stack, freed when the calling function exits.
-          auto last_key = (u8*)alloca(
-              guardedLeaf->getFullKeyLen(guardedLeaf->mNumSeps - 1) *
-              sizeof(u8));
-          guardedLeaf->copyFullKey(guardedLeaf->mNumSeps - 1, last_key);
+          auto lastKey = utils::ArrayOnStack<u8>(
+              guardedLeaf->getFullKeyLen(guardedLeaf->mNumSeps - 1));
+          guardedLeaf->copyFullKey(guardedLeaf->mNumSeps - 1, lastKey);
           TXID last_key_tx_id;
-          utils::unfold(last_key, last_key_tx_id);
+          utils::unfold(lastKey, last_key_tx_id);
           if (first_key_tx_id >= from_tx_id && to_tx_id >= last_key_tx_id) {
             // Purge the whole page
             removed_versions = removed_versions + guardedLeaf->mNumSeps;
@@ -320,11 +317,11 @@ void HistoryTree::visitRemoveVersions(
   // [from, to]
   BTreeLL* btree = remove_btrees[workerId];
   auto keySize = sizeof(to_tx_id);
-  auto keyBuffer = utils::ArrayOnStack<u8>(PAGE_SIZE);
+  auto keyBuffer = utils::ArrayOnStack<u8>(FLAGS_page_size);
   u64 offset = 0;
   offset += utils::fold(keyBuffer + offset, from_tx_id);
   Slice key(keyBuffer, keySize);
-  auto payload = utils::ArrayOnStack<u8>(PAGE_SIZE);
+  auto payload = utils::ArrayOnStack<u8>(FLAGS_page_size);
   u16 payload_length;
 
   JUMPMU_TRY() {
diff --git a/source/concurrency-recovery/Recovery.cpp b/source/concurrency-recovery/Recovery.cpp
index 8cb6feae..c7b94187 100644
--- a/source/concurrency-recovery/Recovery.cpp
+++ b/source/concurrency-recovery/Recovery.cpp
@@ -7,12 +7,13 @@
 using namespace leanstore::storage;
 using namespace leanstore::storage::btree;
 
 void Recovery::Analysis() {
-  Page page; // asume that each WALEntry is smaller than the page size
-  u8* walEntryPtr = (u8*)&page;
+  // assume that each WALEntry is smaller than the page size
+  utils::AlignedBuffer<512> alignedBuffer(FLAGS_page_size);
+  u8* walEntryPtr = alignedBuffer.Get();
   u64 walEntrySize = sizeof(WALEntry);
   for (auto offset = mWalStartOffset; offset < mWalSize;) {
-    auto bytesRead = ReadWalEntry(offset, walEntrySize, &page);
-    auto walEntry = reinterpret_cast<WALEntry*>(&page);
+    auto bytesRead = ReadWalEntry(offset, walEntrySize, walEntryPtr);
+    auto walEntry = reinterpret_cast<WALEntry*>(walEntryPtr);
     switch (walEntry->type) {
     case WALEntry::TYPE::TX_START: {
       DCHECK_EQ(bytesRead, walEntry->size);
@@ -73,8 +74,9 @@ void Recovery::Analysis() {
 }
 
 void Recovery::Redo() {
-  Page page; // asume that each WALEntry is smaller than the page size
-  u8* walEntryPtr = (u8*)&page;
+  // assume that each WALEntry is smaller than the page size
+  utils::AlignedBuffer<512> alignedBuffer(FLAGS_page_size);
+  u8* walEntryPtr = alignedBuffer.Get();
   u64 walEntrySize = sizeof(WALEntry);
 
   for (auto offset = mWalStartOffset; offset < mWalSize;) {
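Recovery now reads WAL entries into a heap buffer aligned to 512 bytes instead of a stack `Page`. The alignment matters once the WAL file is opened for direct I/O: with `O_DIRECT`, the buffer address (and usually the offset and length) must be multiples of the logical sector size or the read fails with `EINVAL`. A sketch of that constraint in isolation (file path and sizes are illustrative):

```cpp
#include <fcntl.h>
#include <unistd.h>

#include <cstdlib>

int main() {
  int fd = open("/tmp/wal.img", O_RDONLY | O_DIRECT);
  if (fd < 0)
    return 1;
  size_t len = 4096; // a multiple of the 512-byte sector size
  void* buf = std::aligned_alloc(512, len); // sector-aligned address
  ssize_t n = pread(fd, buf, len, 0);       // offset is sector-aligned too
  std::free(buf);
  close(fd);
  return n == static_cast<ssize_t>(len) ? 0 : 1;
}
```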
diff --git a/source/profiling/tables/BMTable.cpp b/source/profiling/tables/BMTable.cpp
index ab454f8f..32821670 100644
--- a/source/profiling/tables/BMTable.cpp
+++ b/source/profiling/tables/BMTable.cpp
@@ -4,29 +4,28 @@
 #include "profiling/counters/PPCounters.hpp"
 #include "profiling/counters/WorkerCounters.hpp"
 #include "utils/ThreadLocalAggregator.hpp"
-// -------------------------------------------------------------------------------------
-// -------------------------------------------------------------------------------------
-// -------------------------------------------------------------------------------------
+
 using leanstore::utils::threadlocal::sum;
+
 namespace leanstore {
 namespace profiling {
-// -------------------------------------------------------------------------------------
+
 BMTable::BMTable(BufferManager& bm) : ProfilingTable(), bm(bm) {
 }
-// -------------------------------------------------------------------------------------
+
 std::string BMTable::getName() {
   return "bm";
 }
-// -------------------------------------------------------------------------------------
+
 void BMTable::open() {
   columns.emplace("key", [](Column& col) { col << 0; });
   columns.emplace("space_usage_gib", [&](Column& col) {
     const double gib =
-        bm.consumedPages() * 1.0 * PAGE_SIZE / 1024.0 / 1024.0 / 1024.0;
+        bm.consumedPages() * 1.0 * FLAGS_page_size / 1024.0 / 1024.0 / 1024.0;
     col << gib;
   });
   columns.emplace("space_usage_kib", [&](Column& col) {
-    const double kib = bm.consumedPages() * 1.0 * PAGE_SIZE / 1024.0;
+    const double kib = bm.consumedPages() * 1.0 * FLAGS_page_size / 1024.0;
     col << kib;
   });
   columns.emplace("consumed_pages",
@@ -65,7 +64,7 @@ void BMTable::open() {
   });
   columns.emplace("evicted_mib", [&](Column& col) {
     col << (sum(PPCounters::pp_counters, &PPCounters::evicted_pages) *
-            EFFECTIVE_PAGE_SIZE / 1024.0 / 1024.0);
+            FLAGS_page_size / 1024.0 / 1024.0);
   });
   columns.emplace("rounds", [&](Column& col) {
     col << (sum(PPCounters::pp_counters, &PPCounters::pp_thread_rounds));
@@ -86,7 +85,7 @@ void BMTable::open() {
   });
   columns.emplace("w_mib", [&](Column& col) {
     col << (sum(PPCounters::pp_counters, &PPCounters::flushed_pages_counter) *
-            EFFECTIVE_PAGE_SIZE / 1024.0 / 1024.0);
+            FLAGS_page_size / 1024.0 / 1024.0);
   });
   // -------------------------------------------------------------------------------------
   columns.emplace("allocate_ops", [&](Column& col) {
@@ -96,7 +95,7 @@ void BMTable::open() {
   columns.emplace("r_mib", [&](Column& col) {
     col << (sum(WorkerCounters::worker_counters,
                 &WorkerCounters::read_operations_counter) *
-            EFFECTIVE_PAGE_SIZE / 1024.0 / 1024.0);
+            FLAGS_page_size / 1024.0 / 1024.0);
   });
 }
diff --git a/source/storage/btree/BTreeLL.cpp b/source/storage/btree/BTreeLL.cpp
index 36dd38c8..571d7068 100644
--- a/source/storage/btree/BTreeLL.cpp
+++ b/source/storage/btree/BTreeLL.cpp
@@ -142,7 +142,7 @@ OP_RESULT BTreeLL::insert(Slice key, Slice val) {
   DCHECK(cr::Worker::my().IsTxStarted());
   cr::activeTX().markAsWrite();
   if (config.mEnableWal) {
-    cr::Worker::my().mLogging.walEnsureEnoughSpace(PAGE_SIZE * 1);
+    cr::Worker::my().mLogging.walEnsureEnoughSpace(FLAGS_page_size * 1);
   }
 
   JUMPMU_TRY() {
@@ -271,11 +271,11 @@ OP_RESULT BTreeLL::append(std::function<void(u8*)> o_key, u16 o_key_length,
     OP_RESULT ret =
         iterator.enoughSpaceInCurrentNode(o_key_length, o_value_length);
     if (ret == OP_RESULT::OK) {
-      auto key_buffer = (u8*)alloca(o_key_length * sizeof(u8));
-      o_key(key_buffer);
+      auto keyBuffer = utils::ArrayOnStack<u8>(o_key_length);
+      o_key(keyBuffer);
       const s32 pos = iterator.mGuardedLeaf->mNumSeps;
       iterator.mGuardedLeaf->insertDoNotCopyPayload(
-          Slice(key_buffer, o_key_length), o_value_length, pos);
+          Slice(keyBuffer, o_key_length), o_value_length, pos);
       iterator.mSlotId = pos;
       o_value(iterator.mutableValue().data());
       iterator.MarkAsDirty();
@@ -292,11 +292,11 @@ OP_RESULT BTreeLL::append(std::function<void(u8*)> o_key, u16 o_key_length,
   while (true) {
     JUMPMU_TRY() {
       BTreeExclusiveIterator iterator(*static_cast<BTreeGeneric*>(this));
-      auto key_buffer = (u8*)alloca(o_key_length * sizeof(u8));
+      auto keyBuffer = utils::ArrayOnStack<u8>(o_key_length);
       for (u64 i = 0; i < o_key_length; i++) {
-        key_buffer[i] = 255;
+        keyBuffer[i] = 255;
       }
-      const Slice key(key_buffer, o_key_length);
+      const Slice key(keyBuffer, o_key_length);
       OP_RESULT ret = iterator.seekToInsert(key);
       RAISE_WHEN(ret == OP_RESULT::DUPLICATE);
       ret = iterator.enoughSpaceInCurrentNode(key, o_value_length);
@@ -304,7 +304,7 @@ OP_RESULT BTreeLL::append(std::function<void(u8*)> o_key, u16 o_key_length,
         iterator.splitForKey(key);
         JUMPMU_CONTINUE;
       }
-      o_key(key_buffer);
+      o_key(keyBuffer);
       iterator.insertInCurrentNode(key, o_value_length);
       o_value(iterator.mutableValue().data());
       iterator.MarkAsDirty();
@@ -333,7 +333,7 @@ OP_RESULT BTreeLL::updateSameSizeInPlace(
     UpdateSameSizeInPlaceDescriptor& update_descriptor) {
   cr::activeTX().markAsWrite();
   if (config.mEnableWal) {
-    cr::Worker::my().mLogging.walEnsureEnoughSpace(PAGE_SIZE * 1);
+    cr::Worker::my().mLogging.walEnsureEnoughSpace(FLAGS_page_size * 1);
   }
 
   JUMPMU_TRY() {
@@ -380,7 +380,7 @@ OP_RESULT BTreeLL::remove(Slice key) {
   cr::activeTX().markAsWrite();
   if (config.mEnableWal) {
-    cr::Worker::my().mLogging.walEnsureEnoughSpace(PAGE_SIZE * 1);
+    cr::Worker::my().mLogging.walEnsureEnoughSpace(FLAGS_page_size * 1);
   }
   JUMPMU_TRY() {
     BTreeExclusiveIterator iterator(*static_cast<BTreeGeneric*>(this));
@@ -414,7 +414,7 @@ OP_RESULT BTreeLL::rangeRemove(Slice startKey, Slice endKey, bool page_wise) {
     BTreeExclusiveIterator iterator(*static_cast<BTreeGeneric*>(this));
     iterator.exitLeafCallback([&](GuardedBufferFrame<BTreeNode>& guardedLeaf) {
       if (guardedLeaf->freeSpaceAfterCompaction() >=
-          BTreeNodeHeader::sUnderFullSize) {
+          BTreeNode::UnderFullSize()) {
        iterator.cleanUpCallback([&, to_find = guardedLeaf.mBf] {
          JUMPMU_TRY() {
            this->tryMerge(*to_find);
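The recurring replacement in this patch swaps raw `(u8*)alloca(...)` calls for `utils::ArrayOnStack<T>(n)`, which (as the Misc.hpp hunk near the end shows) is still `alloca` underneath. The semantics to keep in mind, sketched below: the memory lives until the *calling function* returns, not until the end of the enclosing block, and the wrapper is only safe when it is actually inlined into the caller's frame.

```cpp
#include <alloca.h>

#include <cstring>

void buildKey(const char* src, size_t n) {
  // Freed when buildKey returns -- NOT at the end of an inner block, so
  // allocating like this inside a long-running loop grows the stack.
  char* key = static_cast<char*>(alloca(n));
  std::memcpy(key, src, n);
  // ... use key ...
} // stack space reclaimed here
```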
diff --git a/source/storage/btree/BTreeVI.cpp b/source/storage/btree/BTreeVI.cpp
index d895f8a1..cbd518dc 100644
--- a/source/storage/btree/BTreeVI.cpp
+++ b/source/storage/btree/BTreeVI.cpp
@@ -165,7 +165,7 @@ OP_RESULT BTreeVI::updateSameSizeInPlace(
   });
 
   cr::activeTX().markAsWrite();
-  cr::Worker::my().mLogging.walEnsureEnoughSpace(PAGE_SIZE * 1);
+  cr::Worker::my().mLogging.walEnsureEnoughSpace(FLAGS_page_size * 1);
   OP_RESULT ret;
 
   // 20K instructions more
@@ -361,7 +361,7 @@ OP_RESULT BTreeVI::insert(Slice key, Slice val) {
   });
 
   cr::activeTX().markAsWrite();
-  cr::Worker::my().mLogging.walEnsureEnoughSpace(PAGE_SIZE * 1);
+  cr::Worker::my().mLogging.walEnsureEnoughSpace(FLAGS_page_size * 1);
 
   u16 payloadSize = val.size() + sizeof(ChainedTuple);
   while (true) {
@@ -428,7 +428,7 @@ OP_RESULT BTreeVI::remove(Slice key) {
   // TODO: remove fat tuple
 
   cr::activeTX().markAsWrite();
-  cr::Worker::my().mLogging.walEnsureEnoughSpace(PAGE_SIZE * 1);
+  cr::Worker::my().mLogging.walEnsureEnoughSpace(FLAGS_page_size * 1);
 
   JUMPMU_TRY() {
     BTreeExclusiveIterator iterator(*static_cast<BTreeGeneric*>(this));
diff --git a/source/storage/btree/BTreeVI.hpp b/source/storage/btree/BTreeVI.hpp
index 13958813..994ee576 100644
--- a/source/storage/btree/BTreeVI.hpp
+++ b/source/storage/btree/BTreeVI.hpp
@@ -851,7 +851,7 @@ class BTreeVI : public BTreeLL {
 
 private:
   static inline u64 maxFatTupleLength() {
-    return EFFECTIVE_PAGE_SIZE - 1000;
+    return BTreeNode::Size() - 1000;
   }
 
 public:
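A quick sanity check of the new size helpers, assuming the 4 KiB default page size, a `u64` TREEID, and no struct padding in `Page` (four `u64` header fields, empty flexible payload):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  uint32_t pageSize = 4096;    // FLAGS_page_size default
  uint32_t pageHeader = 4 * 8; // mPSN, mGSN, mBTreeId, mMagicDebuging
  uint16_t nodeSize = pageSize - pageHeader;           // BTreeNode::Size()
  assert(nodeSize == 4064);
  assert(nodeSize - 1000 == 3064);                     // maxFatTupleLength()
  assert(static_cast<uint16_t>(nodeSize * 0.6) == 2438); // UnderFullSize()
  return 0;
}
```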
diff --git a/source/storage/btree/FatTupleDifferentAttributes.cpp b/source/storage/btree/FatTupleDifferentAttributes.cpp
index 4f1eab77..f9951e5c 100644
--- a/source/storage/btree/FatTupleDifferentAttributes.cpp
+++ b/source/storage/btree/FatTupleDifferentAttributes.cpp
@@ -305,8 +305,8 @@ cont : {
   if (fat_tuple->total_space < maxFatTupleLength()) {
     auto new_fat_tuple_length =
         std::min(maxFatTupleLength(), fat_tuple->total_space * 2);
-    auto buffer = utils::ArrayOnStack<u8>(PAGE_SIZE);
-    ENSURE(iterator.value().length() <= PAGE_SIZE);
+    auto buffer = utils::ArrayOnStack<u8>(FLAGS_page_size);
+    ENSURE(iterator.value().length() <= FLAGS_page_size);
     std::memcpy(buffer, iterator.value().data(), iterator.value().length());
     // const bool did_extend = iterator.extendPayload(new_fat_tuple_length);
diff --git a/source/storage/btree/core/BTreeExclusiveIterator.hpp b/source/storage/btree/core/BTreeExclusiveIterator.hpp
index cf17679b..b30f2448 100644
--- a/source/storage/btree/core/BTreeExclusiveIterator.hpp
+++ b/source/storage/btree/core/BTreeExclusiveIterator.hpp
@@ -137,7 +137,7 @@ class BTreeExclusiveIterator : public BTreePessimisticIterator {
   }
 
   bool extendPayload(const u16 new_length) {
-    if (new_length >= EFFECTIVE_PAGE_SIZE) {
+    if (new_length >= BTreeNode::Size()) {
       return false;
     }
     ENSURE(mSlotId != -1 && new_length > mGuardedLeaf->ValSize(mSlotId));
@@ -240,7 +240,7 @@ class BTreeExclusiveIterator : public BTreePessimisticIterator {
   // Returns true if it tried to merge
   bool mergeIfNeeded() {
     if (mGuardedLeaf->freeSpaceAfterCompaction() >=
-        BTreeNodeHeader::sUnderFullSize) {
+        BTreeNode::UnderFullSize()) {
       mGuardedLeaf.unlock();
       mSlotId = -1;
       JUMPMU_TRY() {
diff --git a/source/storage/btree/core/BTreeGeneric.cpp b/source/storage/btree/core/BTreeGeneric.cpp
index 7673ab6c..f203f224 100644
--- a/source/storage/btree/core/BTreeGeneric.cpp
+++ b/source/storage/btree/core/BTreeGeneric.cpp
@@ -50,7 +50,7 @@ void BTreeGeneric::Init(TREEID btreeId, Config config) {
 }
 
 void BTreeGeneric::trySplit(BufferFrame& toSplit, s16 favoredSplitPos) {
-  cr::Worker::my().mLogging.walEnsureEnoughSpace(PAGE_SIZE * 1);
+  cr::Worker::my().mLogging.walEnsureEnoughSpace(FLAGS_page_size * 1);
   auto parentHandler = findParentEager(*this, toSplit);
   auto guardedParent = parentHandler.GetGuardedParent();
   auto guardedChild = GuardedBufferFrame<BTreeNode>(
@@ -242,8 +242,8 @@ bool BTreeGeneric::tryMerge(BufferFrame& to_merge, bool swizzleSibling) {
   GuardedBufferFrame<BTreeNode> guardedChild = GuardedBufferFrame<BTreeNode>(
       guardedParent, parentHandler.mChildSwip.CastTo<BTreeNode>());
   int posInParent = parentHandler.mPosInParent;
-  if (isMetaNode(guardedParent) || guardedChild->freeSpaceAfterCompaction() <
-                                       BTreeNodeHeader::sUnderFullSize) {
+  if (isMetaNode(guardedParent) ||
+      guardedChild->freeSpaceAfterCompaction() < BTreeNode::UnderFullSize()) {
     guardedParent.unlock();
     guardedChild.unlock();
     return false;
@@ -348,7 +348,7 @@ bool BTreeGeneric::tryMerge(BufferFrame& to_merge, bool swizzleSibling) {
   GuardedBufferFrame<BTreeNode> guardedMeta(mMetaNodeSwip);
   if (!isMetaNode(guardedParent) &&
       guardedParent->freeSpaceAfterCompaction() >=
-          BTreeNode::sUnderFullSize) {
+          BTreeNode::UnderFullSize()) {
     if (tryMerge(*guardedParent.mBf, true)) {
       WorkerCounters::myCounters().dt_merge_parent_succ[mTreeId]++;
     } else {
@@ -378,7 +378,7 @@ s16 BTreeGeneric::mergeLeftIntoRight(
     bool full_merge_or_nothing) {
   // TODO: corner cases: new upper fence is larger than the older one.
   u32 space_upper_bound = xGuardedLeft->mergeSpaceUpperBound(xGuardedRight);
-  if (space_upper_bound <= EFFECTIVE_PAGE_SIZE) {
+  if (space_upper_bound <= BTreeNode::Size()) {
     // Do a full merge TODO: threshold
     bool succ = xGuardedLeft->merge(lhsSlotId, xGuardedParent, xGuardedRight);
     static_cast<void>(succ);
@@ -399,7 +399,7 @@ s16 BTreeGeneric::mergeLeftIntoRight(
                           xGuardedLeft->ValSize(s_i);
     if (space_upper_bound + (xGuardedLeft->getFullKeyLen(s_i) -
                              xGuardedRight->mLowerFence.length) <
-        EFFECTIVE_PAGE_SIZE * 1.0) {
+        BTreeNode::Size() * 1.0) {
       till_slot_id = s_i + 1;
       break;
     }
@@ -409,7 +409,7 @@ s16 BTreeGeneric::mergeLeftIntoRight(
   assert((space_upper_bound + (xGuardedLeft->getFullKeyLen(till_slot_id - 1) -
                                xGuardedRight->mLowerFence.length)) <
-         EFFECTIVE_PAGE_SIZE * 1.0);
+         BTreeNode::Size() * 1.0);
   assert(till_slot_id > 0);
 
   u16 copy_from_count = xGuardedLeft->mNumSeps - till_slot_id;
@@ -424,14 +424,16 @@ s16 BTreeGeneric::mergeLeftIntoRight(
   }
   {
-    BTreeNode tmp(true);
-    tmp.setFences(Slice(newLeftUpperFence, newLeftUpperFenceSize),
-                  xGuardedRight->GetUpperFence());
+    auto nodeBuf = utils::ArrayOnStack<u8>(BTreeNode::Size());
+    auto tmp = BTreeNode::Init(nodeBuf, true);
 
-    xGuardedLeft->copyKeyValueRange(&tmp, 0, till_slot_id, copy_from_count);
-    xGuardedRight->copyKeyValueRange(&tmp, copy_from_count, 0,
+    tmp->setFences(Slice(newLeftUpperFence, newLeftUpperFenceSize),
+                   xGuardedRight->GetUpperFence());
+
+    xGuardedLeft->copyKeyValueRange(tmp, 0, till_slot_id, copy_from_count);
+    xGuardedRight->copyKeyValueRange(tmp, copy_from_count, 0,
                                      xGuardedRight->mNumSeps);
-    memcpy(xGuardedRight.GetPagePayloadPtr(), &tmp, sizeof(BTreeNode));
+    memcpy(xGuardedRight.GetPagePayloadPtr(), tmp, sizeof(BTreeNode));
     xGuardedRight->makeHint();
 
     // Nothing to do for the right node's separator
@@ -439,13 +441,15 @@ s16 BTreeGeneric::mergeLeftIntoRight(
     assert(xGuardedRight->compareKeyWithBoundaries(
                Slice(newLeftUpperFence, newLeftUpperFenceSize)) == 1);
   }
   {
-    BTreeNode tmp(true);
-    tmp.setFences(xGuardedLeft->GetLowerFence(),
-                  Slice(newLeftUpperFence, newLeftUpperFenceSize));
+    auto nodeBuf = utils::ArrayOnStack<u8>(BTreeNode::Size());
+    auto tmp = BTreeNode::Init(nodeBuf, true);
+
+    tmp->setFences(xGuardedLeft->GetLowerFence(),
+                   Slice(newLeftUpperFence, newLeftUpperFenceSize));
     // -------------------------------------------------------------------------------------
-    xGuardedLeft->copyKeyValueRange(&tmp, 0, 0,
+    xGuardedLeft->copyKeyValueRange(tmp, 0, 0,
                                     xGuardedLeft->mNumSeps - copy_from_count);
-    memcpy(xGuardedLeft.GetPagePayloadPtr(), &tmp, sizeof(BTreeNode));
+    memcpy(xGuardedLeft.GetPagePayloadPtr(), tmp, sizeof(BTreeNode));
     xGuardedLeft->makeHint();
     // -------------------------------------------------------------------------------------
     assert(xGuardedLeft->compareKeyWithBoundaries(
@@ -642,7 +646,7 @@ void BTreeGeneric::printInfos(uint64_t totalSize) {
       guardedParent->mRightMostChildSwip);
   uint64_t cnt = countPages();
   cout << "nodes:" << cnt << " innerNodes:" << countInner()
-       << " space:" << (cnt * EFFECTIVE_PAGE_SIZE) / (float)totalSize
+       << " space:" << (cnt * BTreeNode::Size()) / (float)totalSize
        << " height:" << mHeight << " rootCnt:" << guardedRightMost->mNumSeps
       << " bytesFree:" << bytesFree() << endl;
 }
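The `ArrayOnStack<u8>(BTreeNode::Size())` plus `BTreeNode::Init(buf, ...)` pattern above replaces the old stack-local `BTreeNode tmp(...)` objects: once the slot array became a flexible array member, a node no longer has a usable compile-time size, so temporaries must be placement-constructed into an explicitly sized buffer. A toy sketch of the idiom (the `Node` type is illustrative; flexible array members are a GCC/Clang extension, which the patch itself relies on for `BTreeNode::slot` and `Page::mPayload`):

```cpp
#include <cstdint>
#include <new>

struct Node {
  uint16_t numSlots;
  uint16_t slot[]; // sized by the surrounding buffer, not by sizeof(Node)
};

int main() {
  alignas(Node) uint8_t buf[4096]; // caller decides the real node size
  Node* node = new (buf) Node;     // placement new: no heap allocation
  node->numSlots = 0;
  node->slot[node->numSlots++] = 42;
  // Node is trivially destructible, so nothing to clean up when buf dies.
  return node->slot[0] == 42 ? 0 : 1;
}
```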
diff --git a/source/storage/btree/core/BTreeGeneric.hpp b/source/storage/btree/core/BTreeGeneric.hpp
index a988dd61..c684447e 100644
--- a/source/storage/btree/core/BTreeGeneric.hpp
+++ b/source/storage/btree/core/BTreeGeneric.hpp
@@ -391,7 +391,7 @@ inline SpaceCheckResult BTreeGeneric::checkSpaceUtilization(BufferFrame& bf) {
 }
 
 inline void BTreeGeneric::Checkpoint(BufferFrame& bf, void* dest) {
-  std::memcpy(dest, &bf.page, PAGE_SIZE);
+  std::memcpy(dest, &bf.page, FLAGS_page_size);
   auto destPage = reinterpret_cast<Page*>(dest);
   auto destNode = reinterpret_cast<BTreeNode*>(destPage->mPayload);
diff --git a/source/storage/btree/core/BTreeNode.cpp b/source/storage/btree/core/BTreeNode.cpp
index 2b37001c..d8d09c26 100644
--- a/source/storage/btree/core/BTreeNode.cpp
+++ b/source/storage/btree/core/BTreeNode.cpp
@@ -6,8 +6,6 @@
 #include
 
-#include <alloca.h>
-
 namespace leanstore {
 namespace storage {
 namespace btree {
@@ -105,11 +103,14 @@ s32 BTreeNode::insert(Slice key, Slice val) {
 
 void BTreeNode::compactify() {
   u16 should = freeSpaceAfterCompaction();
   static_cast<void>(should);
-  BTreeNode tmp(mIsLeaf);
-  tmp.setFences(GetLowerFence(), GetUpperFence());
-  copyKeyValueRange(&tmp, 0, 0, mNumSeps);
-  tmp.mRightMostChildSwip = mRightMostChildSwip;
-  memcpy(reinterpret_cast<u8*>(this), &tmp, sizeof(BTreeNode));
+
+  auto tmpNodeBuf = utils::ArrayOnStack<u8>(BTreeNode::Size());
+  auto tmp = BTreeNode::Init(tmpNodeBuf, mIsLeaf);
+
+  tmp->setFences(GetLowerFence(), GetUpperFence());
+  copyKeyValueRange(tmp, 0, 0, mNumSeps);
+  tmp->mRightMostChildSwip = mRightMostChildSwip;
+  memcpy(reinterpret_cast<u8*>(this), tmp, sizeof(BTreeNode));
   makeHint();
   assert(freeSpace() == should);
 }
@@ -117,11 +118,14 @@ void BTreeNode::compactify() {
 u32 BTreeNode::mergeSpaceUpperBound(
     ExclusiveGuardedBufferFrame<BTreeNode>& xGuardedRight) {
   DCHECK(xGuardedRight->mIsLeaf);
-  BTreeNode tmp(true);
-  tmp.setFences(GetLowerFence(), xGuardedRight->GetUpperFence());
-  u32 leftGrow = (mPrefixSize - tmp.mPrefixSize) * mNumSeps;
+
+  auto tmpNodeBuf = utils::ArrayOnStack<u8>(BTreeNode::Size());
+  auto tmp = BTreeNode::Init(tmpNodeBuf, true);
+
+  tmp->setFences(GetLowerFence(), xGuardedRight->GetUpperFence());
+  u32 leftGrow = (mPrefixSize - tmp->mPrefixSize) * mNumSeps;
   u32 rightGrow =
-      (xGuardedRight->mPrefixSize - tmp.mPrefixSize) * xGuardedRight->mNumSeps;
+      (xGuardedRight->mPrefixSize - tmp->mPrefixSize) * xGuardedRight->mNumSeps;
   u32 spaceUpperBound =
       mSpaceUsed + xGuardedRight->mSpaceUsed +
       (reinterpret_cast<u8*>(slot + mNumSeps + xGuardedRight->mNumSeps) -
       RawPtr()) +
@@ -142,36 +146,41 @@ bool BTreeNode::merge(u16 slotId,
   if (mIsLeaf) {
     assert(xGuardedRight->mIsLeaf);
     assert(xGuardedParent->isInner());
-    BTreeNode tmp(mIsLeaf);
-    tmp.setFences(GetLowerFence(), xGuardedRight->GetUpperFence());
-    u16 leftGrow = (mPrefixSize - tmp.mPrefixSize) * mNumSeps;
-    u16 rightGrow = (xGuardedRight->mPrefixSize - tmp.mPrefixSize) *
+
+    auto tmpNodeBuf = utils::ArrayOnStack<u8>(BTreeNode::Size());
+    auto tmp = BTreeNode::Init(tmpNodeBuf, true);
+
+    tmp->setFences(GetLowerFence(), xGuardedRight->GetUpperFence());
+    u16 leftGrow = (mPrefixSize - tmp->mPrefixSize) * mNumSeps;
+    u16 rightGrow = (xGuardedRight->mPrefixSize - tmp->mPrefixSize) *
                     xGuardedRight->mNumSeps;
     u16 spaceUpperBound =
         mSpaceUsed + xGuardedRight->mSpaceUsed +
         (reinterpret_cast<u8*>(slot + mNumSeps + xGuardedRight->mNumSeps) -
         RawPtr()) +
         leftGrow + rightGrow;
-    if (spaceUpperBound > EFFECTIVE_PAGE_SIZE) {
+    if (spaceUpperBound > BTreeNode::Size()) {
       return false;
     }
-    copyKeyValueRange(&tmp, 0, 0, mNumSeps);
-    xGuardedRight->copyKeyValueRange(&tmp, mNumSeps, 0,
-                                     xGuardedRight->mNumSeps);
+    copyKeyValueRange(tmp, 0, 0, mNumSeps);
+    xGuardedRight->copyKeyValueRange(tmp, mNumSeps, 0, xGuardedRight->mNumSeps);
     xGuardedParent->removeSlot(slotId);
     // -------------------------------------------------------------------------------------
     xGuardedRight->mHasGarbage |= mHasGarbage;
     // -------------------------------------------------------------------------------------
-    memcpy(xGuardedRight.GetPagePayloadPtr(), &tmp, sizeof(BTreeNode));
+    memcpy(xGuardedRight.GetPagePayloadPtr(), tmp, sizeof(BTreeNode));
     xGuardedRight->makeHint();
     return true;
   } else {
     // Inner node
     assert(!xGuardedRight->mIsLeaf);
     assert(xGuardedParent->isInner());
-    BTreeNode tmp(mIsLeaf);
-    tmp.setFences(GetLowerFence(), xGuardedRight->GetUpperFence());
-    u16 leftGrow = (mPrefixSize - tmp.mPrefixSize) * mNumSeps;
-    u16 rightGrow = (xGuardedRight->mPrefixSize - tmp.mPrefixSize) *
+
+    auto tmpNodeBuf = utils::ArrayOnStack<u8>(BTreeNode::Size());
+    auto tmp = BTreeNode::Init(tmpNodeBuf, mIsLeaf);
+
+    tmp->setFences(GetLowerFence(), xGuardedRight->GetUpperFence());
+    u16 leftGrow = (mPrefixSize - tmp->mPrefixSize) * mNumSeps;
+    u16 rightGrow = (xGuardedRight->mPrefixSize - tmp->mPrefixSize) *
                     xGuardedRight->mNumSeps;
     u16 extraKeyLength = xGuardedParent->getFullKeyLen(slotId);
     u16 spaceUpperBound =
@@ -179,23 +188,23 @@ bool BTreeNode::merge(u16 slotId,
         mSpaceUsed + xGuardedRight->mSpaceUsed +
         (reinterpret_cast<u8*>(slot + mNumSeps + xGuardedRight->mNumSeps) -
         RawPtr()) +
         leftGrow + rightGrow +
-        spaceNeeded(extraKeyLength, sizeof(SwipType), tmp.mPrefixSize);
-    if (spaceUpperBound > EFFECTIVE_PAGE_SIZE)
+        spaceNeeded(extraKeyLength, sizeof(SwipType), tmp->mPrefixSize);
+    if (spaceUpperBound > BTreeNode::Size())
       return false;
-    copyKeyValueRange(&tmp, 0, 0, mNumSeps);
+    copyKeyValueRange(tmp, 0, 0, mNumSeps);
     // Allocate in the stack, freed when the calling function exits.
-    u8* extraKey = (u8*)alloca(extraKeyLength * sizeof(u8));
+    auto extraKey = utils::ArrayOnStack<u8>(extraKeyLength);
     xGuardedParent->copyFullKey(slotId, extraKey);
-    tmp.storeKeyValue(
+    tmp->storeKeyValue(
         mNumSeps, Slice(extraKey, extraKeyLength),
         Slice(reinterpret_cast<u8*>(&mRightMostChildSwip), sizeof(SwipType)));
-    tmp.mNumSeps++;
-    xGuardedRight->copyKeyValueRange(&tmp, tmp.mNumSeps, 0,
+    tmp->mNumSeps++;
+    xGuardedRight->copyKeyValueRange(tmp, tmp->mNumSeps, 0,
                                      xGuardedRight->mNumSeps);
     xGuardedParent->removeSlot(slotId);
-    tmp.mRightMostChildSwip = xGuardedRight->mRightMostChildSwip;
-    tmp.makeHint();
-    memcpy(xGuardedRight.GetPagePayloadPtr(), &tmp, sizeof(BTreeNode));
+    tmp->mRightMostChildSwip = xGuardedRight->mRightMostChildSwip;
+    tmp->makeHint();
+    memcpy(xGuardedRight.GetPagePayloadPtr(), tmp, sizeof(BTreeNode));
     return true;
   }
 }
@@ -255,7 +264,7 @@ void BTreeNode::copyKeyValueRange(BTreeNode* dst, u16 dstSlot, u16 srcSlot,
 
 void BTreeNode::copyKeyValue(u16 srcSlot, BTreeNode* dst, u16 dstSlot) {
   u16 fullLength = getFullKeyLen(srcSlot);
-  auto key = (u8*)alloca(fullLength * sizeof(u8));
+  auto key = utils::ArrayOnStack<u8>(fullLength);
   copyFullKey(srcSlot, key);
   dst->storeKeyValue(dstSlot, Slice(key, fullLength), Value(srcSlot));
 }
@@ -392,12 +401,14 @@ Swip<BufferFrame>& BTreeNode::lookupInner(Slice key) {
 void BTreeNode::split(ExclusiveGuardedBufferFrame<BTreeNode>& xGuardedParent,
                       ExclusiveGuardedBufferFrame<BTreeNode>& xGuardedLeft,
                       u16 sepSlot, u8* sepKey, u16 sepLength) {
-  DCHECK(sepSlot < (EFFECTIVE_PAGE_SIZE / sizeof(SwipType)));
+  DCHECK(sepSlot < (BTreeNode::Size() / sizeof(SwipType)));
   DCHECK(xGuardedParent->canInsert(sepLength, sizeof(SwipType)));
   xGuardedLeft->setFences(GetLowerFence(), Slice(sepKey, sepLength));
-  BTreeNode tmp(mIsLeaf);
-  BTreeNode* nodeRight = &tmp;
+
+  auto tmpNodeBuf = utils::ArrayOnStack<u8>(BTreeNode::Size());
+  auto nodeRight = BTreeNode::Init(tmpNodeBuf, mIsLeaf);
+
   nodeRight->setFences(Slice(sepKey, sepLength), GetUpperFence());
   auto swip = xGuardedLeft.swip();
   xGuardedParent->insert(Slice(sepKey, sepLength),
@@ -440,7 +451,7 @@ bool BTreeNode::remove(Slice key) {
 
 void BTreeNode::reset() {
   mSpaceUsed = mUpperFence.length + mLowerFence.length;
-  mDataOffset = EFFECTIVE_PAGE_SIZE - mSpaceUsed;
+  mDataOffset = BTreeNode::Size() - mSpaceUsed;
   mNumSeps = 0;
 }
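The `leftGrow`/`rightGrow` terms in `merge()` account for prefix compression: after the merge, both nodes share the prefix of the widened fence range, so every key must be re-stored with the bytes the shorter prefix no longer covers. A worked example with illustrative numbers:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  uint16_t leftPrefix = 8, rightPrefix = 6, mergedPrefix = 4;
  uint16_t leftSeps = 100, rightSeps = 80;
  // Each key grows by (oldPrefix - mergedPrefix) bytes when re-stored.
  uint32_t leftGrow = (leftPrefix - mergedPrefix) * leftSeps;    // 400 bytes
  uint32_t rightGrow = (rightPrefix - mergedPrefix) * rightSeps; // 160 bytes
  std::printf("extra space needed: %u bytes\n", leftGrow + rightGrow);
  return 0;
}
```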
diff --git a/source/storage/btree/core/BTreeNode.hpp b/source/storage/btree/core/BTreeNode.hpp
index bd0137c1..005a0bcb 100644
--- a/source/storage/btree/core/BTreeNode.hpp
+++ b/source/storage/btree/core/BTreeNode.hpp
@@ -10,7 +10,6 @@
 #include "rapidjson/document.h"
 
 #include
-#include <alloca.h>
 #include
 #include
 #include
@@ -42,8 +41,6 @@ static inline u8 swap(u8 x) {
 
 class BTreeNodeHeader {
 public:
-  static const u16 sUnderFullSize = EFFECTIVE_PAGE_SIZE * 0.6;
-  static const u16 sKWayMergeThreshold = EFFECTIVE_PAGE_SIZE * 0.45;
   static const u16 sHintCount = 16;
 
   struct SeparatorInfo {
@@ -79,7 +76,14 @@ class BTreeNodeHeader {
   /// @note !!! does not include the header, but includes fences !!!
   u16 mSpaceUsed = 0;
 
-  u16 mDataOffset = static_cast<u16>(EFFECTIVE_PAGE_SIZE);
+  /// Data offset of the current slot in the BTreeNode. The BTreeNode is
+  /// organized as follows:
+  ///
+  ///   | BTreeNodeHeader | info of slot 0..N | ... | data of slot N..0 |
+  ///
+  /// It's initialized to the total size of the BTree node, then reduced and
+  /// assigned to each new slot as the number of slots grows.
+  u16 mDataOffset;
 
   u16 mPrefixSize = 0;
 
@@ -89,7 +93,7 @@ class BTreeNodeHeader {
   bool mHasGarbage = false;
 
 public:
-  BTreeNodeHeader(bool isLeaf) : mIsLeaf(isLeaf) {
+  BTreeNodeHeader(bool isLeaf, u16 size) : mIsLeaf(isLeaf), mDataOffset(size) {
   }
 
   ~BTreeNodeHeader() {
@@ -145,20 +149,19 @@ class BTreeNode : public BTreeNodeHeader {
   };
 
 public:
-  // Just to make sizeof(BTreeNode) == EFFECTIVE_PAGE_SIZE
-  static constexpr u64 sSlotCapacity =
-      (EFFECTIVE_PAGE_SIZE - sizeof(BTreeNodeHeader)) / (sizeof(Slot));
-
-  static constexpr u64 sLeftSpaceToWaste =
-      (EFFECTIVE_PAGE_SIZE - sizeof(BTreeNodeHeader)) % (sizeof(Slot));
-
-public:
-  Slot slot[sSlotCapacity];
-
-  u8 padding[sLeftSpaceToWaste];
+  Slot slot[];
 
 public:
-  BTreeNode(bool isLeaf) : BTreeNodeHeader(isLeaf) {
+  /// Creates a BTreeNode. Since BTreeNode creation and usage are
+  /// performance-critical, please use ExclusiveGuardedBufferFrame::InitPayload()
+  /// or BTreeNode::Init() to construct a BTreeNode on an existing buffer with
+  /// at least BTreeNode::Size() bytes:
+  /// 1. ExclusiveGuardedBufferFrame::InitPayload() creates a BTreeNode on the
+  ///    holding BufferFrame.
+  /// 2. BTreeNode::Init() creates a BTreeNode on the provided buffer. The
+  ///    size of the underlying buffer needed to store a BTreeNode can be
+  ///    obtained through BTreeNode::Size().
+  BTreeNode(bool isLeaf) : BTreeNodeHeader(isLeaf, BTreeNode::Size()) {
   }
 
 public:
@@ -167,12 +170,12 @@ class BTreeNode : public BTreeNodeHeader {
   }
 
   u16 freeSpaceAfterCompaction() {
-    return EFFECTIVE_PAGE_SIZE -
+    return BTreeNode::Size() -
            (reinterpret_cast<u8*>(slot + mNumSeps) - RawPtr()) - mSpaceUsed;
   }
 
   double fillFactorAfterCompaction() {
-    return (1 - (freeSpaceAfterCompaction() * 1.0 / EFFECTIVE_PAGE_SIZE));
+    return (1 - (freeSpaceAfterCompaction() * 1.0 / BTreeNode::Size()));
   }
 
   bool hasEnoughSpaceFor(u32 space_needed) {
@@ -257,7 +260,7 @@ class BTreeNode : public BTreeNodeHeader {
     const u16 old_total_length = keySizeWithoutPrefix + ValSize(slot_id);
     const u16 new_total_length = keySizeWithoutPrefix + new_payload_length;
 
     // Allocate a block that will be freed when the calling function exits.
-    u8* key = (u8*)alloca(keySizeWithoutPrefix * sizeof(u8));
+    auto key = utils::ArrayOnStack<u8>(keySizeWithoutPrefix);
     std::memcpy(key, KeyDataWithoutPrefix(slot_id), keySizeWithoutPrefix);
 
     mSpaceUsed -= old_total_length;
     if (mDataOffset == slot[slot_id].offset && 0) {
@@ -611,10 +614,21 @@ class BTreeNode : public BTreeNodeHeader {
     // fallback to the normal compare
     return shrinkSearchRange(lower, upper, key);
   }
-};
 
-static_assert(sizeof(BTreeNode) == EFFECTIVE_PAGE_SIZE,
-              "BTreeNode must be equal to one page");
+public:
+  template <typename... Args>
+  inline static BTreeNode* Init(void* addr, Args&&... args) {
+    return new (addr) BTreeNode(std::forward<Args>(args)...);
+  }
+
+  inline static u16 Size() {
+    return static_cast<u16>(FLAGS_page_size - sizeof(Page));
+  }
+
+  inline static u16 UnderFullSize() {
+    return BTreeNode::Size() * 0.6;
+  }
+};
 
 } // namespace btree
 } // namespace storage
diff --git a/source/storage/btree/core/BTreePessimisticIterator.hpp b/source/storage/btree/core/BTreePessimisticIterator.hpp
index 1f4d5bb8..726ce4e7 100644
--- a/source/storage/btree/core/BTreePessimisticIterator.hpp
+++ b/source/storage/btree/core/BTreePessimisticIterator.hpp
@@ -128,7 +128,7 @@ class BTreePessimisticIterator : public BTreePessimisticIteratorInterface {
 public:
   BTreePessimisticIterator(BTreeGeneric& tree,
                            const LATCH_FALLBACK_MODE mode = LATCH_FALLBACK_MODE::SHARED)
-      : mBTree(tree), mode(mode), mBuffer(PAGE_SIZE, 0) {
+      : mBTree(tree), mode(mode), mBuffer(FLAGS_page_size, 0) {
   }
 
   void enterLeafCallback(LeafCallback cb) {
diff --git a/source/storage/buffer-manager/AsyncWriteBuffer.cpp b/source/storage/buffer-manager/AsyncWriteBuffer.cpp
index fe9ccbde..4078e5e2 100644
--- a/source/storage/buffer-manager/AsyncWriteBuffer.cpp
+++ b/source/storage/buffer-manager/AsyncWriteBuffer.cpp
@@ -13,8 +13,8 @@ namespace leanstore {
 namespace storage {
 
 AsyncWriteBuffer::AsyncWriteBuffer(int fd, u64 page_size, u64 batch_max_size)
-    : fd(fd), page_size(page_size), batch_max_size(batch_max_size) {
-  write_buffer = make_unique<Page[]>(batch_max_size);
+    : fd(fd), page_size(page_size), batch_max_size(batch_max_size),
+      mWriteBuffer(FLAGS_page_size * batch_max_size) {
   write_buffer_commands = make_unique<WriteCommand[]>(batch_max_size);
   iocbs = make_unique<iocb[]>(batch_max_size);
   iocbs_ptr = make_unique<iocb*[]>(batch_max_size);
@@ -59,8 +59,8 @@ void AsyncWriteBuffer::AddToIOBatch(BufferFrame& bf, PID pageId) {
   write_buffer_commands[slot].bf = &bf;
   write_buffer_commands[slot].mPageId = pageId;
   bf.page.mMagicDebuging = pageId;
-  std::memcpy(&write_buffer[slot], &bf.page, page_size);
-  void* write_buffer_slot_ptr = &write_buffer[slot];
+  void* write_buffer_slot_ptr = GetWriteBuffer(slot);
+  std::memcpy(write_buffer_slot_ptr, &bf.page, page_size);
   io_prep_pwrite(/* iocb */ &iocbs[slot], /* fd */ fd,
                  /* buf */ write_buffer_slot_ptr, /* count */ page_size,
                  /* offset */ page_size * pageId);
@@ -96,11 +96,12 @@ void AsyncWriteBuffer::IterateFlushedBfs(
     std::function<void(BufferFrame&, u64)> callback, u64 numFlushedBfs) {
   for (u64 i = 0; i < numFlushedBfs; i++) {
     const auto slot =
-        (u64(events[i].data) - u64(write_buffer.get())) / page_size;
+        (u64(events[i].data) - u64(mWriteBuffer.Get())) / page_size;
 
     DCHECK(events[i].res == page_size);
     explainIfNot(events[i].res2 == 0);
-    auto flushedLSN = write_buffer[slot].mPSN;
+    auto flushedPage = reinterpret_cast<Page*>(GetWriteBuffer(slot));
+    auto flushedLSN = flushedPage->mPSN;
     auto flushedBf = write_buffer_commands[slot].bf;
     callback(*flushedBf, flushedLSN);
   }
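AsyncWriteBuffer now carves its per-slot pages out of one `AlignedBuffer<512>` instead of a typed `Page[]`, but the surrounding libaio flow is unchanged: prep a pwrite per slot, submit the batch, reap completions and map each event back to its slot via the buffer pointer. A minimal single-request round trip with the same calls (error handling trimmed; the caller opens `fd`, typically with O_DIRECT, which is why the slots come from sector-aligned memory):

```cpp
#include <libaio.h>

int writeOnePage(int fd, void* page, size_t pageSize, off_t offset) {
  io_context_t ctx = nullptr;
  if (io_setup(8, &ctx) < 0)
    return -1;

  iocb cb;
  iocb* cbs[1] = {&cb};
  io_prep_pwrite(&cb, fd, page, pageSize, offset);
  cb.data = page; // cookie used later to map the event back to a slot

  if (io_submit(ctx, 1, cbs) != 1)
    return -1;

  io_event events[1];
  int n = io_getevents(ctx, 1, 1, events, nullptr); // blocks for completion
  io_destroy(ctx);
  return (n == 1 && events[0].res == pageSize) ? 0 : -1;
}
```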
diff --git a/source/storage/buffer-manager/AsyncWriteBuffer.hpp b/source/storage/buffer-manager/AsyncWriteBuffer.hpp
index b14e0245..872a46bd 100644
--- a/source/storage/buffer-manager/AsyncWriteBuffer.hpp
+++ b/source/storage/buffer-manager/AsyncWriteBuffer.hpp
@@ -2,6 +2,7 @@
 
 #include "BufferFrame.hpp"
 #include "Units.hpp"
+#include "utils/Misc.hpp"
 
 #include <libaio.h>
 #include <memory>
@@ -25,7 +26,7 @@ class AsyncWriteBuffer {
   u64 batch_max_size;
   u64 pending_requests = 0;
 
-  std::unique_ptr<Page[]> write_buffer;
+  utils::AlignedBuffer<512> mWriteBuffer;
   std::unique_ptr<WriteCommand[]> write_buffer_commands;
   std::unique_ptr<iocb[]> iocbs;
   std::unique_ptr<iocb*[]> iocbs_ptr;
@@ -35,6 +36,10 @@ class AsyncWriteBuffer {
 
   bool full();
 
+  u8* GetWriteBuffer(u64 slot) {
+    return &mWriteBuffer.Get()[slot * FLAGS_page_size];
+  }
+
   void AddToIOBatch(BufferFrame& bf, PID pageId);
 
   u64 SubmitIORequest();
diff --git a/source/storage/buffer-manager/BufferFrame.hpp b/source/storage/buffer-manager/BufferFrame.hpp
index 8a32ccb1..c3d8f475 100644
--- a/source/storage/buffer-manager/BufferFrame.hpp
+++ b/source/storage/buffer-manager/BufferFrame.hpp
@@ -4,6 +4,7 @@
 #include "Units.hpp"
 #include "sync-primitives/HybridGuard.hpp"
 #include "utils/JsonUtil.hpp"
+#include "utils/Misc.hpp"
 
 #include
 #include
@@ -13,13 +14,9 @@
 #include
 #include
-#include
-
 namespace leanstore {
 namespace storage {
 
-const u64 PAGE_SIZE = 4096; // 4KB
-
 /// Used for contention based split. See more details in: "Contention and Space
 /// Management in B-Trees"
 class ContentionStats {
@@ -132,7 +129,7 @@ class BufferFrameHeader {
 /// Page is the content stored in the disk file. Page id is not here because it
 /// is determined by the offset in the disk file, no need to store it
 /// explicitly.
-class alignas(512) Page {
+class Page {
 public:
   /// Short for "page sequence number", increased when a page is modified. A
   /// page is "dirty" when mPSN > mFlushedPSN in the header.
@@ -150,11 +147,13 @@ class Page {
   u64 mMagicDebuging;
 
   /// The data stored in this page. The btree node content is stored here.
-  u8 mPayload[PAGE_SIZE - sizeof(mPSN) - sizeof(mGSN) - sizeof(mBTreeId) -
-              sizeof(mMagicDebuging)];
-};
+  u8 mPayload[];
 
-static constexpr u64 EFFECTIVE_PAGE_SIZE = sizeof(Page::mPayload);
+public:
+  u64 CRC() {
+    return utils::CRC(mPayload, FLAGS_page_size - sizeof(Page));
+  }
+};
 /// The unit of buffer pool. Buffer pool is partitioned into several partitions,
 /// and each partition is composed of BufferFrames. A BufferFrame is used to
@@ -169,13 +168,13 @@ class BufferFrame {
 public:
   /// The control part. Information used by buffer manager, concurrent
   /// transaction control, etc. are stored here.
-  BufferFrameHeader header;
+  alignas(512) BufferFrameHeader header;
 
   // The persisted data part. Each page maps to a underlying disk page. It's
   // persisted to disk when the checkpoint happens, or when the storage is
   // shutdown. It should be recovered based on the old page content and the
   // write-ahead log of the page.
-  Page page;
+  alignas(512) Page page;
 
 public:
   BufferFrame() {
@@ -220,10 +219,12 @@ class BufferFrame {
 
   void ToJSON(rapidjson::Value* resultObj,
               rapidjson::Value::AllocatorType& allocator);
-};
 
-static_assert(sizeof(Page) == PAGE_SIZE, "The total sizeof page");
-// static_assert((sizeof(BufferFrame) - sizeof(Page)) == 512, "");
+public:
+  static size_t Size() {
+    return 512 + FLAGS_page_size;
+  }
+};
 
 // -----------------------------------------------------------------------------
 // BufferFrame
diff --git a/source/storage/buffer-manager/BufferFrameProvider.hpp b/source/storage/buffer-manager/BufferFrameProvider.hpp
index 56ba2365..211ab2a2 100644
--- a/source/storage/buffer-manager/BufferFrameProvider.hpp
+++ b/source/storage/buffer-manager/BufferFrameProvider.hpp
@@ -83,7 +83,7 @@ class BufferFrameProvider {
   atomic<bool> mKeepRunning;
 
   const u64 mNumBfs;
-  BufferFrame* mBfs;
+  u8* mBufferPool;
 
   const u64 mNumPartitions;
   const u64 mPartitionsMask;
@@ -100,13 +100,13 @@ class BufferFrameProvider {
 public:
   BufferFrameProvider(u64 id, const std::string& threadName, u64 numBfs,
-                      BufferFrame* bfs, u64 numPartitions, u64 partitionMask,
+                      u8* bfs, u64 numPartitions, u64 partitionMask,
                       std::vector<std::unique_ptr<Partition>>& partitions,
                       int fd)
       : mId(id), mThreadName(threadName), mThread(nullptr), mKeepRunning(false),
-        mNumBfs(numBfs), mBfs(bfs), mNumPartitions(numPartitions),
+        mNumBfs(numBfs), mBufferPool(bfs), mNumPartitions(numPartitions),
         mPartitionsMask(partitionMask), mPartitions(partitions), mFD(fd),
-        mAsyncWriteBuffer(fd, PAGE_SIZE, FLAGS_write_buffer_size) {
+        mAsyncWriteBuffer(fd, FLAGS_page_size, FLAGS_write_buffer_size) {
     mCoolCandidateBfs.reserve(FLAGS_buffer_frame_recycle_batch_size);
     mEvictCandidateBfs.reserve(FLAGS_buffer_frame_recycle_batch_size);
   }
@@ -128,7 +128,6 @@ class BufferFrameProvider {
     if (mThread == nullptr) {
       mKeepRunning = true;
       mThread = std::make_unique<std::thread>(&BufferFrameProvider::Run, this);
-      mThread->detach();
     }
   }
@@ -180,7 +179,8 @@ class BufferFrameProvider {
 
   inline BufferFrame* RandomBufferFrame() {
     auto i = utils::RandomGenerator::getRand<u64>(0, mNumBfs);
-    return &mBfs[i];
+    auto bfAddr = &mBufferPool[i * BufferFrame::Size()];
+    return reinterpret_cast<BufferFrame*>(bfAddr);
   }
 
   inline Partition& randomPartition() {
@@ -257,8 +257,7 @@ inline void BufferFrameProvider::EvictFlushedBf(
   optimisticGuard.mGuard.ToExclusiveMayJump();
   if (FLAGS_crc_check && cooledBf.header.crc) {
-    DCHECK(utils::CRC(cooledBf.page.mPayload, EFFECTIVE_PAGE_SIZE) ==
-           cooledBf.header.crc);
+    DCHECK(cooledBf.page.CRC() == cooledBf.header.crc);
   }
   DCHECK(!cooledBf.isDirty());
   DCHECK(!cooledBf.header.mIsBeingWrittenBack);
@@ -545,8 +544,7 @@ inline void BufferFrameProvider::PrepareAsyncWriteBuffer(
 
       // performs crc check if necessary
       if (FLAGS_crc_check) {
-        cooledBf->header.crc =
-            utils::CRC(cooledBf->page.mPayload, EFFECTIVE_PAGE_SIZE);
+        cooledBf->header.crc = cooledBf->page.CRC();
       }
 
       // TODO: preEviction callback according to TREEID
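With `Page` reduced to a header plus flexible payload, a frame no longer has a compile-time size: it is a 512-byte-aligned header slice followed by a page of `FLAGS_page_size` bytes, and frame `i` lives at `pool + i * BufferFrame::Size()`. The address arithmetic, sketched under the assumption that the header fits in the first 512 bytes (the new BufferFrameTest asserts the equivalent `BufferFrame::Size() - 512 == FLAGS_page_size` invariant):

```cpp
#include <cstdint>

// Start of the i-th frame in the byte-addressed buffer pool.
inline uint8_t* frameAddr(uint8_t* pool, uint64_t i, uint64_t pageSize) {
  const uint64_t frameSize = 512 + pageSize; // BufferFrame::Size()
  return pool + i * frameSize;
}

// The page content starts right after the 512-byte header slice.
inline uint8_t* pageAddr(uint8_t* pool, uint64_t i, uint64_t pageSize) {
  return frameAddr(pool, i, pageSize) + 512;
}
```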
diff --git a/source/storage/buffer-manager/BufferManager.cpp b/source/storage/buffer-manager/BufferManager.cpp
index aa61496a..5b25c75b 100644
--- a/source/storage/buffer-manager/BufferManager.cpp
+++ b/source/storage/buffer-manager/BufferManager.cpp
@@ -34,8 +34,8 @@ thread_local BufferFrame* BufferManager::sTlsLastReadBf = nullptr;
 std::unique_ptr<BufferManager> BufferManager::sInstance = nullptr;
 
 BufferManager::BufferManager(s32 fd) : mPageFd(fd) {
-  mNumBfs = FLAGS_buffer_pool_size / sizeof(BufferFrame);
-  const u64 totalMemSize = sizeof(BufferFrame) * (mNumBfs + mNumSaftyBfs);
+  mNumBfs = FLAGS_buffer_pool_size / BufferFrame::Size();
+  const u64 totalMemSize = BufferFrame::Size() * (mNumBfs + mNumSaftyBfs);
 
   // Init buffer pool with zero-initialized buffer frames. Use mmap with flags
   // MAP_PRIVATE and MAP_ANONYMOUS, no underlying file desciptor to allocate
@@ -53,9 +53,9 @@ BufferManager::BufferManager(s32 fd) : mPageFd(fd) {
               << ", FLAGS_buffer_pool_size=" << FLAGS_buffer_pool_size
               << ", totalMemSize=" << totalMemSize;
 
-    mBfs = reinterpret_cast<BufferFrame*>(underlyingBuf);
-    madvise(mBfs, totalMemSize, MADV_HUGEPAGE);
-    madvise(mBfs, totalMemSize, MADV_DONTFORK);
+    mBufferPool = reinterpret_cast<u8*>(underlyingBuf);
+    madvise(mBufferPool, totalMemSize, MADV_HUGEPAGE);
+    madvise(mBufferPool, totalMemSize, MADV_DONTFORK);
   }
 
   // Initialize mPartitions
@@ -69,17 +69,13 @@ BufferManager::BufferManager(s32 fd) : mPageFd(fd) {
         i, mNumPartitions, freeBfsLimitPerPartition));
   }
 
-  // zerofill all the buffer frames
-  utils::Parallelize::parallelRange(totalMemSize, [&](u64 begin, u64 end) {
-    memset(reinterpret_cast<u8*>(mBfs) + begin, 0, end - begin);
-  });
-
   // spread these buffer frames to all the partitions
   utils::Parallelize::parallelRange(mNumBfs, [&](u64 begin, u64 end) {
     u64 partitionId = 0;
     for (u64 i = begin; i < end; i++) {
       auto& partition = getPartition(partitionId);
-      partition.mFreeBfList.PushFront(*new (mBfs + i) BufferFrame());
+      auto bfAddr = &mBufferPool[i * BufferFrame::Size()];
+      partition.mFreeBfList.PushFront(*new (bfAddr) BufferFrame());
       partitionId = (partitionId + 1) % mNumPartitions;
     }
   });
@@ -95,7 +91,7 @@ void BufferManager::StartBufferFrameProviders() {
   mBfProviders.reserve(FLAGS_pp_threads);
   for (auto i = 0u; i < FLAGS_pp_threads; ++i) {
     mBfProviders.push_back(std::move(std::make_unique<BufferFrameProvider>(
-        i, "leanstore_bf_provider_" + std::to_string(i), mNumBfs, mBfs,
+        i, "leanstore_bf_provider_" + std::to_string(i), mNumBfs, mBufferPool,
         mNumPartitions, mPartitionsMask, mPartitions, mPageFd)));
   }
@@ -131,15 +127,17 @@ void BufferManager::CheckpointAllBufferFrames() {
   StopBufferFrameProviders();
 
   utils::Parallelize::parallelRange(mNumBfs, [&](u64 begin, u64 end) {
-    Page page;
+    utils::AlignedBuffer<512> alignedBuffer(FLAGS_page_size);
+    auto buffer = alignedBuffer.Get();
     for (u64 i = begin; i < end; i++) {
-      auto& bf = mBfs[i];
+      auto bfAddr = &mBufferPool[i * BufferFrame::Size()];
+      auto& bf = *reinterpret_cast<BufferFrame*>(bfAddr);
       bf.header.mLatch.LockExclusively();
       if (!bf.isFree()) {
-        TreeRegistry::sInstance->Checkpoint(bf.page.mBTreeId, bf, &page);
-        s64 ret =
-            pwrite(mPageFd, &page, PAGE_SIZE, bf.header.mPageId * PAGE_SIZE);
-        DCHECK(ret == PAGE_SIZE);
+        TreeRegistry::sInstance->Checkpoint(bf.page.mBTreeId, bf, buffer);
+        auto ret = pwrite(mPageFd, buffer, FLAGS_page_size,
+                          bf.header.mPageId * FLAGS_page_size);
+        DCHECK_EQ(ret, FLAGS_page_size);
       }
       bf.header.mLatch.UnlockExclusively();
     }
   });
 }
 
@@ -147,12 +145,14 @@ void BufferManager::CheckpointAllBufferFrames() {
 void BufferManager::CheckpointBufferFrame(BufferFrame& bf) {
-  Page page;
+  utils::AlignedBuffer<512> alignedBuffer(FLAGS_page_size);
+  auto buffer = alignedBuffer.Get();
   bf.header.mLatch.LockExclusively();
   if (!bf.isFree()) {
-    TreeRegistry::sInstance->Checkpoint(bf.page.mBTreeId, bf, &page);
-    s64 ret = pwrite(mPageFd, &page, PAGE_SIZE, bf.header.mPageId * PAGE_SIZE);
-    ENSURE(ret == PAGE_SIZE);
+    TreeRegistry::sInstance->Checkpoint(bf.page.mBTreeId, bf, buffer);
+    auto ret = pwrite(mPageFd, buffer, FLAGS_page_size,
+                      bf.header.mPageId * FLAGS_page_size);
+    DCHECK_EQ(ret, FLAGS_page_size);
   }
   bf.header.mLatch.UnlockExclusively();
 }
@@ -182,8 +182,9 @@ Partition& BufferManager::randomPartition() {
 }
 
 BufferFrame& BufferManager::randomBufferFrame() {
-  auto randOrdinal = utils::RandomGenerator::getRand<u64>(0, mNumBfs);
-  return mBfs[randOrdinal];
+  auto i = utils::RandomGenerator::getRand<u64>(0, mNumBfs);
+  auto bfAddr = &mBufferPool[i * BufferFrame::Size()];
+  return *reinterpret_cast<BufferFrame*>(bfAddr);
 }
 
 BufferFrame& BufferManager::AllocNewPage() {
@@ -287,7 +288,7 @@ BufferFrame* BufferManager::ResolveSwipMayJump(HybridGuard& swipGuard,
     bf.header.state = STATE::LOADED;
     bf.header.mPageId = pageId;
     if (FLAGS_crc_check) {
-      bf.header.crc = utils::CRC(bf.page.mPayload, EFFECTIVE_PAGE_SIZE);
+      bf.header.crc = bf.page.CRC();
     }
 
     // 5. Publish the buffer frame
@@ -382,10 +383,11 @@ BufferFrame* BufferManager::ResolveSwipMayJump(HybridGuard& swipGuard,
 
 void BufferManager::ReadPageSync(PID pageId, void* destination) {
   DCHECK(u64(destination) % 512 == 0);
-  s64 bytesLeft = PAGE_SIZE;
+  s64 bytesLeft = FLAGS_page_size;
   do {
-    auto bytesRead = pread(mPageFd, destination, bytesLeft,
-                           pageId * PAGE_SIZE + (PAGE_SIZE - bytesLeft));
+    auto bytesRead =
+        pread(mPageFd, destination, bytesLeft,
+              pageId * FLAGS_page_size + (FLAGS_page_size - bytesLeft));
     if (bytesRead < 0) {
       LOG(ERROR) << "pread failed"
                  << ", error= " << bytesRead << ", pageId=" << pageId;
@@ -424,7 +426,7 @@ void BufferManager::WritePageSync(BufferFrame& bf) {
   guardedBf.ToExclusiveMayJump();
   auto pageId = bf.header.mPageId;
   auto& partition = getPartition(pageId);
-  pwrite(mPageFd, &bf.page, PAGE_SIZE, pageId * PAGE_SIZE);
+  pwrite(mPageFd, &bf.page, FLAGS_page_size, pageId * FLAGS_page_size);
   bf.reset();
   guardedBf.unlock();
   partition.mFreeBfList.PushFront(bf);
@@ -452,8 +454,8 @@ void BufferManager::StopBufferFrameProviders() {
 BufferManager::~BufferManager() {
   StopBufferFrameProviders();
-  u64 totalMemSize = sizeof(BufferFrame) * (mNumBfs + mNumSaftyBfs);
-  munmap(mBfs, totalMemSize);
+  u64 totalMemSize = BufferFrame::Size() * (mNumBfs + mNumSaftyBfs);
+  munmap(mBufferPool, totalMemSize);
 }
 
 void BufferManager::DoWithBufferFrameIf(
@@ -463,7 +465,8 @@ void BufferManager::DoWithBufferFrameIf(
     DCHECK(condition != nullptr);
     DCHECK(action != nullptr);
     for (u64 i = begin; i < end; i++) {
-      auto& bf = mBfs[i];
+      auto bfAddr = &mBufferPool[i * BufferFrame::Size()];
+      auto& bf = *reinterpret_cast<BufferFrame*>(bfAddr);
       bf.header.mLatch.LockExclusively();
       if (condition(bf)) {
         action(bf);
diff --git a/source/storage/buffer-manager/BufferManager.hpp b/source/storage/buffer-manager/BufferManager.hpp
index d7c71dec..9746511c 100644
--- a/source/storage/buffer-manager/BufferManager.hpp
+++ b/source/storage/buffer-manager/BufferManager.hpp
@@ -56,7 +56,7 @@ class BufferManager {
 public:
   /// All the managed buffer frames in the memory.
-  BufferFrame* mBfs;
+  u8* mBufferPool;
 
   /// FD for disk files storing pages.
   const int mPageFd;
@@ -128,8 +128,8 @@ class BufferManager {
   /// Reads the page at pageId to the destination buffer. All the pages are
   /// stored in one file (mPageFd), page id (pageId) determines the offset of
   /// the pageId-th page in the underlying file:
-  ///   1. offset of pageId-th page: pageId * PAGE_SIZE
-  ///   2. size of each page: PAGE_SIZE
+  ///   1. offset of pageId-th page: pageId * FLAGS_page_size
+  ///   2. size of each page: FLAGS_page_size
   void ReadPageSync(PID pageId, void* destination);
 
   /// Reads the page at pageId, returns the buffer frame containing that page.
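ReadPageSync loops because `pread` may legally return fewer bytes than requested; the offset and remaining length must be advanced until the whole page has arrived. The same pattern as a standalone helper:

```cpp
#include <unistd.h>

// Short-read-safe pread: keeps reading until len bytes are in, as
// ReadPageSync does for a page.
bool readFully(int fd, void* dst, size_t len, off_t offset) {
  char* out = static_cast<char*>(dst);
  size_t left = len;
  while (left > 0) {
    ssize_t n = pread(fd, out, left, offset);
    if (n <= 0)
      return false; // error or unexpected EOF
    out += n;
    offset += n;
    left -= n;
  }
  return true;
}
```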
diff --git a/source/storage/buffer-manager/Partition.hpp b/source/storage/buffer-manager/Partition.hpp
index d60d3163..7068da2d 100644
--- a/source/storage/buffer-manager/Partition.hpp
+++ b/source/storage/buffer-manager/Partition.hpp
@@ -111,7 +111,7 @@ class Partition {
     } else {
       const u64 pageId = mNextPageId;
       mNextPageId += mPageIdDistance;
-      ENSURE(pageId * PAGE_SIZE <= FLAGS_db_file_capacity);
+      ENSURE(pageId * FLAGS_page_size <= FLAGS_db_file_capacity);
       return pageId;
     }
   }
diff --git a/source/utils/Misc.hpp b/source/utils/Misc.hpp
index fde873ca..b89b5f4b 100644
--- a/source/utils/Misc.hpp
+++ b/source/utils/Misc.hpp
@@ -100,6 +100,32 @@
 template <typename T> inline T* ArrayOnStack(size_t n) {
   return reinterpret_cast<T*>(alloca(n * sizeof(T)));
 }
 
+template <size_t Alignment> class AlignedBuffer {
+public:
+  alignas(Alignment) u8* mBuffer;
+
+public:
+  AlignedBuffer(size_t size)
+      : mBuffer(reinterpret_cast<u8*>(std::aligned_alloc(Alignment, size))) {
+  }
+
+  ~AlignedBuffer() {
+    if (mBuffer != nullptr) {
+      free(mBuffer);
+      mBuffer = nullptr;
+    }
+  }
+
+public:
+  u8* Get() {
+    return mBuffer;
+  }
+
+  template <typename T> T* CastTo() {
+    return reinterpret_cast<T*>(mBuffer);
+  }
+};
+
 struct Timer {
   std::atomic<u64>& mTimeCounterUS;
diff --git a/tests/BufferFrameTest.cpp b/tests/BufferFrameTest.cpp
new file mode 100644
index 00000000..3a3daf46
--- /dev/null
+++ b/tests/BufferFrameTest.cpp
@@ -0,0 +1,14 @@
+#include "storage/buffer-manager/BufferFrame.hpp"
+
+#include <gtest/gtest.h>
+
+namespace leanstore {
+
+using namespace leanstore::storage;
+
+// Test buffer frame related sizes
+TEST(BufferFrameTest, BufferFrameSize) {
+  EXPECT_EQ(BufferFrame::Size() - 512, FLAGS_page_size);
+}
+
+} // namespace leanstore
\ No newline at end of file
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 021126f2..47b21734 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -35,5 +35,6 @@ endfunction(leanstore_add_test_without_gtest_main)
 # Add tests
 leanstore_add_test(BTreeLLTest)
 leanstore_add_test(BTreeVILoggingAndRecoveryTest)
+leanstore_add_test(BufferFrameTest)
 
 leanstore_add_test_without_gtest_main(BTreeVITest)
\ No newline at end of file
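One caveat on the new `AlignedBuffer`: `std::aligned_alloc` requires the requested size to be a multiple of the alignment, which holds for the call sites in this patch because every size passed is a multiple of the 512-byte sector size. A usage sketch of the same allocation pattern:

```cpp
#include <cstdint>
#include <cstdlib>

int main() {
  size_t pageSize = 4096; // multiple of 512, as aligned_alloc requires
  auto* buf = static_cast<uint8_t*>(std::aligned_alloc(512, pageSize * 4));
  if (buf == nullptr)
    return 1;
  buf[0] = 0xAB;   // use like any byte buffer
  std::free(buf);  // aligned_alloc memory is released with free()
  return 0;
}
```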