diff --git a/patches/mimalloc-v2.2.4/5_skip_defrag_targets.patch b/patches/mimalloc-v2.2.4/5_skip_defrag_targets.patch new file mode 100644 index 000000000000..c4878b5bdacd --- /dev/null +++ b/patches/mimalloc-v2.2.4/5_skip_defrag_targets.patch @@ -0,0 +1,108 @@ +commit 0000000000000000000000000000000000000000 +Author: Dragonfly Defrag Hackathon +Date: Tue May 5 17:30:00 2026 +0000 + + feat: skip defrag-targeted pages in mi_malloc + + Adds a defrag_skip byte to mi_page_t and a public API + mi_page_set_defrag_skip(page_addr, skip). When set, mi_malloc skips + the page in the small-size fast path, the queue-head fast path, and + mi_page_queue_find_free_ex. Prevents new allocations from landing + on a page that phased defrag is trying to drain. + +--- a/include/mimalloc/internal.h ++++ b/include/mimalloc/internal.h +@@ -516,7 +516,14 @@ + mi_assert_internal(size <= (MI_SMALL_SIZE_MAX + MI_PADDING_SIZE)); + const size_t idx = _mi_wsize_from_size(size); + mi_assert_internal(idx < MI_PAGES_DIRECT); +- return heap->pages_free_direct[idx]; ++ mi_page_t* page = heap->pages_free_direct[idx]; ++ // dragonfly: when the cached small-page is a defrag target, force the ++ // generic slow path so the allocation goes through `mi_find_free_page` -> ++ // `mi_page_queue_find_free_ex` which skips defrag-targeted pages. ++ if (mi_unlikely(page->defrag_skip)) { ++ return (mi_page_t*) &_mi_page_empty; ++ } ++ return page; + } + + // Segment that contains the pointer +--- a/include/mimalloc/types.h ++++ b/include/mimalloc/types.h +@@ -337,6 +337,7 @@ + uint16_t used; // number of blocks in use (including blocks in `thread_free`) + uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) + uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type ++ uint8_t defrag_skip; // dragonfly: when nonzero, alloc paths skip this page (it is being drained by defrag) + // padding + size_t block_size; // size available in each block (always `>0`) + uint8_t* page_start; // start of the page area containing the blocks +--- a/src/init.c ++++ b/src/init.c +@@ -26,6 +26,7 @@ + 0, // used + 0, // block size shift + 0, // heap tag ++ 0, // defrag_skip (dragonfly) + 0, // block_size + NULL, // page_start + #if (MI_PADDING || MI_ENCODE_FREELIST) +--- a/src/page.c ++++ b/src/page.c +@@ -697,6 +697,7 @@ + page->keys[1] = _mi_heap_random_next(heap); + #endif + page->free_is_zero = page->is_zero_init; ++ page->defrag_skip = 0; // dragonfly: fresh page is not a defrag target + #if MI_DEBUG>2 + if (page->is_zero_init) { + mi_track_mem_defined(page->page_start, page_size); +@@ -763,6 +764,14 @@ + while (page != NULL) + { + mi_page_t* next = page->next; // remember next ++ ++ // dragonfly: pages tagged by defrag are being drained; skip them so new ++ // allocations don't refill targets while EVACUATE moves entries off. ++ if (page->defrag_skip) { ++ page = next; ++ continue; ++ } ++ + #if MI_STAT + count++; + #endif +@@ -860,6 +869,12 @@ + + // check the first page: we even do this with candidate search or otherwise we re-search every time + mi_page_t* page = pq->first; ++ // dragonfly: skip the queue-head fast path when it points at a defrag ++ // target so the search falls through to mi_page_queue_find_free_ex which ++ // walks past target pages. 
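++  // A head cleared here behaves like an empty queue: the `page != NULL`
++  // guard below is skipped, and the fall-through candidate search walks
++  // past defrag-targeted pages (see the hunk above).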
++ if (page != NULL && page->defrag_skip) { ++ page = NULL; ++ } + if (page != NULL) { + #if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness + if (page->capacity < page->reserved && ((_mi_heap_random_next(heap) & 1) == 1)) { +--- a/src/alloc.c ++++ b/src/alloc.c +@@ -711,6 +711,17 @@ + return result; + } + ++// dragonfly: mark a page so that mi_malloc skips it when picking a page to ++// allocate from. Used by phased defrag to prevent EVACUATE moves from ++// refilling target pages that we are trying to drain. `page_addr` must be ++// a value previously returned in mi_page_usage_stats_t::page_address (i.e. ++// a `(uintptr_t)mi_page_t*`). ++void mi_page_set_defrag_skip(uintptr_t page_addr, bool skip) mi_attr_noexcept { ++ if (page_addr == 0) return; ++ mi_page_t* page = (mi_page_t*) page_addr; ++ page->defrag_skip = (skip ? 1 : 0); ++} ++ + // ------------------------------------------------------ + // ensure explicit external inline definitions are emitted! + // ------------------------------------------------------ diff --git a/patches/mimalloc-v2.2.4/6_dfly_underutil_callback.patch b/patches/mimalloc-v2.2.4/6_dfly_underutil_callback.patch new file mode 100644 index 000000000000..837353a81018 --- /dev/null +++ b/patches/mimalloc-v2.2.4/6_dfly_underutil_callback.patch @@ -0,0 +1,84 @@ +commit 0000000000000000000000000000000000000000 +Author: Dragonfly Defrag Hackathon +Date: Tue May 5 18:00:00 2026 +0000 + + feat: underutilized-page callback for reactive defrag + + Adds a public API mi_dfly_set_underutil_callback(cb) that fires on + local-thread free when a page's used count drops below a configured + threshold for the first time. Lets phased defrag enqueue pages + reactively instead of doing a full prime-table CENSUS scan to + discover them. + +--- a/include/mimalloc.h ++++ b/include/mimalloc.h +@@ -271,6 +271,13 @@ + + mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg); + ++// dragonfly: callback fired on local-thread free when a page's used count ++// crosses the configured threshold downward. Lets phased defrag enqueue ++// pages reactively instead of doing a full prime-table CENSUS scan. ++typedef void (*mi_dfly_underutil_callback_t)(uintptr_t page_addr); ++mi_decl_export void mi_dfly_set_underutil_callback(mi_dfly_underutil_callback_t cb); ++mi_decl_export void mi_dfly_set_underutil_threshold_pct(uint8_t pct); ++ + // Experimental + mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept; + mi_decl_nodiscard mi_decl_export bool mi_is_redirected(void) mi_attr_noexcept; +--- a/src/free.c ++++ b/src/free.c +@@ -18,6 +18,24 @@ + static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block); + static void mi_stat_free(const mi_page_t* page, const mi_block_t* block); + ++// ------------------------------------------------------ ++// Dragonfly: underutilized-page callback ++// Fired on local-thread free when the page's used count crosses the ++// configured threshold downward. Lets defrag enqueue pages reactively ++// instead of doing a full prime-table CENSUS scan. 
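++ // Worked example at the default 80% threshold: capacity 50 gives
++ // cap_thr = 50 * 80 = 4000, so the free taking `used` from 41 to 40 fires
++ // (4100 > 4000 and 4000 <= 4000) while the next free (40 -> 39) does not;
++ // each downward crossing fires exactly once. Both globals are written
++ // without atomics, so register the callback once at startup (see the
++ // std::call_once in server/defrag.cc) before shards start freeing.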
++// ------------------------------------------------------ ++static mi_dfly_underutil_callback_t _mi_dfly_underutil_cb = NULL; ++static uint8_t _mi_dfly_underutil_pct = 80; ++ ++void mi_dfly_set_underutil_callback(mi_dfly_underutil_callback_t cb) { ++ _mi_dfly_underutil_cb = cb; ++} ++ ++void mi_dfly_set_underutil_threshold_pct(uint8_t pct) { ++ if (pct > 100) pct = 100; ++ _mi_dfly_underutil_pct = pct; ++} ++ + + // ------------------------------------------------------ + // Free +@@ -44,12 +62,28 @@ + // actual free: push on the local free list + mi_block_set_next(page, block, page->local_free); + page->local_free = block; ++ // dragonfly: decide whether to fire the underutilized-page callback BEFORE ++ // _mi_page_retire below. _mi_page_retire may call _mi_page_free which ++ // returns the page metadata to the segment, after which reading ++ // page->used / page->capacity is a UAF. We gate on page->used > 1 so that ++ // after --, used > 0 (page not retired and still alive), which makes ++ // (uintptr_t)page a valid address to hand to the callback. ++ bool fire_underutil_cb = false; ++ if (mi_unlikely(_mi_dfly_underutil_cb != NULL && page->used > 1)) { ++ const uint32_t cap_thr = (uint32_t)page->capacity * _mi_dfly_underutil_pct; ++ const uint32_t prev_x100 = (uint32_t)page->used * 100; ++ const uint32_t cur_x100 = (uint32_t)(page->used - 1) * 100; ++ fire_underutil_cb = (prev_x100 > cap_thr && cur_x100 <= cap_thr); ++ } + if mi_unlikely(--page->used == 0) { + _mi_page_retire(page); + } + else if mi_unlikely(check_full && mi_page_is_in_full(page)) { + _mi_page_unfull(page); + } ++ if (fire_underutil_cb) { ++ _mi_dfly_underutil_cb((uintptr_t)page); ++ } + } + + // Adjust a block that was allocated aligned, to the actual start of the block in the page. diff --git a/src/core/dash.h b/src/core/dash.h index a2710122d2b3..5b57355e0bbd 100644 --- a/src/core/dash.h +++ b/src/core/dash.h @@ -412,14 +412,18 @@ class DashTable : public detail::DashTableBase { return stash_unloaded_; } + // Advances cursor by exactly one logical bucket in bucket-major order, without + // visiting bucket contents. Used by sampled walkers (e.g. defrag CENSUS) to + // skip buckets between Traverse calls. Returns Cursor::end() once the table + // is exhausted. 
+  Cursor AdvanceCursorBucketOrder(Cursor cursor);
+
  private:
   enum class InsertMode {
     kInsertIfNotFound,
     kForceInsert,
   };

-  Cursor AdvanceCursorBucketOrder(Cursor cursor);
-
   template <typename U, typename V, typename EvictionPolicy>
   std::pair<iterator, bool> InsertInternal(U&& key, V&& value, EvictionPolicy& policy,
                                            InsertMode mode);

diff --git a/src/core/page_usage/CMakeLists.txt b/src/core/page_usage/CMakeLists.txt
index 207668767244..17654130b81e 100644
--- a/src/core/page_usage/CMakeLists.txt
+++ b/src/core/page_usage/CMakeLists.txt
@@ -1,2 +1,2 @@
-add_library(dfly_page_usage page_usage_stats.cc)
+add_library(dfly_page_usage page_usage_stats.cc page_usage_visitors.cc)
 target_link_libraries(dfly_page_usage base TRDP::hdr_histogram redis_lib absl::strings)
diff --git a/src/core/page_usage/page_usage_stats.cc b/src/core/page_usage/page_usage_stats.cc
index a62c533cfa61..7d4b1d80da59 100644
--- a/src/core/page_usage/page_usage_stats.cc
+++ b/src/core/page_usage/page_usage_stats.cc
@@ -207,9 +207,9 @@ uint64_t PageUsage::UsedQuotaCycles() const {
 }
 
 bool PageUsage::IsPageForObjectUnderUtilized(void* object) {
-  mi_page_usage_stats_t stat;
-  zmalloc_page_is_underutilized(object, threshold_, collect_stats_ == CollectPageStats::YES, &stat);
-  return ConsumePageStats(stat);
+  return ConsumePageStats(mi_heap_page_is_underutilized(static_cast<mi_heap_t*>(zmalloc_heap),
+                                                        object, threshold_,
+                                                        collect_stats_ == CollectPageStats::YES));
 }
 
 bool PageUsage::IsPageForObjectUnderUtilized(mi_heap_t* heap, void* object) {
diff --git a/src/core/page_usage/page_usage_stats.h b/src/core/page_usage/page_usage_stats.h
index c6f5c127bb50..db99875fc244 100644
--- a/src/core/page_usage/page_usage_stats.h
+++ b/src/core/page_usage/page_usage_stats.h
@@ -20,7 +20,10 @@ namespace dfly {
 class CycleQuota {
  public:
   static constexpr uint64_t kMaxQuota = std::numeric_limits<uint64_t>::max();
-  static constexpr uint64_t kDefaultDefragQuota = 150;
+  // 40000 here is ~10ms of real time because helio's CycleClock mixes raw rdtsc
+  // with abseil's shifted frequency, making FromUsec/ToUsec ~4x off on x86.
+  // Once the helio bug is fixed, drop this to 10000.
+  static constexpr uint64_t kDefaultDefragQuota = 40'000;
 
   explicit CycleQuota(uint64_t quota_usec);
 
@@ -83,9 +86,11 @@ class PageUsage {
 
   uint64_t UsedQuotaCycles() const;
 
+  // Returns true when the object on the page should be reallocated. Subclasses
+  // (Evacuator, CensusTaker) override to short-circuit or extend the decision.
+  // Out-of-line in page_usage_stats.cc.
   virtual bool IsPageForObjectUnderUtilized(void* object);
-
-  bool IsPageForObjectUnderUtilized(mi_heap_t* heap, void* object);
+  virtual bool IsPageForObjectUnderUtilized(mi_heap_t* heap, void* object);
 
   CollectedPageStats CollectedStats() const {
     return unique_pages_.CollectedStats();
@@ -107,6 +112,31 @@ class PageUsage {
 
   bool QuotaDepleted() const;
 
+  virtual bool ShouldStop() const {
+    return false;
+  }
+
+  // Read-only walkers (e.g. CENSUS) never reallocate, so callers can skip
+  // pre/post sizing work that only matters when an object may move.
+  virtual bool IsReadOnly() const {
+    return false;
+  }
+
+  // When true, the traversal should also defrag keys (it->first) in addition
+  // to values. Only the phased algorithm (CENSUS + EVACUATE) enables this.
+  virtual bool ShouldDefragKeys() const {
+    return false;
+  }
+
+  // Walkers may stash the bucket cursor about to be visited so that downstream
+  // Observe() calls can attribute candidates back to a bucket. Default no-op.
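+  // CensusTaker overrides this to stamp each Observe() with the bucket being
+  // walked; see page_usage_visitors.h.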
+  virtual void SetCurrentBucketCursor(uint64_t /*cursor*/) {
+  }
+
+  float threshold() const {
+    return threshold_;
+  }
+
   void ExtendQuota(uint64_t quota_usec);
 
  private:
@@ -136,6 +166,7 @@ class PageUsage {
 
   CycleQuota quota_;
 
+ protected:
   // For use in testing, forces reallocate check to always return true
   bool force_reallocate_{false};
 };
diff --git a/src/core/page_usage/page_usage_visitors.cc b/src/core/page_usage/page_usage_visitors.cc
new file mode 100644
index 000000000000..56dacad9d4ce
--- /dev/null
+++ b/src/core/page_usage/page_usage_visitors.cc
@@ -0,0 +1,612 @@
+// Copyright 2026, DragonflyDB authors. All rights reserved.
+// See LICENSE for licensing terms.
+//
+
+#include "core/page_usage/page_usage_visitors.h"
+
+#include <mimalloc.h>
+
+#define MI_BUILD_RELEASE 1
+#include <mimalloc/internal.h>
+
+#include <algorithm>
+#include <ranges>
+#include <vector>
+
+#include "base/flags.h"
+#include "base/logging.h"
+
+extern "C" {
+#include "redis/zmalloc.h"
+}
+
+ABSL_FLAG(bool, defrag_use_skip_bit, false,
+          "If true, mark target pages with mimalloc's defrag_skip bit so EVAC moves don't "
+          "refill them. Disable to A/B compare against an unmarked baseline.");
+
+ABSL_FLAG(bool, defrag_keys, false,
+          "If true, the phased defragmenter also defragments key allocations "
+          "(it->first) in addition to values. Set to false to measure the "
+          "incremental benefit of key defrag.");
+
+ABSL_FLAG(double, defrag_skip_percentile, 0.5,
+          "Fraction of the target plan (sorted by retention_score, most-fragmented first) "
+          "to apply the mimalloc defrag_skip bit to. 0.5 (default) marks the top half "
+          "and lets the bottom half stay refillable, which empirically gives the best "
+          "floor-vs-bulge tradeoff. 1.0 marks every target (max reclaim, biggest bulge). "
+          "Lower values shrink the lockout footprint: only the most-fragmented top-K "
+          "pages are protected from refill while higher-utilization targets stay "
+          "refillable.");
+
+extern "C" {
+// Dragonfly mimalloc patch: tell mi_malloc to skip a page during defrag so
+// EVACUATE moves don't refill pages we're trying to drain.
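+// Every call site in this file goes through the SetDefragSkipIfEnabled
+// wrapper below, which adds the --defrag_use_skip_bit runtime gate.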
+void mi_page_set_defrag_skip(uintptr_t page_addr, bool skip);
+}
+
+namespace dfly {
+
+namespace {
+
+constexpr uint64_t kPerTargetSlotCostBytes = 16 * 1024;
+
+uint64_t ReclaimableBytes(uint16_t capacity_blocks, uint16_t used_blocks, uint32_t block_size) {
+  if (used_blocks >= capacity_blocks)
+    return 0;
+  return uint64_t(capacity_blocks - used_blocks) * block_size;
+}
+
+uint64_t MoveBytes(uint16_t used_blocks, uint32_t block_size) {
+  return uint64_t(used_blocks) * block_size;
+}
+
+uint64_t ReclaimableBytes(const TargetPage& target) {
+  return ReclaimableBytes(target.capacity_blocks, target.blocks_at_census, target.block_size);
+}
+
+uint64_t MoveBytes(const TargetPage& target) {
+  return MoveBytes(target.blocks_at_census, target.block_size);
+}
+
+float ComputeRetentionScore(uint16_t capacity_blocks, uint16_t used_blocks, uint32_t block_size,
+                            uint64_t per_block_move_cost_bytes) {
+  const uint64_t reclaim = ReclaimableBytes(capacity_blocks, used_blocks, block_size);
+  const uint64_t move = MoveBytes(used_blocks, block_size);
+  const uint64_t cost =
+      move + uint64_t(used_blocks) * per_block_move_cost_bytes + kPerTargetSlotCostBytes;
+  return static_cast<float>(static_cast<double>(reclaim) / std::max<uint64_t>(1, cost));
+}
+
+void PopulateAgg(PageAgg& agg, const mi_page_usage_stats_t& stat, float score) {
+  agg.page_address = stat.page_address;
+  agg.block_size = static_cast<uint32_t>(stat.block_size);
+  agg.capacity_blocks = stat.capacity;
+  agg.used_blocks = stat.used;
+  agg.flags = stat.flags;
+  ++agg.observed_movable_blocks;
+  ++agg.generation;
+  agg.retention_score = score;
+}
+
+TargetFilterReason ClassifyForTarget(const PageAgg& agg) {
+  if (agg.observed_movable_blocks == 0)
+    return TargetFilterReason::kNoObservedBlocks;
+  if (agg.used_blocks == 0)
+    return TargetFilterReason::kAlreadyEmpty;
+  if (agg.observed_movable_blocks > agg.used_blocks)
+    return TargetFilterReason::kStaleObservation;
+  if (agg.observed_movable_blocks < agg.used_blocks)
+    return TargetFilterReason::kHasImmovableData;
+  return TargetFilterReason::kKeep;
+}
+
+TargetPage MakeTargetPage(const PageAgg& agg) {
+  TargetPage tp;
+  tp.page_address = agg.page_address;
+  tp.block_size = agg.block_size;
+  tp.capacity_blocks = agg.capacity_blocks;
+  tp.blocks_at_census = agg.used_blocks;
+  tp.retention_score_at_census = agg.retention_score;
+  return tp;
+}
+
+// Wrapper around mimalloc's defrag_skip setter. Gated by a runtime flag so
+// experiments can disable the skip-bit logic and observe whether refills on
+// drained pages re-emerge.
+//
+// SHARP EDGE: mi_page_set_defrag_skip writes through the page address as
+// mi_page_t*. If that memory has been unmapped (page retired -> segment
+// freed -> OS reclaim, particularly under Dragonfly's aggressive purge
+// settings) or reused for something else, the write segfaults or silently
+// corrupts unrelated state. The window opens between SELECT_TARGETS adding
+// the page to the plan and ~TargetPlan clearing the bit at end-of-cycle;
+// any external drain-to-empty plus retire-and-unmap during that window
+// makes the page address stale.
+//
+// The EvacDecide success / revalidation paths clear the bit *before* the
+// triggering move runs (so the page is still mapped). The destructor sweep
+// is the riskier site — it touches every plan target unconditionally. To
+// harden, options are: (a) move the skip flag to a per-shard side table
+// keyed by page address, (b) add a mimalloc refcount keeping target pages
+// mapped for the cycle, (c) validate the page is still in a known segment
+// before writing. (a) is the cleanest if it ever bites in production.
+void SetDefragSkipIfEnabled(uintptr_t page_addr, bool skip) {
+  if (absl::GetFlag(FLAGS_defrag_use_skip_bit)) {
+    mi_page_set_defrag_skip(page_addr, skip);
+  }
+}
+
+void AttributeBlockSkip(EvacStats& stats, RevalidationFailureReason reason, uint32_t block_size) {
+  switch (reason) {
+    case RevalidationFailureReason::kHeapMismatch:
+      ++stats.blocks_revalidation_heap_mismatch;
+      stats.bytes_revalidation_heap_mismatch += block_size;
+      break;
+    case RevalidationFailureReason::kActiveMallocPage:
+      ++stats.blocks_revalidation_active_malloc_page;
+      stats.bytes_revalidation_active_malloc_page += block_size;
+      break;
+    case RevalidationFailureReason::kFullPage:
+      ++stats.blocks_revalidation_full_page;
+      stats.bytes_revalidation_full_page += block_size;
+      break;
+    case RevalidationFailureReason::kAboveThreshold:
+      ++stats.blocks_revalidation_above_threshold;
+      stats.bytes_revalidation_above_threshold += block_size;
+      break;
+    case RevalidationFailureReason::kNone:
+      break;
+  }
+}
+
+void RecordFirstFailure(TargetPage* target, EvacStats& stats, RevalidationFailureReason reason,
+                        uint32_t block_size) {
+  target->revalidation_failed = true;
+  target->failure_reason = reason;
+  ++stats.blocks_skipped_revalidation_failed;
+  stats.bytes_skipped_revalidation_failed += block_size;
+  AttributeBlockSkip(stats, reason, block_size);
+  switch (reason) {
+    case RevalidationFailureReason::kHeapMismatch:
+      ++stats.targets_revalidation_heap_mismatch;
+      break;
+    case RevalidationFailureReason::kActiveMallocPage:
+      ++stats.targets_revalidation_active_malloc_page;
+      break;
+    case RevalidationFailureReason::kFullPage:
+      ++stats.targets_revalidation_full_page;
+      break;
+    case RevalidationFailureReason::kAboveThreshold:
+      ++stats.targets_revalidation_above_threshold;
+      break;
+    case RevalidationFailureReason::kNone:
+      break;
+  }
+  ++stats.targets_abandoned_revalidation;
+}
+
+} // namespace
+
+void CensusStats::Merge(const CensusStats& other) {
+  allocations_seen += other.allocations_seen;
+  allocations_recorded += other.allocations_recorded;
+  skipped_above_threshold += other.skipped_above_threshold;
+  skipped_full_page += other.skipped_full_page;
+  skipped_wrong_heap += other.skipped_wrong_heap;
+  skipped_active_malloc_page += other.skipped_active_malloc_page;
+  skipped_low_score += other.skipped_low_score;
+  pages_evicted_from_retained += other.pages_evicted_from_retained;
+  heap_rebuilds += other.heap_rebuilds;
+}
+
+void PlanStats::Merge(const PlanStats& other) {
+  targets_kept += other.targets_kept;
+  filtered_no_observed_blocks += other.filtered_no_observed_blocks;
+  filtered_stale += other.filtered_stale;
+  filtered_has_immovable_data += other.filtered_has_immovable_data;
+  filtered_already_empty += other.filtered_already_empty;
+  truncated_by_cap += other.truncated_by_cap;
+  selected_capacity_bytes_at_census += other.selected_capacity_bytes_at_census;
+  selected_used_bytes_at_census += other.selected_used_bytes_at_census;
+  selected_reclaimable_bytes_at_census += other.selected_reclaimable_bytes_at_census;
+  truncated_reclaimable_bytes += other.truncated_reclaimable_bytes;
+  filtered_immovable_reclaimable_bytes += other.filtered_immovable_reclaimable_bytes;
+}
+
+void EvacStats::Merge(const EvacStats& other) {
+  blocks_skipped_not_target += other.blocks_skipped_not_target;
+  blocks_skipped_target_done += other.blocks_skipped_target_done;
+  blocks_skipped_revalidation_failed += other.blocks_skipped_revalidation_failed;
+  blocks_move_committed += other.blocks_move_committed;
+  bytes_skipped_target_done += other.bytes_skipped_target_done;
+  bytes_skipped_revalidation_failed += other.bytes_skipped_revalidation_failed;
+  bytes_move_committed += other.bytes_move_committed;
+  targets_revalidation_heap_mismatch += other.targets_revalidation_heap_mismatch;
+  targets_revalidation_active_malloc_page += other.targets_revalidation_active_malloc_page;
+  targets_revalidation_full_page += other.targets_revalidation_full_page;
+  targets_revalidation_above_threshold += other.targets_revalidation_above_threshold;
+  blocks_revalidation_heap_mismatch += other.blocks_revalidation_heap_mismatch;
+  blocks_revalidation_active_malloc_page += other.blocks_revalidation_active_malloc_page;
+  blocks_revalidation_full_page += other.blocks_revalidation_full_page;
+  blocks_revalidation_above_threshold += other.blocks_revalidation_above_threshold;
+  bytes_revalidation_heap_mismatch += other.bytes_revalidation_heap_mismatch;
+  bytes_revalidation_active_malloc_page += other.bytes_revalidation_active_malloc_page;
+  bytes_revalidation_full_page += other.bytes_revalidation_full_page;
+  bytes_revalidation_above_threshold += other.bytes_revalidation_above_threshold;
+  targets_abandoned_revalidation += other.targets_abandoned_revalidation;
+  targets_completed_during_evac += other.targets_completed_during_evac;
+}
+
+PageCensus::PageCensus(CensusStats* stats, size_t max_retained_pages,
+                       uint64_t per_block_move_cost_bytes)
+    : stats_(stats),
+      max_retained_pages_(max_retained_pages),
+      per_block_move_cost_bytes_(per_block_move_cost_bytes) {
+}
+
+void PageCensus::Observe(const mi_page_usage_stats_t& stat, uint64_t bucket_cursor) {
+  ++stats_->allocations_seen;
+
+  if (stat.flags & MI_DFLY_HEAP_MISMATCH) {
+    ++stats_->skipped_wrong_heap;
+    return;
+  }
+  if (stat.flags & MI_DFLY_PAGE_USED_FOR_MALLOC) {
+    ++stats_->skipped_active_malloc_page;
+    return;
+  }
+  if (stat.flags & MI_DFLY_PAGE_FULL) {
+    ++stats_->skipped_full_page;
+    return;
+  }
+  if ((stat.flags & MI_DFLY_PAGE_BELOW_THRESHOLD) == 0) {
+    ++stats_->skipped_above_threshold;
+    return;
+  }
+
+  // Object lives on a candidate page; remember its bucket so EVACUATE can
+  // skip buckets that contain no candidates at all.
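+  // (Cursor-less callers pass the default bucket_cursor of 0, so the hint
+  // set may also carry a single stray 0 entry.)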
+  cursor_hints_.insert(bucket_cursor);
+
+  const float new_score = ComputeRetentionScore(
+      stat.capacity, stat.used, static_cast<uint32_t>(stat.block_size), per_block_move_cost_bytes_);
+
+  if constexpr (kEnableTopK) {
+    auto add_entry = [&] {
+      PageAgg& agg = pages_[stat.page_address];
+      PopulateAgg(agg, stat, new_score);
+      worst_retained_.push({stat.page_address, new_score, agg.generation});
+      ++stats_->allocations_recorded;
+    };
+
+    if (auto it = pages_.find(stat.page_address); it != pages_.end()) {
+      PopulateAgg(it->second, stat, new_score);
+      worst_retained_.push({stat.page_address, new_score, it->second.generation});
+      ++stats_->allocations_recorded;
+    } else if (pages_.size() < max_retained_pages_) {
+      add_entry();
+    } else {
+      while (!worst_retained_.empty()) {
+        const HeapEntry top = worst_retained_.top();
+        auto evict_it = pages_.find(top.page_address);
+        if (evict_it == pages_.end() || evict_it->second.generation != top.generation) {
+          worst_retained_.pop();
+          continue;
+        }
+        if (new_score > top.score) {
+          worst_retained_.pop();
+          pages_.erase(evict_it);
+          ++stats_->pages_evicted_from_retained;
+          add_entry();
+        } else {
+          ++stats_->skipped_low_score;
+        }
+        break;
+      }
+    }
+
+    if (worst_retained_.size() > 2 * max_retained_pages_) {
+      RebuildHeap();
+      ++stats_->heap_rebuilds;
+    }
+  } else {
+    if (auto it = pages_.find(stat.page_address); it != pages_.end()) {
+      PopulateAgg(it->second, stat, new_score);
+    } else {
+      CHECK_LT(pages_.size(), max_retained_pages_)
+          << "PageCensus exceeded max_retained_pages_=" << max_retained_pages_
+          << " with kEnableTopK=false";
+      PageAgg& agg = pages_[stat.page_address];
+      PopulateAgg(agg, stat, new_score);
+    }
+    ++stats_->allocations_recorded;
+  }
+}
+
+void PageCensus::ObservePage(const mi_page_usage_stats_t& stat) {
+  ++stats_->allocations_seen;
+
+  if ((stat.flags & MI_DFLY_PAGE_BELOW_THRESHOLD) == 0) {
+    ++stats_->skipped_above_threshold;
+    return;
+  }
+
+  const float new_score = ComputeRetentionScore(
+      stat.capacity, stat.used, static_cast<uint32_t>(stat.block_size), per_block_move_cost_bytes_);
+
+  if constexpr (!kEnableTopK) {
+    if (!pages_.contains(stat.page_address)) {
+      CHECK_LT(pages_.size(), max_retained_pages_)
+          << "PageCensus exceeded max_retained_pages_=" << max_retained_pages_
+          << " with kEnableTopK=false";
+    }
+  }
+
+  PageAgg& agg = pages_[stat.page_address];
+  agg.page_address = stat.page_address;
+  agg.block_size = static_cast<uint32_t>(stat.block_size);
+  agg.capacity_blocks = stat.capacity;
+  agg.used_blocks = stat.used;
+  agg.flags = stat.flags;
+  // No per-object visibility on this path; assume every used block is movable.
+  // EVAC's per-page revalidation drops pages whose blocks turn out immovable.
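+  // With observed == used, ClassifyForTarget can only yield kKeep here, or
+  // kNoObservedBlocks for a page that was already empty when observed.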
+  agg.observed_movable_blocks = stat.used;
+  agg.generation = 1;
+  agg.retention_score = new_score;
+
+  ++stats_->allocations_recorded;
+
+  if constexpr (kEnableTopK) {
+    worst_retained_.push({stat.page_address, new_score, agg.generation});
+    if (worst_retained_.size() > 2 * max_retained_pages_) {
+      RebuildHeap();
+      ++stats_->heap_rebuilds;
+    }
+  }
+}
+
+void PageCensus::RebuildHeap() {
+  std::vector<HeapEntry> entries;
+  entries.reserve(pages_.size());
+  for (const auto& [addr, agg] : pages_) {
+    entries.push_back({addr, agg.retention_score, agg.generation});
+  }
+  worst_retained_ = std::priority_queue(WorseFirst{}, std::move(entries));
+}
+
+std::vector<uint64_t> PageCensus::TakeCursorHints() {
+  std::vector<uint64_t> out(cursor_hints_.begin(), cursor_hints_.end());
+  cursor_hints_.clear();
+  std::ranges::sort(out);
+  return out;
+}
+
+TargetPlan::TargetPlan(PlanStats* stats) : stats_(stats) {
+}
+
+TargetPlan::~TargetPlan() {
+  // Clear the mimalloc defrag_skip bit on every active target so the page
+  // becomes eligible for new allocations again. Tail entries are not marked
+  // (only active plan entries are), so they need no clear.
+  for (const TargetPage& tp : targets_) {
+    SetDefragSkipIfEnabled(tp.page_address, false);
+  }
+}
+
+void TargetPlan::BuildFrom(const PageCensus& census, size_t max_targets) {
+  targets_.clear();
+  address_to_index_.clear();
+  *stats_ = PlanStats{};
+
+  std::vector<TargetPage> candidates;
+  candidates.reserve(census.pages().size());
+
+  for (const auto& agg : census.pages() | std::views::values) {
+    switch (ClassifyForTarget(agg)) {
+      case TargetFilterReason::kKeep:
+        candidates.push_back(MakeTargetPage(agg));
+        break;
+      case TargetFilterReason::kNoObservedBlocks:
+        ++stats_->filtered_no_observed_blocks;
+        break;
+      case TargetFilterReason::kAlreadyEmpty:
+        ++stats_->filtered_already_empty;
+        break;
+      case TargetFilterReason::kStaleObservation:
+        ++stats_->filtered_stale;
+        break;
+      case TargetFilterReason::kHasImmovableData:
+        ++stats_->filtered_has_immovable_data;
+        stats_->filtered_immovable_reclaimable_bytes +=
+            uint64_t(agg.capacity_blocks - agg.used_blocks) * agg.block_size;
+        break;
+    }
+  }
+
+  std::ranges::sort(candidates, [](const TargetPage& a, const TargetPage& b) {
+    if (a.retention_score_at_census != b.retention_score_at_census)
+      return a.retention_score_at_census > b.retention_score_at_census;
+    const uint64_t a_reclaim = ReclaimableBytes(a);
+    const uint64_t b_reclaim = ReclaimableBytes(b);
+    if (a_reclaim != b_reclaim)
+      return a_reclaim > b_reclaim;
+    const uint64_t a_move = MoveBytes(a);
+    const uint64_t b_move = MoveBytes(b);
+    if (a_move != b_move)
+      return a_move < b_move;
+    return a.page_address < b.page_address;
+  });
+
+  max_targets = std::min(max_targets, candidates.size());
+  if (candidates.size() > max_targets) {
+    stats_->truncated_by_cap = candidates.size() - max_targets;
+    for (size_t i = max_targets; i < candidates.size(); ++i) {
+      const TargetPage& tp = candidates[i];
+      stats_->truncated_reclaimable_bytes +=
+          uint64_t(tp.capacity_blocks - tp.blocks_at_census) * tp.block_size;
+    }
+    candidates.resize(max_targets);
+  }
+
+  targets_ = std::move(candidates);
+  address_to_index_.reserve(targets_.size());
+  // Selective skip: targets_ is sorted descending by retention_score, which
+  // correlates inversely with used/capacity. Head of the vector = most
+  // fragmented. Apply skip_bit only to the top fraction so high-utilization
+  // targets stay refillable, shrinking lockout pressure on the workload.
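+  // Worked example: 200 targets at skip_percentile 0.5 gives skip_count 100,
+  // so indices [0, 100) get the skip bit and [100, 200) stay refillable.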
+  const double skip_pct = std::clamp(absl::GetFlag(FLAGS_defrag_skip_percentile), 0.0, 1.0);
+  const size_t skip_count = static_cast<size_t>(static_cast<double>(targets_.size()) * skip_pct);
+  for (size_t i = 0; i < targets_.size(); ++i) {
+    address_to_index_[targets_[i].page_address] = i;
+    if (i < skip_count) {
+      // Tell mimalloc to skip this page in alloc paths; EVACUATE moves should
+      // not refill pages we are about to drain.
+      SetDefragSkipIfEnabled(targets_[i].page_address, true);
+    }
+  }
+  stats_->targets_kept = targets_.size();
+  pending_targets_ = targets_.size();
+
+  for (const TargetPage& tp : targets_) {
+    stats_->selected_capacity_bytes_at_census += uint64_t(tp.capacity_blocks) * tp.block_size;
+    stats_->selected_used_bytes_at_census += uint64_t(tp.blocks_at_census) * tp.block_size;
+    stats_->selected_reclaimable_bytes_at_census +=
+        uint64_t(tp.capacity_blocks - tp.blocks_at_census) * tp.block_size;
+  }
+}
+
+bool TargetPlan::Contains(uintptr_t addr) const {
+  return address_to_index_.contains(addr);
+}
+
+const TargetPage* TargetPlan::Find(uintptr_t addr) const {
+  const auto it = address_to_index_.find(addr);
+  return it == address_to_index_.end() ? nullptr : &targets_[it->second];
+}
+
+TargetPage* TargetPlan::FindMut(uintptr_t addr) {
+  const auto it = address_to_index_.find(addr);
+  return it == address_to_index_.end() ? nullptr : &targets_[it->second];
+}
+
+EvacOutcome EvacDecide(TargetPlan& plan, TargetPage* target, const mi_page_usage_stats_t& stat,
+                       EvacStats& stats) {
+  if (target->revalidation_failed) {
+    ++stats.blocks_skipped_revalidation_failed;
+    stats.bytes_skipped_revalidation_failed += stat.block_size;
+    AttributeBlockSkip(stats, target->failure_reason, stat.block_size);
+    return EvacOutcome::kRevalidationFailed;
+  }
+  // Order matches the precedence in CENSUS skip-checks. The first matching
+  // flag is attributed; targets_revalidation_* sums to targets_abandoned_revalidation.
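+  // A failure recorded below is sticky for the rest of the cycle: later
+  // blocks on the same page take the revalidation_failed fast path above.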
+  if (stat.flags & MI_DFLY_HEAP_MISMATCH) {
+    RecordFirstFailure(target, stats, RevalidationFailureReason::kHeapMismatch, stat.block_size);
+    SetDefragSkipIfEnabled(target->page_address, false);
+    plan.NotifyTargetDone();
+    return EvacOutcome::kRevalidationFailed;
+  }
+  if (stat.flags & MI_DFLY_PAGE_USED_FOR_MALLOC) {
+    RecordFirstFailure(target, stats, RevalidationFailureReason::kActiveMallocPage,
+                       stat.block_size);
+    SetDefragSkipIfEnabled(target->page_address, false);
+    plan.NotifyTargetDone();
+    return EvacOutcome::kRevalidationFailed;
+  }
+  if (stat.flags & MI_DFLY_PAGE_FULL) {
+    RecordFirstFailure(target, stats, RevalidationFailureReason::kFullPage, stat.block_size);
+    SetDefragSkipIfEnabled(target->page_address, false);
+    plan.NotifyTargetDone();
+    return EvacOutcome::kRevalidationFailed;
+  }
+  if ((stat.flags & MI_DFLY_PAGE_BELOW_THRESHOLD) == 0) {
+    RecordFirstFailure(target, stats, RevalidationFailureReason::kAboveThreshold, stat.block_size);
+    SetDefragSkipIfEnabled(target->page_address, false);
+    plan.NotifyTargetDone();
+    return EvacOutcome::kRevalidationFailed;
+  }
+  if (target->blocks_evacuated >= target->blocks_at_census) {
+    ++stats.blocks_skipped_target_done;
+    stats.bytes_skipped_target_done += stat.block_size;
+    return EvacOutcome::kTargetAlreadyDone;
+  }
+  ++target->blocks_evacuated;
+  ++stats.blocks_move_committed;
+  stats.bytes_move_committed += stat.block_size;
+  if (target->blocks_evacuated == target->blocks_at_census) {
+    // First-time completion: drop the skip bit so the now-drained page can be
+    // reused by mi_malloc immediately if needed (no need to wait for plan
+    // teardown).
+    SetDefragSkipIfEnabled(target->page_address, false);
+    plan.NotifyTargetDone();
+    ++stats.targets_completed_during_evac;
+  }
+  return EvacOutcome::kCommitMove;
+}
+
+EvacOutcome EvacDecide(TargetPlan& plan, const mi_page_usage_stats_t& stat, EvacStats& stats) {
+  TargetPage* target = plan.FindMut(stat.page_address);
+  if (target == nullptr) {
+    ++stats.blocks_skipped_not_target;
+    return EvacOutcome::kNotATarget;
+  }
+  return EvacDecide(plan, target, stat, stats);
+}
+
+CensusTaker::CensusTaker(PageCensus* census, float threshold, CycleQuota quota)
+    : PageUsage(CollectPageStats::NO, threshold, quota), census_(census), threshold_(threshold) {
+}
+
+bool CensusTaker::IsPageForObjectUnderUtilized(void* object) {
+  mi_page_usage_stats_t stat = mi_heap_page_is_underutilized(
+      static_cast<mi_heap_t*>(zmalloc_heap), object, threshold_, /*collect_stats=*/true);
+  census_->Observe(stat, current_cursor_);
+  return false;
+}
+
+bool CensusTaker::IsPageForObjectUnderUtilized(mi_heap_t* heap, void* object) {
+  mi_page_usage_stats_t stat =
+      mi_heap_page_is_underutilized(heap, object, threshold_, /*collect_stats=*/true);
+  census_->Observe(stat, current_cursor_);
+  return false;
+}
+
+bool CensusTaker::ShouldDefragKeys() const {
+  return ::absl::GetFlag(FLAGS_defrag_keys);
+}
+
+Evacuator::Evacuator(TargetPlan* plan, float threshold, EvacStats* evac_stats, CycleQuota quota)
+    : PageUsage(CollectPageStats::NO, threshold, quota),
+      plan_(plan),
+      threshold_(threshold),
+      evac_stats_(evac_stats) {
+}
+
+bool Evacuator::ShouldDefragKeys() const {
+  return ::absl::GetFlag(FLAGS_defrag_keys);
+}
+
+bool Evacuator::IsPageForObjectUnderUtilized(void* object) {
+  const uintptr_t addr = reinterpret_cast<uintptr_t>(_mi_ptr_page(object));
+  TargetPage* target = plan_->FindMut(addr);
+  if (target == nullptr) {
+    ++evac_stats_->blocks_skipped_not_target;
+    return false;
+  }
+  const mi_page_usage_stats_t stat = mi_heap_page_is_underutilized(
+      static_cast<mi_heap_t*>(zmalloc_heap), object, threshold_, /*collect_stats=*/true);
+  return EvacDecide(*plan_, target, stat, *evac_stats_) == EvacOutcome::kCommitMove;
+}
+
+bool Evacuator::IsPageForObjectUnderUtilized(mi_heap_t* heap, void* object) {
+  const uintptr_t addr = reinterpret_cast<uintptr_t>(_mi_ptr_page(object));
+  TargetPage* target = plan_->FindMut(addr);
+  if (target == nullptr) {
+    ++evac_stats_->blocks_skipped_not_target;
+    return false;
+  }
+  const mi_page_usage_stats_t stat =
+      mi_heap_page_is_underutilized(heap, object, threshold_, /*collect_stats=*/true);
+  return EvacDecide(*plan_, target, stat, *evac_stats_) == EvacOutcome::kCommitMove;
+}
+
+} // namespace dfly
diff --git a/src/core/page_usage/page_usage_visitors.h b/src/core/page_usage/page_usage_visitors.h
new file mode 100644
index 000000000000..01da85fe6e56
--- /dev/null
+++ b/src/core/page_usage/page_usage_visitors.h
@@ -0,0 +1,338 @@
+// Copyright 2026, DragonflyDB authors. All rights reserved.
+// See LICENSE for licensing terms.
+//
+
+#pragma once
+
+#include <absl/container/flat_hash_map.h>
+#include <absl/container/flat_hash_set.h>
+
+#include <cstdint>
+#include <limits>
+#include <queue>
+#include <vector>
+
+#include "core/page_usage/page_usage_stats.h"
+
+extern "C" {
+#include "redis/zmalloc.h"
+mi_page_usage_stats_t mi_heap_page_is_underutilized(mi_heap_t* heap, void* p, float ratio,
+                                                    bool collect_stats);
+}
+
+namespace dfly {
+
+struct PageAgg {
+  uintptr_t page_address = 0;
+
+  uint32_t block_size = 0;
+  float retention_score = 0.0f;
+
+  uint16_t capacity_blocks = 0;
+  uint16_t used_blocks = 0;
+  uint16_t observed_movable_blocks = 0;
+  uint16_t generation = 0;
+
+  uint8_t flags = 0;
+};
+
+struct CensusStats {
+  uint64_t allocations_seen = 0;
+  uint64_t allocations_recorded = 0;
+  uint64_t skipped_above_threshold = 0;
+  uint64_t skipped_full_page = 0;
+  uint64_t skipped_wrong_heap = 0;
+  uint64_t skipped_active_malloc_page = 0;
+  uint64_t skipped_low_score = 0;
+
+  // Top-K bookkeeping
+  uint64_t pages_evicted_from_retained = 0;
+  uint64_t heap_rebuilds = 0;
+
+  void Merge(const CensusStats& other);
+};
+
+class PageCensus {
+ public:
+  static constexpr size_t kDefaultMaxRetainedPages = 300'000;
+
+  // When false, PageCensus skips the top-k heap entirely and just inserts every
+  // observed page into pages_. Cheap (no priority_queue work per Observe), but
+  // loses the cap-and-evict-worst guard: the map hard-crashes on a new entry
+  // once it reaches max_retained_pages_. Flip to true to restore heap-based
+  // eviction so workloads exceeding the cap stay bounded.
+  static constexpr bool kEnableTopK = false;
+
+  explicit PageCensus(CensusStats* stats, size_t max_retained_pages = kDefaultMaxRetainedPages,
+                      uint64_t per_block_move_cost_bytes = 256);
+
+  // bucket_cursor is the DashTable cursor of the bucket the observed object
+  // currently lives in. Recorded so EVACUATE can restrict its walk to buckets
+  // known to contain at least one candidate object. Pass 0 if unknown
+  // (callers outside the hot defrag path may not have a cursor).
+  void Observe(const mi_page_usage_stats_t& stat, uint64_t bucket_cursor = 0);
+
+  // Page-level observe used by the underutil-set fast path: caller has already
+  // verified the page is in-heap, non-full, and below threshold. We don't have
+  // per-object visibility, so we assume every used block is movable and let
+  // EVACUATE's per-page revalidation correct any wrongly-classified entries.
+  // No bucket cursor is recorded (cursor_hints stays empty in this path).
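+  // (The expected feeder is the per-thread underutil page set maintained in
+  // server/defrag.cc, though any caller holding page-level stats can use it.)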
+  void ObservePage(const mi_page_usage_stats_t& stat);
+
+  const CensusStats& stats() const {
+    return *stats_;
+  }
+
+  const absl::flat_hash_map<uintptr_t, PageAgg>& pages() const {
+    return pages_;
+  }
+
+  const absl::flat_hash_set<uint64_t>& cursor_hints() const {
+    return cursor_hints_;
+  }
+
+  // Move-out accessor: lets SELECT_TARGETS hand the hint set off to
+  // DefragTaskState before the census itself is released. Returns a sorted
+  // vector so the EVACUATE walker can iterate deterministically and resume
+  // from a saved cursor index across DoDefrag invocations.
+  std::vector<uint64_t> TakeCursorHints();
+
+ private:
+  void RebuildHeap();
+
+  struct HeapEntry {
+    uintptr_t page_address;
+    float score;
+    uint16_t generation;
+  };
+
+  struct WorseFirst {
+    bool operator()(const HeapEntry& a, const HeapEntry& b) const {
+      return a.score > b.score;
+    }
+  };
+
+  absl::flat_hash_map<uintptr_t, PageAgg> pages_;
+  std::priority_queue<HeapEntry, std::vector<HeapEntry>, WorseFirst> worst_retained_;
+  // Buckets that observed at least one object on a candidate (under-threshold)
+  // page. Consumed by EVACUATE to skip buckets with no targets.
+  absl::flat_hash_set<uint64_t> cursor_hints_;
+  CensusStats* stats_;
+  size_t max_retained_pages_;
+  uint64_t per_block_move_cost_bytes_;
+};
+
+enum class TargetStatus : uint8_t {
+  kPending,
+  kSuccess,
+  kPartial,
+  kFailed,
+};
+
+enum class TargetFilterReason : uint8_t {
+  kKeep,
+  kNoObservedBlocks,   // observed_movable_blocks == 0 (defensive)
+  kStaleObservation,   // observed_movable_blocks > used_blocks
+  kHasImmovableData,   // observed_movable_blocks < used_blocks (non-movables pin the page)
+  kAlreadyEmpty,       // used_blocks == 0
+};
+
+enum class RevalidationFailureReason : uint8_t {
+  kNone = 0,
+  kHeapMismatch,
+  kActiveMallocPage,
+  kFullPage,
+  kAboveThreshold,
+};
+
+struct TargetPage {
+  // Snapshot from census (immutable after BuildFrom).
+  uintptr_t page_address = 0;
+  uint32_t block_size = 0;
+  uint16_t capacity_blocks = 0;
+  uint16_t blocks_at_census = 0;  // used_blocks at census time
+  float retention_score_at_census = 0.0f;
+
+  // Mutated during EVACUATE.
+  uint16_t blocks_evacuated = 0;
+  uint16_t evacuation_failures = 0;
+  TargetStatus status = TargetStatus::kPending;
+  bool revalidation_failed = false;
+  // Set on the first revalidation failure; consulted on sticky-skip branches
+  // to attribute subsequent block/byte skips to the originating reason.
+  RevalidationFailureReason failure_reason = RevalidationFailureReason::kNone;
+};
+
+struct PlanStats {
+  uint64_t targets_kept = 0;
+  uint64_t filtered_no_observed_blocks = 0;
+  uint64_t filtered_stale = 0;
+  uint64_t filtered_has_immovable_data = 0;
+  uint64_t filtered_already_empty = 0;
+  uint64_t truncated_by_cap = 0;  // pages dropped because over max_targets
+
+  uint64_t selected_capacity_bytes_at_census = 0;
+  uint64_t selected_used_bytes_at_census = 0;
+  uint64_t selected_reclaimable_bytes_at_census = 0;
+
+  uint64_t truncated_reclaimable_bytes = 0;
+  uint64_t filtered_immovable_reclaimable_bytes = 0;
+
+  void Merge(const PlanStats& other);
+};
+
+enum class EvacOutcome : uint8_t {
+  kNotATarget,          // page is not in the plan
+  kTargetAlreadyDone,   // target's blocks_evacuated already at blocks_at_census
+  kRevalidationFailed,  // page state shifted (full / above threshold / heap mismatch / etc.)
+  kCommitMove,          // caller should perform the move; counter pre-bumped
+};
+
+struct EvacStats {
+  uint64_t blocks_skipped_not_target = 0;
+  uint64_t blocks_skipped_target_done = 0;
+  uint64_t blocks_skipped_revalidation_failed = 0;
+
+  uint64_t blocks_move_committed = 0;
+
+  uint64_t bytes_skipped_target_done = 0;
+  uint64_t bytes_skipped_revalidation_failed = 0;
+  uint64_t bytes_move_committed = 0;
+
+  uint64_t targets_revalidation_heap_mismatch = 0;
+  uint64_t targets_revalidation_active_malloc_page = 0;
+  uint64_t targets_revalidation_full_page = 0;
+  uint64_t targets_revalidation_above_threshold = 0;
+
+  // Block/byte breakdown of revalidation skips by originating reason. Sums to
+  // blocks_skipped_revalidation_failed / bytes_skipped_revalidation_failed.
+  uint64_t blocks_revalidation_heap_mismatch = 0;
+  uint64_t blocks_revalidation_active_malloc_page = 0;
+  uint64_t blocks_revalidation_full_page = 0;
+  uint64_t blocks_revalidation_above_threshold = 0;
+  uint64_t bytes_revalidation_heap_mismatch = 0;
+  uint64_t bytes_revalidation_active_malloc_page = 0;
+  uint64_t bytes_revalidation_full_page = 0;
+  uint64_t bytes_revalidation_above_threshold = 0;
+
+  uint64_t targets_abandoned_revalidation = 0;
+  uint64_t targets_completed_during_evac = 0;
+
+  void Merge(const EvacStats& other);
+};
+
+class TargetPlan {
+ public:
+  explicit TargetPlan(PlanStats* stats);
+  ~TargetPlan();
+
+  // Non-copyable, non-movable: destructor clears mimalloc defrag_skip bits on
+  // active targets, so move-from would double-clear (harmless) but copies
+  // would set bits this object doesn't own.
+  TargetPlan(const TargetPlan&) = delete;
+  TargetPlan& operator=(const TargetPlan&) = delete;
+  TargetPlan(TargetPlan&&) = delete;
+  TargetPlan& operator=(TargetPlan&&) = delete;
+
+  // Default `max_targets` is effectively unlimited; selective skip-bit (top
+  // skip_pct fraction) bounds lockout pressure. Tests pass explicit small
+  // values to exercise the cap path.
+  void BuildFrom(const PageCensus& census,
+                 size_t max_targets = std::numeric_limits<size_t>::max());
+
+  const std::vector<TargetPage>& targets() const {
+    return targets_;
+  }
+
+  const PlanStats& stats() const {
+    return *stats_;
+  }
+
+  bool Contains(uintptr_t addr) const;
+
+  const TargetPage* Find(uintptr_t addr) const;
+  TargetPage* FindMut(uintptr_t addr);
+
+  size_t size() const {
+    return targets_.size();
+  }
+
+  bool empty() const {
+    return targets_.empty();
+  }
+
+  bool AllTargetsDone() const {
+    return pending_targets_ == 0;
+  }
+
+  void NotifyTargetDone() {
+    --pending_targets_;
+  }
+
+ private:
+  std::vector<TargetPage> targets_;  // sorted by retention_score desc
+  absl::flat_hash_map<uintptr_t, size_t> address_to_index_;
+  PlanStats* stats_;
+  size_t pending_targets_ = 0;
+};
+
+// Hot-path variant: caller has already resolved the target (e.g. via
+// plan.FindMut). Must be non-null. Skips the redundant lookup that the
+// 3-arg variant otherwise performs.
+EvacOutcome EvacDecide(TargetPlan& plan, TargetPage* target, const mi_page_usage_stats_t& stat,
+                       EvacStats& stats);
+
+// Convenience variant: looks up the target from stat.page_address. Returns
+// kNotATarget on miss. Kept for test ergonomics; production callers should
+// prefer the 4-arg form so they can fold the lookup with their own
+// fast-path checks.
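+// Fold sketch, mirroring Evacuator::IsPageForObjectUnderUtilized:
+//   TargetPage* t = plan.FindMut(addr);
+//   if (t == nullptr) { ++stats.blocks_skipped_not_target; return false; }
+//   const mi_page_usage_stats_t stat = ...;  // per-page revalidation stats
+//   return EvacDecide(plan, t, stat, stats) == EvacOutcome::kCommitMove;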
+EvacOutcome EvacDecide(TargetPlan& plan, const mi_page_usage_stats_t& stat, EvacStats& stats);
+
+class CensusTaker final : public PageUsage {
+ public:
+  CensusTaker(PageCensus* census, float threshold, CycleQuota quota = CycleQuota::Unlimited());
+
+  // Override to call into mimalloc and feed the resulting stats into the
+  // census; CensusTaker never reallocates, so the return is always false.
+  bool IsPageForObjectUnderUtilized(void* object) override;
+  bool IsPageForObjectUnderUtilized(mi_heap_t* heap, void* object) override;
+
+  bool IsReadOnly() const final {
+    return true;
+  }
+
+  bool ShouldDefragKeys() const final;
+
+  void SetCurrentBucketCursor(uint64_t cursor) final {
+    current_cursor_ = cursor;
+  }
+
+ private:
+  PageCensus* census_;
+  float threshold_;
+  uint64_t current_cursor_ = 0;
+};
+
+class Evacuator final : public PageUsage {
+ public:
+  Evacuator(TargetPlan* plan, float threshold, EvacStats* evac_stats,
+            CycleQuota quota = CycleQuota::Unlimited());
+
+  // Override to filter through the plan: a per-object hashmap lookup short-
+  // circuits the comparatively expensive mi_heap_page_is_underutilized call
+  // when the object isn't on a target page. On a hit, performs the call and
+  // then EvacDecide.
+  bool IsPageForObjectUnderUtilized(void* object) override;
+  bool IsPageForObjectUnderUtilized(mi_heap_t* heap, void* object) override;
+
+  bool ShouldDefragKeys() const final;
+
+  bool ShouldStop() const final {
+    return plan_->AllTargetsDone();
+  }
+
+ private:
+  TargetPlan* plan_;
+  float threshold_;
+  EvacStats* evac_stats_;
+};
+
+} // namespace dfly
diff --git a/src/core/page_usage_stats_test.cc b/src/core/page_usage_stats_test.cc
index fec2a7287b08..10c60cd0ab43 100644
--- a/src/core/page_usage_stats_test.cc
+++ b/src/core/page_usage_stats_test.cc
@@ -49,7 +49,7 @@ std::string GenerateTestJSON(size_t num_objects) {
 }
 
 // Helper to defragment only if a randomly generated value is less than preset probability. For
-// benchmarking realistic situations, where some nodes are fragmented and others are not
+// benchmarking realistic situations, where some nodes are fragmented and others are not.
 class SelectiveDefragment : public PageUsage {
  public:
  explicit SelectiveDefragment(const double fragmentation_probability)
diff --git a/src/external_libs.cmake b/src/external_libs.cmake
index e3e270dc5020..d879ac9f0efe 100644
--- a/src/external_libs.cmake
+++ b/src/external_libs.cmake
@@ -77,6 +77,8 @@ ExternalProject_Add(mimalloc2_project
   COMMAND patch -p1 -d ${THIRD_PARTY_DIR}/mimalloc2/ -i ${MIMALLOC_PATCH_DIR}/2_return_stat.patch
   COMMAND patch -p1 -d ${THIRD_PARTY_DIR}/mimalloc2/ -i ${MIMALLOC_PATCH_DIR}/3_track_full_size.patch
   COMMAND patch -p1 -d ${THIRD_PARTY_DIR}/mimalloc2/ -i ${MIMALLOC_PATCH_DIR}/4_fix_heap_collect.patch
+  COMMAND patch -p1 -d ${THIRD_PARTY_DIR}/mimalloc2/ -i ${MIMALLOC_PATCH_DIR}/5_skip_defrag_targets.patch
+  COMMAND patch -p1 -d ${THIRD_PARTY_DIR}/mimalloc2/ -i ${MIMALLOC_PATCH_DIR}/6_dfly_underutil_callback.patch
 
   BUILD_COMMAND make mimalloc-static
   INSTALL_COMMAND make install
diff --git a/src/redis/zmalloc.h b/src/redis/zmalloc.h
index eb53e169a9c9..ef7fee9bffad 100644
--- a/src/redis/zmalloc.h
+++ b/src/redis/zmalloc.h
@@ -148,6 +148,11 @@ char* zstrdup(const char* s);
 void init_zmalloc_threadlocal(void* heap);
 
 extern __thread ssize_t zmalloc_used_memory_tl;
+// The heap zmalloc operates on. Exposed so C++ hot-path callers can avoid
+// the zmalloc_page_is_underutilized wrapper indirection (~3.7ns/call) and
+// invoke mi_heap_page_is_underutilized directly.
Treat as opaque (cast to +// mi_heap_t*). +extern __thread void* zmalloc_heap; #undef __zm_str #undef __xstr diff --git a/src/redis/zmalloc_mi.c b/src/redis/zmalloc_mi.c index 8209b9d7d35e..5b22cdcde2a1 100644 --- a/src/redis/zmalloc_mi.c +++ b/src/redis/zmalloc_mi.c @@ -13,7 +13,10 @@ #include "zmalloc.h" __thread ssize_t zmalloc_used_memory_tl = 0; -__thread mi_heap_t* zmalloc_heap = NULL; +// Linkage matches the extern declaration in zmalloc.h, which uses void* to +// avoid pulling mimalloc headers into every C++ TU. The actual stored +// pointer is always a mi_heap_t*. +__thread void* zmalloc_heap = NULL; mi_page_usage_stats_t mi_heap_page_is_underutilized(mi_heap_t* heap, void* p, float ratio, bool collect_stats); @@ -175,12 +178,13 @@ int zmalloc_get_allocator_wasted_blocks(float ratio, size_t* allocated, size_t* // Implemented based on this mimalloc code: // https://github.com/microsoft/mimalloc/blob/main/src/heap.c#L27 int zmalloc_get_allocator_fragmentation_step(float ratio, struct fragmentation_info* info) { - if (zmalloc_heap->page_count == 0 || info->bin >= MI_BIN_FULL) { + mi_heap_t* heap = (mi_heap_t*)zmalloc_heap; + if (heap->page_count == 0 || info->bin >= MI_BIN_FULL) { // We avoid iterating over full pages since they are fully utilized. return 0; } - mi_page_queue_t* pq = &zmalloc_heap->pages[info->bin]; + mi_page_queue_t* pq = &heap->pages[info->bin]; const mi_page_t* page = pq->first; while (page != NULL) { const mi_page_t* next = page->next; @@ -205,11 +209,11 @@ int zmalloc_get_allocator_fragmentation_step(float ratio, struct fragmentation_i info->committed_golden = info->committed; // Add total comitted size of MI_BIN_FULL that we do not traverse // as its tracked by zmalloc_heap->full_page_size variable. - info->committed += zmalloc_heap->full_page_size; + info->committed += heap->full_page_size; // TODO: it's a test code that makes sure `full_page_size` is correct. // Remove it once we are confident with the implementation. - mi_page_queue_t* pq = &zmalloc_heap->pages[MI_BIN_FULL]; + mi_page_queue_t* pq = &heap->pages[MI_BIN_FULL]; const mi_page_t* page = pq->first; while (page != NULL) { info->committed_golden += page->capacity * page->block_size; diff --git a/src/server/CMakeLists.txt b/src/server/CMakeLists.txt index f7bbd89d80b5..a05899110f02 100644 --- a/src/server/CMakeLists.txt +++ b/src/server/CMakeLists.txt @@ -82,6 +82,7 @@ endif() # Optionally include tiered_storage which interfaces with tiering_module add_library(dragonfly_lib + defrag.cc engine_shard.cc engine_shard_set.cc config_registry.cc conn_context.cc debugcmd.cc dflycmd.cc error.cc family_utils.cc string_stats.cc ${DF_SEARCH_SRCS} @@ -166,6 +167,7 @@ helio_cxx_test(cluster/cluster_family_test dfly_test_lib LABELS DFLY) helio_cxx_test(acl/acl_family_test dfly_test_lib LABELS DFLY) helio_cxx_test(engine_shard_set_test dfly_test_lib LABELS DFLY) helio_cxx_test(serializer_base_test dfly_test_lib LABELS DFLY) +helio_cxx_test(defrag_test dfly_test_lib LABELS DFLY) add_dependencies(check_dfly dragonfly_test json_family_test list_family_test generic_family_test memcache_parser_test rdb_test journal_test diff --git a/src/server/defrag.cc b/src/server/defrag.cc new file mode 100644 index 000000000000..f17b8f5e41af --- /dev/null +++ b/src/server/defrag.cc @@ -0,0 +1,671 @@ +// Copyright 2026, DragonflyDB authors. All rights reserved. +// See LICENSE for licensing terms. 
+//
+
+#include "server/defrag.h"
+
+#include <absl/container/flat_hash_set.h>
+#include <absl/strings/str_cat.h>
+#include <absl/strings/str_format.h>
+#include <absl/time/clock.h>
+
+#include <mutex>
+#include <string>
+#include <vector>
+
+#include "base/flags.h"
+#include "base/logging.h"
+#include "core/page_usage/page_usage_visitors.h"
+
+ABSL_FLAG(uint64_t, defrag_min_plan_reclaimable_bytes, 64u << 20,
+          "Minimum bytes-reclaimable threshold the SELECT_TARGETS plan must hit to "
+          "justify running EVACUATE. Below this, the cycle is skipped (PLAN_SKIPPED) "
+          "and we return to IDLE without walking the dashtable. The underutil set is "
+          "left intact so the next cycle picks it up if churn refills the pages above "
+          "threshold (or new fragmentation appears). Default 64 MiB.");
+
+extern "C" {
+#include "redis/zmalloc.h"
+// Dragonfly mimalloc patch: per-process callback fired by mi_free_block_local
+// when a page's used count crosses below the configured underutil threshold.
+typedef void (*mi_dfly_underutil_callback_t)(uintptr_t page_addr);
+void mi_dfly_set_underutil_callback(mi_dfly_underutil_callback_t cb);
+void mi_dfly_set_underutil_threshold_pct(uint8_t pct);
+}
+
+namespace dfly {
+
+namespace {
+
+#define DEFRAG_STEP_LOG LOG(INFO)
+
+uint64_t NowNs() {
+  return absl::GetCurrentTimeNanos();
+}
+
+double NsToMs(uint64_t ns) {
+  return static_cast<double>(ns) / 1e6;
+}
+
+std::string FormatMiB(uint64_t bytes) {
+  return absl::StrFormat("%.2fMiB", static_cast<double>(bytes) / (1024.0 * 1024.0));
+}
+
+} // namespace
+
+namespace defrag_underutil {
+
+namespace {
+
+thread_local absl::flat_hash_set<uintptr_t> tl_underutil_pages;
+
+void OnPageUnderutil(uintptr_t page_addr) {
+  tl_underutil_pages.insert(page_addr);
+}
+
+} // namespace
+
+void InitOnce() {
+  static std::once_flag once;
+  std::call_once(once, []() {
+    mi_dfly_set_underutil_callback(&OnPageUnderutil);
+    LOG(INFO) << "defrag[underutil_cb] registered with mimalloc";
+  });
+}
+
+void SetThresholdPct(uint8_t pct) {
+  mi_dfly_set_underutil_threshold_pct(pct);
+}
+
+size_t Size() {
+  return tl_underutil_pages.size();
+}
+
+std::vector<uintptr_t> Snapshot() {
+  return {tl_underutil_pages.begin(), tl_underutil_pages.end()};
+}
+
+void Remove(uintptr_t page_addr) {
+  tl_underutil_pages.erase(page_addr);
+}
+
+void Clear() {
+  tl_underutil_pages.clear();
+}
+
+bool IsPageMaybeUnderutil(uintptr_t page_addr) {
+  if (tl_underutil_pages.empty()) {
+    return true;  // bootstrap: no info, fall through to original mimalloc check
+  }
+  return tl_underutil_pages.contains(page_addr);
+}
+
+} // namespace defrag_underutil
+
+void DefragTaskState::UpdateScanState(uint64_t cursor_val) {
+  cursor = cursor_val;
+  if (cursor == 0u) {
+    ++dbid;
+  }
+}
+
+void DefragTaskState::ResetScanState() {
+  dbid = 0;
+  cursor = 0;
+}
+
+void DefragTaskState::FinishCycle() {
+  phase = DefragPhase::IDLE;
+  census.reset();
+  plan.reset();
+  cursor_hints.clear();
+  hint_cursor_idx = 0;
+  ResetScanState();
+}
+
+void CycleProgress::Merge(const CycleProgress& other) {
+  targets_complete += other.targets_complete;
+  targets_partial += other.targets_partial;
+  targets_no_progress += other.targets_no_progress;
+  targets_abandoned += other.targets_abandoned;
+  blocks_total_at_census += other.blocks_total_at_census;
+  blocks_evacuated += other.blocks_evacuated;
+  blocks_remaining += other.blocks_remaining;
+  bytes_total_at_census += other.bytes_total_at_census;
+  bytes_evacuated += other.bytes_evacuated;
+  bytes_remaining += other.bytes_remaining;
+  bytes_freed += other.bytes_freed;
+}
+
+void DefragCycleStats::Merge(const DefragCycleStats& other) {
+  census.Merge(other.census);
+  plan.Merge(other.plan);
+  evac.Merge(other.evac);
+  verify.Merge(other.verify);
+  census_db_objects_scanned += other.census_db_objects_scanned;
+  evac_db_objects_scanned += other.evac_db_objects_scanned;
+  evac_reallocations += other.evac_reallocations;
+  evac_key_reallocations += other.evac_key_reallocations;
+  evac_val_reallocations += other.evac_val_reallocations;
+  evac_bytes_moved += other.evac_bytes_moved;
+  census_retained_pages += other.census_retained_pages;
+  plan_target_pages += other.plan_target_pages;
+  census_potential_reclaim_bytes += other.census_potential_reclaim_bytes;
+  census_movable_bytes_observed += other.census_movable_bytes_observed;
+  // cycle_finished is per-shard semantics; not meaningfully mergeable.
+  // On the merged report, callers should use phase_per_shard to answer
+  // "is every shard done?" — cycle_finished stays at its default (false).
+}
+
+DefragMergedReport DefragMergedReport::Merge(std::vector<DefragShardReport>&& shards) {
+  DefragMergedReport result;
+  result.shard_summaries.reserve(shards.size());
+  std::vector<CollectedPageStats> page_usage_list;
+  page_usage_list.reserve(shards.size());
+
+  for (DefragShardReport& shard : shards) {
+    result.shard_summaries.push_back(shard.summary);
+    result.cycle_stats.Merge(shard.cycle_stats);
+    page_usage_list.push_back(std::move(shard.page_usage_stats));
+  }
+  // CollectedPageStats::Merge takes a threshold; carry the first-shard value
+  // forward (all shards are configured with the same threshold).
+  const float threshold =
+      page_usage_list.empty() ? 0.0f : static_cast<float>(page_usage_list.front().threshold);
+  result.page_usage_stats = CollectedPageStats::Merge(std::move(page_usage_list), threshold);
+  return result;
+}
+
+const char* PhaseName(DefragPhase phase) {
+  switch (phase) {
+    case DefragPhase::IDLE:
+      return "IDLE";
+    case DefragPhase::CENSUS:
+      return "CENSUS";
+    case DefragPhase::SELECT_TARGETS:
+      return "SELECT_TARGETS";
+    case DefragPhase::EVACUATE:
+      return "EVACUATE";
+    case DefragPhase::VERIFY:
+      return "VERIFY";
+  }
+  return "UNKNOWN";
+}
+
+std::string DefragMergedReport::ToString() const {
+  std::string out;
+  const auto& cs = cycle_stats;
+
+  absl::StrAppend(&out, "Per-shard summary:\n");
+  absl::StrAppend(&out, "  shard | phase_start -> phase_end | duration_us | exit_reason\n");
+  for (size_t i = 0; i < shard_summaries.size(); ++i) {
+    const DefragShardSummary& s = shard_summaries[i];
+    const char* exit_reason = s.finished_all_dbs ? "finished" : (s.quota_depleted ?
"quota" : "-"); + absl::StrAppend( + &out, " ", absl::Dec(i, absl::PadSpec::kSpacePad5), " | ", + absl::StrFormat("%-14s -> %-14s", PhaseName(s.phase_start), PhaseName(s.phase_end)), " | ", + absl::Dec(s.duration_us, absl::PadSpec::kSpacePad11), " | ", exit_reason, "\n"); + } + + absl::StrAppend(&out, "\n[CENSUS]\n"); + absl::StrAppend(&out, "Allocations seen: ", cs.census.allocations_seen, "\n"); + absl::StrAppend(&out, "Allocations recorded: ", cs.census.allocations_recorded, "\n"); + absl::StrAppend(&out, "Skipped (above threshold): ", cs.census.skipped_above_threshold, "\n"); + absl::StrAppend(&out, "Skipped (full page): ", cs.census.skipped_full_page, "\n"); + absl::StrAppend(&out, "Skipped (wrong heap): ", cs.census.skipped_wrong_heap, "\n"); + absl::StrAppend(&out, "Skipped (active malloc page): ", cs.census.skipped_active_malloc_page, + "\n"); + absl::StrAppend(&out, "Skipped (low score): ", cs.census.skipped_low_score, "\n"); + absl::StrAppend(&out, "Pages evicted from retained: ", cs.census.pages_evicted_from_retained, + "\n"); + absl::StrAppend(&out, "Heap rebuilds: ", cs.census.heap_rebuilds, "\n"); + absl::StrAppend(&out, "DB objects scanned: ", cs.census_db_objects_scanned, "\n"); + absl::StrAppend(&out, "Retained pages (total): ", cs.census_retained_pages, "\n"); + absl::StrAppend( + &out, "Potential reclaimable bytes (observed): ", cs.census_potential_reclaim_bytes, "\n"); + absl::StrAppend(&out, "Movable bytes (observed): ", cs.census_movable_bytes_observed, "\n"); + + absl::StrAppend(&out, "\n[SELECT]\n"); + absl::StrAppend(&out, "Targets kept: ", cs.plan.targets_kept, "\n"); + absl::StrAppend(&out, "Filtered (no observed blocks): ", cs.plan.filtered_no_observed_blocks, + "\n"); + absl::StrAppend(&out, "Filtered (stale): ", cs.plan.filtered_stale, "\n"); + absl::StrAppend(&out, "Filtered (has immovable data): ", cs.plan.filtered_has_immovable_data, + "\n"); + absl::StrAppend(&out, "Filtered (already empty): ", cs.plan.filtered_already_empty, "\n"); + absl::StrAppend(&out, "Truncated by cap: ", cs.plan.truncated_by_cap, "\n"); + absl::StrAppend(&out, "Target pages (total): ", cs.plan_target_pages, "\n"); + absl::StrAppend(&out, "Selected capacity bytes (at census): ", + cs.plan.selected_capacity_bytes_at_census, "\n"); + absl::StrAppend(&out, "Selected used bytes (at census): ", cs.plan.selected_used_bytes_at_census, + "\n"); + absl::StrAppend(&out, "Selected reclaimable bytes (at census): ", + cs.plan.selected_reclaimable_bytes_at_census, "\n"); + absl::StrAppend(&out, "Truncated reclaimable bytes: ", cs.plan.truncated_reclaimable_bytes, "\n"); + absl::StrAppend(&out, "Filtered immovable reclaimable bytes: ", + cs.plan.filtered_immovable_reclaimable_bytes, "\n"); + + absl::StrAppend(&out, "\n[EVACUATE]\n"); + absl::StrAppend(&out, "DB objects scanned: ", cs.evac_db_objects_scanned, "\n"); + absl::StrAppend(&out, "Reallocations: ", cs.evac_reallocations, "\n"); + absl::StrAppend(&out, "Keys reallocated: ", cs.evac_key_reallocations, "\n"); + absl::StrAppend(&out, "Values reallocated: ", cs.evac_val_reallocations, "\n"); + absl::StrAppend(&out, "Bytes moved: ", FormatMiB(cs.evac_bytes_moved), "\n"); + absl::StrAppend(&out, "Blocks moved (committed): ", cs.evac.blocks_move_committed, "\n"); + absl::StrAppend(&out, "Bytes moved (committed): ", cs.evac.bytes_move_committed, "\n"); + absl::StrAppend(&out, "Blocks skipped (not target): ", cs.evac.blocks_skipped_not_target, "\n"); + absl::StrAppend(&out, "Blocks skipped (target done): ", cs.evac.blocks_skipped_target_done, "\n"); 
+ absl::StrAppend(&out, "Blocks skipped (revalidation failed): ", + cs.evac.blocks_skipped_revalidation_failed, "\n"); + absl::StrAppend(&out, "Bytes skipped (target done): ", cs.evac.bytes_skipped_target_done, "\n"); + absl::StrAppend(&out, "Bytes skipped (revalidation failed): ", + cs.evac.bytes_skipped_revalidation_failed, "\n"); + absl::StrAppend(&out, "Targets revalidation (heap mismatch): ", + cs.evac.targets_revalidation_heap_mismatch, "\n"); + absl::StrAppend(&out, "Targets revalidation (active malloc page): ", + cs.evac.targets_revalidation_active_malloc_page, "\n"); + absl::StrAppend( + &out, "Targets revalidation (full page): ", cs.evac.targets_revalidation_full_page, "\n"); + absl::StrAppend(&out, "Targets revalidation (above threshold): ", + cs.evac.targets_revalidation_above_threshold, "\n"); + absl::StrAppend(&out, "Blocks revalidation (heap mismatch): ", + cs.evac.blocks_revalidation_heap_mismatch, "\n"); + absl::StrAppend(&out, "Blocks revalidation (active malloc page): ", + cs.evac.blocks_revalidation_active_malloc_page, "\n"); + absl::StrAppend(&out, "Blocks revalidation (full page): ", cs.evac.blocks_revalidation_full_page, + "\n"); + absl::StrAppend(&out, "Blocks revalidation (above threshold): ", + cs.evac.blocks_revalidation_above_threshold, "\n"); + absl::StrAppend( + &out, "Bytes revalidation (heap mismatch): ", cs.evac.bytes_revalidation_heap_mismatch, "\n"); + absl::StrAppend(&out, "Bytes revalidation (active malloc page): ", + cs.evac.bytes_revalidation_active_malloc_page, "\n"); + absl::StrAppend(&out, "Bytes revalidation (full page): ", cs.evac.bytes_revalidation_full_page, + "\n"); + absl::StrAppend(&out, "Bytes revalidation (above threshold): ", + cs.evac.bytes_revalidation_above_threshold, "\n"); + absl::StrAppend( + &out, "Targets abandoned (revalidation): ", cs.evac.targets_abandoned_revalidation, "\n"); + absl::StrAppend(&out, "Targets completed during evac: ", cs.evac.targets_completed_during_evac, + "\n"); + + absl::StrAppend(&out, "\n[VERIFY]\n"); + absl::StrAppend(&out, "Targets complete: ", cs.verify.targets_complete, "\n"); + absl::StrAppend(&out, "Targets partial: ", cs.verify.targets_partial, "\n"); + absl::StrAppend(&out, "Targets no progress: ", cs.verify.targets_no_progress, "\n"); + absl::StrAppend(&out, "Targets abandoned: ", cs.verify.targets_abandoned, "\n"); + absl::StrAppend(&out, "Blocks total (at census): ", cs.verify.blocks_total_at_census, "\n"); + absl::StrAppend(&out, "Blocks evacuated: ", cs.verify.blocks_evacuated, "\n"); + absl::StrAppend(&out, "Blocks remaining: ", cs.verify.blocks_remaining, "\n"); + absl::StrAppend(&out, "Bytes total (at census): ", cs.verify.bytes_total_at_census, "\n"); + absl::StrAppend(&out, "Bytes evacuated: ", cs.verify.bytes_evacuated, "\n"); + absl::StrAppend(&out, "Bytes remaining: ", cs.verify.bytes_remaining, "\n"); + + absl::StrAppend(&out, "\n[PAGE USAGE]\n", page_usage_stats.ToString()); + + absl::StripTrailingAsciiWhitespace(&out); + return out; +} + +// Build a usage-stat struct directly from a page address, validating that the +// page still belongs to `heap`, isn't empty, isn't full, and is still below +// threshold. Returns true on success; on false the caller should drop the +// address from the underutil set (page recovered, was reclaimed, or never was +// ours). Threshold is fractional in [0, 1] matching the dragonfly setting. 
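+// For example (illustrative numbers, not from this patch): with threshold
+// 0.8, a page with capacity=128 and used=24 has used_ratio 24/128 = 0.1875
+// and is kept as a candidate, while used=112 gives 0.875 > 0.8 and the entry
+// is dropped as recovered.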
+static bool BuildPageStatFromAddress(uintptr_t page_addr, mi_heap_t* heap, float threshold,
+                                     mi_page_usage_stats_t* out) {
+  if (page_addr == 0)
+    return false;
+  mi_page_t* page = reinterpret_cast<mi_page_t*>(page_addr);
+
+  if (mi_page_heap(page) != heap)
+    return false;
+
+  const uint16_t used = page->used;
+  const uint16_t cap = page->capacity;
+  if (used == 0 || cap == 0)
+    return false;
+  if (used >= cap)
+    return false;  // full; mimalloc may have re-filled it
+
+  const float used_ratio = static_cast<float>(used) / static_cast<float>(cap);
+  if (used_ratio > threshold)
+    return false;  // recovered above threshold
+
+  out->page_address = page_addr;
+  out->block_size = mi_page_block_size(page);
+  out->capacity = cap;
+  out->reserved = page->reserved;
+  out->used = used;
+  out->flags = MI_DFLY_PAGE_BELOW_THRESHOLD;
+  return true;
+}
+
+CycleProgress RunVerify(const TargetPlan& plan) {
+  CycleProgress p;
+  for (const TargetPage& target : plan.targets()) {
+    const bool is_complete = target.blocks_evacuated >= target.blocks_at_census;
+    if (target.blocks_evacuated == 0) {
+      ++p.targets_no_progress;
+    } else if (is_complete) {
+      ++p.targets_complete;
+    } else {
+      ++p.targets_partial;
+    }
+    if (target.revalidation_failed) {
+      ++p.targets_abandoned;
+    }
+
+    const uint16_t evac_clamped = std::min(target.blocks_evacuated, target.blocks_at_census);
+    p.blocks_total_at_census += target.blocks_at_census;
+    p.blocks_evacuated += evac_clamped;
+    p.bytes_total_at_census += uint64_t(target.blocks_at_census) * target.block_size;
+    p.bytes_evacuated += uint64_t(evac_clamped) * target.block_size;
+
+    if (is_complete) {
+      p.bytes_freed +=
+          uint64_t(target.capacity_blocks - target.blocks_at_census) * target.block_size;
+    }
+  }
+  p.blocks_remaining = p.blocks_total_at_census - p.blocks_evacuated;
+  p.bytes_remaining = p.bytes_total_at_census - p.bytes_evacuated;
+  return p;
+}
+
+void DefragIdleStep(DefragTaskState* state, float threshold) {
+  state->cycle_stats = {};
+  state->ResetScanState();
+  ++state->cycle_id;
+  const uint64_t now = NowNs();
+  state->cycle_start_ns = now;
+  state->phase_start_ns = now;
+  state->phase_active_ns = 0;
+
+  state->census.emplace(&state->cycle_stats.census, PageCensus::kDefaultMaxRetainedPages,
+                        state->per_block_move_cost_bytes);
+  LOG(INFO) << absl::StrFormat("defrag[CYCLE_START] shard=%u cycle=%llu threshold=%.2f",
+                               state->shard_id, state->cycle_id, threshold);
+  state->phase = DefragPhase::CENSUS;
+}
+
+void DefragCensusStep(DefragTaskState* state, float threshold, CycleQuota quota,
+                      const DbSliceWalker& walk) {
+  const uint64_t step_start_ns = NowNs();
+
+  // Reactive fast path: if mimalloc has flagged any pages via the underutil
+  // callback, hydrate the census from that set and skip the dashtable walk.
+  // Falls back to the legacy walk if the set is empty (bootstrap, or workload
+  // with no recent threshold-crossing frees).
+  const size_t underutil_set_size = defrag_underutil::Size();
+  LOG_FIRST_N(INFO, 8) << absl::StrFormat("defrag[CENSUS_ENTRY] shard=%u cycle=%llu set_size=%zu",
+                                          state->shard_id, state->cycle_id, underutil_set_size);
+  if (underutil_set_size > 0) {
+    auto* heap = static_cast<mi_heap_t*>(zmalloc_heap);
+    const std::vector<uintptr_t> snapshot = defrag_underutil::Snapshot();
+    size_t recovered = 0;
+    for (uintptr_t addr : snapshot) {
+      mi_page_usage_stats_t stat;
+      if (!BuildPageStatFromAddress(addr, heap, threshold, &stat)) {
+        defrag_underutil::Remove(addr);
+        ++recovered;
+        continue;
+      }
+      state->census->ObservePage(stat);
+    }
+
+    state->cycle_stats.census_retained_pages = state->census->pages().size();
+    for (const auto& agg : state->census->pages() | std::views::values) {
+      state->cycle_stats.census_potential_reclaim_bytes +=
+          uint64_t(agg.capacity_blocks - agg.used_blocks) * agg.block_size;
+      state->cycle_stats.census_movable_bytes_observed +=
+          uint64_t(agg.observed_movable_blocks) * agg.block_size;
+    }
+
+    const uint64_t now = NowNs();
+    state->phase_active_ns += now - step_start_ns;
+    DEFRAG_STEP_LOG << absl::StrFormat(
+        "defrag[CENSUS_REACTIVE] shard=%u cycle=%llu set_in=%zu retained=%zu recovered=%zu "
+        "potential_reclaim=%s movable_observed=%s took=%.1fms cpu=%.1fms",
+        state->shard_id, state->cycle_id, underutil_set_size,
+        state->cycle_stats.census_retained_pages, recovered,
+        FormatMiB(state->cycle_stats.census_potential_reclaim_bytes),
+        FormatMiB(state->cycle_stats.census_movable_bytes_observed),
+        NsToMs(now - state->phase_start_ns), NsToMs(state->phase_active_ns));
+
+    state->phase_start_ns = now;
+    state->phase_active_ns = 0;
+    state->phase = DefragPhase::SELECT_TARGETS;
+    return;
+  }
+
+  // Fallback: full dashtable walk.
+  CensusTaker visitor(&*state->census, threshold, quota);
+  const DbSliceResult result = walk(&visitor, /*hints=*/nullptr, /*hint_cursor=*/nullptr);
+  state->cycle_stats.census_db_objects_scanned += result.attempts;
+  if (!result.finished_all_dbs) {
+    state->phase_active_ns += NowNs() - step_start_ns;
+    return;
+  }
+
+  // Aggregate page-level totals here so the [CENSUS] log can report them
+  // before SELECT_TARGETS runs.
+ state->cycle_stats.census_retained_pages = state->census->pages().size(); + for (const auto& agg : state->census->pages() | std::views::values) { + state->cycle_stats.census_potential_reclaim_bytes += + uint64_t(agg.capacity_blocks - agg.used_blocks) * agg.block_size; + state->cycle_stats.census_movable_bytes_observed += + uint64_t(agg.observed_movable_blocks) * agg.block_size; + } + + const CensusStats& c = state->cycle_stats.census; + const uint64_t now = NowNs(); + state->phase_active_ns += now - step_start_ns; + DEFRAG_STEP_LOG << absl::StrFormat( + "defrag[CENSUS] shard=%u cycle=%llu db_objects=%llu retained=%zu/%zu " + "recorded/seen=%llu/%llu cursor_hints=%zu potential_reclaim=%s movable_observed=%s " + "skipped{above_thr=%llu full=%llu wrong_heap=%llu active=%llu low_score=%llu} " + "topk{evicted=%llu rebuilds=%llu} took=%.1fms cpu=%.1fms", + state->shard_id, state->cycle_id, state->cycle_stats.census_db_objects_scanned, + state->cycle_stats.census_retained_pages, PageCensus::kDefaultMaxRetainedPages, + c.allocations_recorded, c.allocations_seen, state->census->cursor_hints().size(), + FormatMiB(state->cycle_stats.census_potential_reclaim_bytes), + FormatMiB(state->cycle_stats.census_movable_bytes_observed), c.skipped_above_threshold, + c.skipped_full_page, c.skipped_wrong_heap, c.skipped_active_malloc_page, c.skipped_low_score, + c.pages_evicted_from_retained, c.heap_rebuilds, NsToMs(now - state->phase_start_ns), + NsToMs(state->phase_active_ns)); + + state->phase_start_ns = now; + state->phase_active_ns = 0; + state->phase = DefragPhase::SELECT_TARGETS; +} + +void DefragSelectTargetsStep(DefragTaskState* state) { + const uint64_t step_start_ns = NowNs(); + state->plan.emplace(&state->cycle_stats.plan); + state->plan->BuildFrom(*state->census); + state->cycle_stats.plan_target_pages = state->plan->size(); + // Hand the bucket-cursor hints off to the task state so EVACUATE can use + // them after we release the census itself (the page map is large). + state->cursor_hints = state->census->TakeCursorHints(); + state->hint_cursor_idx = 0; + state->census.reset(); + // EVACUATE walks the prime table again from the start. + state->ResetScanState(); + + const PlanStats& p = state->cycle_stats.plan; + const uint64_t now = NowNs(); + state->phase_active_ns += now - step_start_ns; + DEFRAG_STEP_LOG << absl::StrFormat( + "defrag[PLAN] shard=%u cycle=%llu targets=%zu/%zu kept=%llu reclaimable=%s " + "filtered{no_obs=%llu stale=%llu immovable=%llu empty=%llu} truncated_by_cap=%llu " + "filtered_immovable=%s truncated=%s took=%.1fms cpu=%.1fms", + state->shard_id, state->cycle_id, state->cycle_stats.plan_target_pages, + state->cycle_stats.census_retained_pages, p.targets_kept, + FormatMiB(p.selected_reclaimable_bytes_at_census), p.filtered_no_observed_blocks, + p.filtered_stale, p.filtered_has_immovable_data, p.filtered_already_empty, p.truncated_by_cap, + FormatMiB(p.filtered_immovable_reclaimable_bytes), FormatMiB(p.truncated_reclaimable_bytes), + NsToMs(now - state->phase_start_ns), NsToMs(state->phase_active_ns)); + + // Skip EVAC when the prize is too small to justify the dashtable walk. The + // underutil set is left intact: future cycles re-enter via reactive CENSUS + // and re-plan; if churn pushes more pages below threshold, the plan grows + // back above the bar naturally. 
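+  // For example: with the default bar of 64 MiB, a plan whose
+  // selected_reclaimable_bytes_at_census totals 40 MiB is skipped outright,
+  // while a 70 MiB plan proceeds to EVACUATE.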
+  const uint64_t min_reclaimable = absl::GetFlag(FLAGS_defrag_min_plan_reclaimable_bytes);
+  if (p.selected_reclaimable_bytes_at_census < min_reclaimable) {
+    LOG(INFO) << absl::StrFormat(
+        "defrag[PLAN_SKIPPED] shard=%u cycle=%llu reclaimable=%s threshold=%s targets=%zu",
+        state->shard_id, state->cycle_id, FormatMiB(p.selected_reclaimable_bytes_at_census),
+        FormatMiB(min_reclaimable), state->cycle_stats.plan_target_pages);
+    state->cycle_stats.cycle_finished = true;
+    state->FinishCycle();
+    return;
+  }
+
+  state->phase_start_ns = now;
+  state->phase_active_ns = 0;
+  state->phase = DefragPhase::EVACUATE;
+}
+
+void DefragEvacuateStep(DefragTaskState* state, float threshold, CycleQuota quota,
+                        const DbSliceWalker& walk) {
+  const uint64_t step_start_ns = NowNs();
+  Evacuator visitor(&*state->plan, threshold, &state->cycle_stats.evac, quota);
+  const bool use_hints = !state->cursor_hints.empty();
+  const DbSliceResult result = walk(&visitor, use_hints ? &state->cursor_hints : nullptr,
+                                    use_hints ? &state->hint_cursor_idx : nullptr);
+  state->cycle_stats.evac_db_objects_scanned += result.attempts;
+  state->cycle_stats.evac_reallocations += result.reallocations;
+  state->cycle_stats.evac_key_reallocations += result.key_reallocations;
+  state->cycle_stats.evac_val_reallocations += result.val_reallocations;
+  state->cycle_stats.evac_bytes_moved += result.bytes_moved;
+  if (!result.finished_all_dbs && !state->plan->AllTargetsDone()) {
+    state->phase_active_ns += NowNs() - step_start_ns;
+    return;
+  }
+
+  const EvacStats& e = state->cycle_stats.evac;
+  const uint64_t attempted = e.blocks_move_committed + e.blocks_skipped_revalidation_failed;
+  const double commit_pct =
+      attempted == 0 ? 0.0 : 100.0 * static_cast<double>(e.blocks_move_committed) / attempted;
+  const uint64_t now = NowNs();
+  state->phase_active_ns += now - step_start_ns;
+  DEFRAG_STEP_LOG << absl::StrFormat(
+      "defrag[EVACUATE] shard=%u cycle=%llu db_objects=%llu "
+      "reallocs=%llu(keys=%llu vals=%llu) bytes_moved=%s "
+      "commit=%llu/%llu (%.1f%%) bytes_committed=%s "
+      "skipped_blocks{not_target=%llu target_done=%llu revalid=%llu} "
+      "reval_fail{heap=%llu active=%llu full=%llu above_thr=%llu} "
+      "abandoned=%llu completed_during_evac=%llu took=%.1fms cpu=%.1fms",
+      state->shard_id, state->cycle_id, state->cycle_stats.evac_db_objects_scanned,
+      state->cycle_stats.evac_reallocations, state->cycle_stats.evac_key_reallocations,
+      state->cycle_stats.evac_val_reallocations, FormatMiB(state->cycle_stats.evac_bytes_moved),
+      e.blocks_move_committed, attempted, commit_pct, FormatMiB(e.bytes_move_committed),
+      e.blocks_skipped_not_target, e.blocks_skipped_target_done,
+      e.blocks_skipped_revalidation_failed, e.targets_revalidation_heap_mismatch,
+      e.targets_revalidation_active_malloc_page, e.targets_revalidation_full_page,
+      e.targets_revalidation_above_threshold, e.targets_abandoned_revalidation,
+      e.targets_completed_during_evac, NsToMs(now - state->phase_start_ns),
+      NsToMs(state->phase_active_ns));
+
+  state->phase_start_ns = now;
+  state->phase_active_ns = 0;
+  state->phase = DefragPhase::VERIFY;
+}
+
+void DefragVerifyStep(DefragTaskState* state) {
+  const uint64_t step_start_ns = NowNs();
+  state->cycle_stats.verify = RunVerify(*state->plan);
+  state->cycle_stats.cycle_finished = true;
+
+  const CycleProgress& v = state->cycle_stats.verify;
+  // complete/partial/no_progress are mutually exclusive and cover every target;
+  // abandoned is a parallel dimension (revalidation_failed) that overlaps with
+  // those three, so it is not part of the denominator.
+  const uint64_t total_targets = v.targets_complete + v.targets_partial + v.targets_no_progress;
+  const double done_pct =
+      total_targets == 0 ? 0.0 : 100.0 * static_cast<double>(v.targets_complete) / total_targets;
+  const double bytes_pct =
+      v.bytes_total_at_census == 0
+          ? 0.0
+          : 100.0 * static_cast<double>(v.bytes_evacuated) / v.bytes_total_at_census;
+  const uint64_t planned_reclaim = state->cycle_stats.plan.selected_reclaimable_bytes_at_census;
+  const double freed_pct =
+      planned_reclaim == 0 ? 0.0 : 100.0 * static_cast<double>(v.bytes_freed) / planned_reclaim;
+  const uint64_t now = NowNs();
+  state->phase_active_ns += now - step_start_ns;
+  DEFRAG_STEP_LOG << absl::StrFormat(
+      "defrag[VERIFY] shard=%u cycle=%llu targets{done=%llu/%llu (%.1f%%) "
+      "partial=%llu none=%llu abandoned=%llu} "
+      "bytes{moved=%s/%s (%.1f%%) freed=%s/%s (%.1f%%) remaining=%s} took=%.1fms cpu=%.1fms",
+      state->shard_id, state->cycle_id, v.targets_complete, total_targets, done_pct,
+      v.targets_partial, v.targets_no_progress, v.targets_abandoned, FormatMiB(v.bytes_evacuated),
+      FormatMiB(v.bytes_total_at_census), bytes_pct, FormatMiB(v.bytes_freed),
+      FormatMiB(planned_reclaim), freed_pct, FormatMiB(v.bytes_remaining),
+      NsToMs(now - state->phase_start_ns), NsToMs(state->phase_active_ns));
+
+  const double cycle_ms = NsToMs(now - state->cycle_start_ns);
+  const double freed_mib_per_s =
+      cycle_ms <= 0.0
+          ? 0.0
+          : (static_cast<double>(v.bytes_freed) / (1024.0 * 1024.0)) / (cycle_ms / 1000.0);
+  LOG(INFO) << absl::StrFormat(
+      "defrag[CYCLE_DONE] shard=%u cycle=%llu targets_done=%llu/%llu (%.1f%%) "
+      "bytes_freed=%s/%s (%.1f%%) bytes_moved=%s cycle_took=%.1fms freed_rate=%.1fMiB/s",
+      state->shard_id, state->cycle_id, v.targets_complete, total_targets, done_pct,
+      FormatMiB(v.bytes_freed), FormatMiB(planned_reclaim), freed_pct, FormatMiB(v.bytes_evacuated),
+      cycle_ms, freed_mib_per_s);
+
+  state->FinishCycle();
+}
+
+namespace {
+
+struct StepTransition {
+  DefragPhase before;
+  DefragPhase after;
+};
+
+StepTransition RunPhaseStep(DefragTaskState* state, float threshold, CycleQuota quota,
+                            const DbSliceWalker& walk) {
+  const DefragPhase before = state->phase;
+  switch (state->phase) {
+    case DefragPhase::IDLE:
+      DefragIdleStep(state, threshold);
+      break;
+    case DefragPhase::CENSUS:
+      DefragCensusStep(state, threshold, quota, walk);
+      break;
+    case DefragPhase::SELECT_TARGETS:
+      DefragSelectTargetsStep(state);
+      break;
+    case DefragPhase::EVACUATE:
+      DefragEvacuateStep(state, threshold, quota, walk);
+      break;
+    case DefragPhase::VERIFY:
+      DefragVerifyStep(state);
+      break;
+  }
+  return {before, state->phase};
+}
+
+bool CycleEnded(StepTransition t) {
+  if (t.after != DefragPhase::IDLE)
+    return false;
+  // Normal end (VERIFY -> IDLE) and PLAN_SKIPPED bail-out (SELECT_TARGETS ->
+  // IDLE) both terminate the cycle.
+  return t.before == DefragPhase::VERIFY || t.before == DefragPhase::SELECT_TARGETS;
+}
+
+}  // namespace
+
+void RunPhaseDefrag(DefragTaskState* state, float threshold, CycleQuota quota,
+                    const DbSliceWalker& walk) {
+  StepTransition t;
+  do {
+    t = RunPhaseStep(state, threshold, quota, walk);
+  } while (!quota.Depleted() && !CycleEnded(t));
+}
+
+}  // namespace dfly
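Taken together, the steps above form a small state machine: RunPhaseDefrag keeps
stepping IDLE -> CENSUS -> SELECT_TARGETS -> EVACUATE -> VERIFY -> IDLE until the
quota runs out or CycleEnded fires. A minimal sketch of driving it with a stub
walker — illustrative only; the lambda, the default-constructed CycleQuota, and
the 0.8 threshold are assumptions, not code from this patch:

    // Sketch: a walker that visits nothing and reports completion, so every
    // phase step finishes in one call and the empty plan bails out via
    // PLAN_SKIPPED back to IDLE.
    dfly::DbSliceWalker stub_walk = [](dfly::PageUsage* /*visitor*/,
                                       const std::vector<uint64_t>* /*hints*/,
                                       size_t* /*hint_cursor*/) {
      dfly::DbSliceResult r;
      r.finished_all_dbs = true;
      return r;
    };
    dfly::DefragTaskState state;
    dfly::CycleQuota quota;  // assumed default-constructible for this sketch
    dfly::RunPhaseDefrag(&state, /*threshold=*/0.8f, quota, stub_walk);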
diff --git a/src/server/defrag.h b/src/server/defrag.h
new file mode 100644
index 000000000000..f6b1c800a1d2
--- /dev/null
+++ b/src/server/defrag.h
@@ -0,0 +1,223 @@
+// Copyright 2026, DragonflyDB authors. All rights reserved.
+// See LICENSE for licensing terms.
+//
+
+#pragma once
+
+#include <cstdint>
+#include <ctime>
+#include <functional>
+#include <optional>
+#include <string>
+#include <vector>
+
+#include "core/page_usage/page_usage_stats.h"
+#include "core/page_usage/page_usage_visitors.h"
+
+namespace dfly {
+
+// Tracks pages that mimalloc has signalled (via the dragonfly underutil
+// callback) as having dropped below the defrag utilization threshold during a
+// free. Storage is per-thread (via thread_local) so each shard observes only
+// its own heap. Replaces the dashtable-walking CENSUS as the source of
+// candidate target pages.
+namespace defrag_underutil {
+
+// Register the mimalloc callback once per process. Safe to call from any
+// thread; only the first call performs the registration. Subsequent calls are
+// no-ops.
+void InitOnce();
+
+// Set the threshold (in percent, 0-100) used by mimalloc to decide when a page
+// has crossed below the underutil watermark during a free. Must match the
+// dragonfly-side `mem_defrag_page_utilization_threshold` so census and EVAC
+// use the same definition of "underutilized".
+void SetThresholdPct(uint8_t pct);
+
+// Number of pages currently tracked in this thread's set.
+size_t Size();
+
+// Returns a copy of the page-address set as a vector. Does not modify the
+// set; callers drop entries explicitly via Remove. Returned order is
+// unspecified.
+std::vector<uintptr_t> Snapshot();
+
+// Drop a page from this thread's set. Used by VERIFY to retire targets that
+// were successfully drained, and by the new CENSUS to drop entries that have
+// since recovered above threshold.
+void Remove(uintptr_t page_addr);
+
+// Clear the entire set on this thread.
+void Clear();
+
+// Filter predicate for the legacy DefragIfNeeded fast path. Returns true when
+// the page is "interesting" (a candidate for the expensive
+// mi_heap_page_is_underutilized check):
+//  - empty set ⇒ bootstrap, no info available, return true so callers
+//    fall through to the original behavior;
+//  - non-empty set ⇒ return true iff the page appears in the set.
+// Conservative-positive: pages that crossed threshold without a recent free
+// will be missed until the next free on them lands them in the set.
+bool IsPageMaybeUnderutil(uintptr_t page_addr);
+
+}  // namespace defrag_underutil
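+
+// Illustrative usage from a shard thread (a sketch, not part of the patch's
+// API surface beyond the declarations above):
+//
+//   defrag_underutil::InitOnce();           // once per process, any thread
+//   defrag_underutil::SetThresholdPct(80);  // keep in sync with the dfly flag
+//   ...
+//   for (uintptr_t addr : defrag_underutil::Snapshot()) {
+//     // validate `addr`, feed it into the census, then retire it:
+//     defrag_underutil::Remove(addr);
+//   }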
+
+enum class DefragPhase : uint8_t {
+  IDLE,
+  CENSUS,
+  SELECT_TARGETS,
+  EVACUATE,
+  VERIFY,
+};
+
+struct CycleProgress {
+  uint64_t targets_complete = 0;     // blocks_evacuated >= blocks_at_census
+  uint64_t targets_partial = 0;      // 0 < blocks_evacuated < blocks_at_census
+  uint64_t targets_no_progress = 0;  // blocks_evacuated == 0
+  uint64_t targets_abandoned = 0;    // revalidation_failed; orthogonal to the trio above
+
+  uint64_t blocks_total_at_census = 0;
+  uint64_t blocks_evacuated = 0;
+  uint64_t blocks_remaining = 0;
+
+  uint64_t bytes_total_at_census = 0;
+  uint64_t bytes_evacuated = 0;
+  uint64_t bytes_remaining = 0;
+
+  // Bytes mimalloc can return once fully-drained source pages are reclaimed:
+  // sum over completed targets of (capacity - used)_at_census * block_size.
+  // Compares directly to PlanStats::selected_reclaimable_bytes_at_census.
+  uint64_t bytes_freed = 0;
+
+  void Merge(const CycleProgress& other);
+};
+
+CycleProgress RunVerify(const TargetPlan& plan);
+
+struct DefragCycleStats {
+  CensusStats census;
+  PlanStats plan;
+  EvacStats evac;
+  CycleProgress verify;
+
+  uint64_t census_db_objects_scanned = 0;
+  uint64_t evac_db_objects_scanned = 0;
+  uint64_t evac_reallocations = 0;
+  uint64_t evac_key_reallocations = 0;  // keys whose allocation was moved
+  uint64_t evac_val_reallocations = 0;  // values whose allocation was moved
+  uint64_t evac_bytes_moved = 0;        // total bytes read+written during reallocations
+
+  size_t census_retained_pages = 0;
+  size_t plan_target_pages = 0;
+
+  uint64_t census_potential_reclaim_bytes = 0;
+  uint64_t census_movable_bytes_observed = 0;
+
+  bool cycle_finished = false;
+
+  void Merge(const DefragCycleStats& other);
+};
+
+struct DefragShardSummary {
+  DefragPhase phase_start = DefragPhase::IDLE;  // phase on entry to DoDefrag
+  DefragPhase phase_end = DefragPhase::IDLE;    // phase on exit from DoDefrag
+  uint64_t duration_us = 0;                     // wall-clock time spent in DoDefrag
+  bool quota_depleted = false;                  // visitor exhausted its CycleQuota
+  bool finished_all_dbs = false;                // legacy: walked all dbs; phased: cycle complete
+};
+
+struct DefragShardReport {
+  DefragShardSummary summary;           // per-shard exit info
+  DefragCycleStats cycle_stats;         // empty on the legacy path
+  CollectedPageStats page_usage_stats;  // search-index defrag stats from PageUsage
+  bool work_pending = false;            // bg-task scheduler hint: true = high priority
+};
+
+struct DefragMergedReport {
+  std::vector<DefragShardSummary> shard_summaries;  // index = shard_id
+  DefragCycleStats cycle_stats;                     // summed across shards
+  CollectedPageStats page_usage_stats;              // merged via CollectedPageStats::Merge
+
+  static DefragMergedReport Merge(std::vector<DefragShardReport>&& shards);
+
+  std::string ToString() const;
+};
+
+const char* PhaseName(DefragPhase phase);
+
+struct DefragTaskState {
+  // Cycle position, used by both legacy and phased paths.
+  size_t dbid = 0;
+  uint64_t cursor = 0;
+
+  // Threshold-gate state, consulted before starting a new cycle.
+  time_t last_check_time = 0;
+  float page_utilization_threshold = 0.8f;
+
+  // Per-block move-cost weight in the page retention score:
+  //   score = reclaim / (move_bytes + used_blocks * per_block_move_cost + slot_overhead)
+  // Higher values penalize many-entry pages more strongly, pushing pages with
+  // small block sizes (more entries per page) toward the back of the candidate
+  // ordering. Useful for wide/mixed workloads where evacuating small-block
+  // pages is expensive per byte reclaimed.
+  uint64_t per_block_move_cost_bytes = 256;
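+  // Worked example (illustrative numbers, reading move_bytes as
+  // used_blocks * block_size and ignoring slot_overhead): a page of 512 x 64B
+  // blocks with 64 used scores
+  //   reclaim = (512-64)*64 = 28672, cost = 64*64 + 64*256 = 20480  ->  ~1.4,
+  // while a page of 32 x 1024B blocks with 4 used scores
+  //   reclaim = (32-4)*1024 = 28672, cost = 4*1024 + 4*256 = 5120   ->  ~5.6,
+  // so the many-entry small-block page ranks behind the large-block one
+  // despite equal reclaimable bytes.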
+
+  // Phased-only state, untouched in legacy mode.
+  DefragPhase phase = DefragPhase::IDLE;
+  std::optional<PageCensus> census;
+  std::optional<TargetPlan> plan;
+  // Bucket cursors observed during CENSUS that contained at least one object
+  // on a candidate page. Moved here from PageCensus before SELECT_TARGETS
+  // releases the census, then consumed by EVACUATE. Sorted vector + cursor
+  // index lets the hinted walker resume mid-iteration across DoDefrag calls
+  // when one EVAC quota slice can't drain the full hint set.
+  std::vector<uint64_t> cursor_hints;
+  size_t hint_cursor_idx = 0;
+  DefragCycleStats cycle_stats;
+
+  uint16_t shard_id = 0;
+  uint64_t cycle_id = 0;
+  uint64_t cycle_start_ns = 0;
+  uint64_t phase_start_ns = 0;
+  // CPU time spent doing actual work in the current phase, summed across
+  // DoDefrag invocations. Resets at each phase transition. Distinguishes
+  // CPU-only effort from wall-clock (phase_start_ns -> now), which includes
+  // idle gaps between invocations.
+  uint64_t phase_active_ns = 0;
+
+  void UpdateScanState(uint64_t cursor_val);
+
+  void ResetScanState();
+
+  void FinishCycle();
+};
+
+struct DbSliceResult {
+  uint64_t attempts = 0;           // # of (key, value) pairs visited
+  uint64_t reallocations = 0;      // # of entries where key or value was reallocated
+  uint64_t key_reallocations = 0;  // # of keys reallocated
+  uint64_t val_reallocations = 0;  // # of values reallocated
+  uint64_t bytes_moved = 0;        // bytes read+written across all reallocations
+  bool finished_all_dbs = false;
+};
+
+// Walker callable. If `hints` is non-null and non-empty, the walker should
+// visit only the buckets listed in the hint vector starting at *hint_cursor
+// (used by EVACUATE to skip buckets without candidate objects); the walker
+// updates *hint_cursor to where it stopped so the next call can resume. If
+// `hints` is null, the walker performs a full slice walk (used by CENSUS).
+using DbSliceWalker = std::function<DbSliceResult(PageUsage* visitor,
+                                                  const std::vector<uint64_t>* hints,
+                                                  size_t* hint_cursor)>;
+
+void DefragIdleStep(DefragTaskState* state, float threshold);
+void DefragCensusStep(DefragTaskState* state, float threshold, CycleQuota quota,
+                      const DbSliceWalker& walk);
+void DefragSelectTargetsStep(DefragTaskState* state);
+void DefragEvacuateStep(DefragTaskState* state, float threshold, CycleQuota quota,
+                        const DbSliceWalker& walk);
+void DefragVerifyStep(DefragTaskState* state);
+
+void RunPhaseDefrag(DefragTaskState* state, float threshold, CycleQuota quota,
+                    const DbSliceWalker& walk);
+
+}  // namespace dfly
diff --git a/src/server/defrag_test.cc b/src/server/defrag_test.cc
new file mode 100644
index 000000000000..5bc13b331ef0
--- /dev/null
+++ b/src/server/defrag_test.cc
@@ -0,0 +1,1216 @@
+// Copyright 2026, DragonflyDB authors. All rights reserved.
+// See LICENSE for licensing terms.
+//
+
+#include "server/defrag.h"
+
+#include <benchmark/benchmark.h>
+#include <mimalloc.h>
+#include <vector>
+
+#include <absl/container/flat_hash_map.h>
+
+#include "base/flags.h"
+#include "base/gtest.h"
+#include "base/logging.h"
+#include "core/page_usage/page_usage_visitors.h"
+
+ABSL_DECLARE_FLAG(bool, defrag_use_skip_bit);
+
+extern "C" {
+#include "redis/zmalloc.h"
+mi_page_usage_stats_t mi_heap_page_is_underutilized(mi_heap_t* heap, void* p, float ratio,
+                                                    bool collect_stats);
+}
+
+namespace dfly {
+
+namespace {
+
+mi_page_usage_stats_t MakeStat(uintptr_t addr, uint16_t capacity, uint16_t used,
+                               uint8_t flags = MI_DFLY_PAGE_BELOW_THRESHOLD,
+                               size_t block_size = 64) {
+  mi_page_usage_stats_t s{};
+  s.page_address = addr;
+  s.block_size = block_size;
+  s.capacity = capacity;
+  s.used = used;
+  s.flags = flags;
+  return s;
+}
+
+}  // namespace
+
+TEST(PageCensusEvictionTest, EvictsLowestScorePageWhenOverCap) {
+  if (!PageCensus::kEnableTopK) {
+    GTEST_SKIP() << "PageCensus::kEnableTopK is false; eviction path inactive";
+  }
+  CensusStats cstats;
+  PageCensus census(&cstats, /*max_retained_pages=*/4);
+
+  // Scores (cap/used): 5.0, 3.33, 2.5, 2.0 — page 4 is the lowest.
+  census.Observe(MakeStat(/*addr=*/1, /*capacity=*/10, /*used=*/2));
+  census.Observe(MakeStat(/*addr=*/2, /*capacity=*/10, /*used=*/3));
+  census.Observe(MakeStat(/*addr=*/3, /*capacity=*/10, /*used=*/4));
+  census.Observe(MakeStat(/*addr=*/4, /*capacity=*/10, /*used=*/5));
+
+  ASSERT_EQ(census.pages().size(), 4u);
+  EXPECT_EQ(census.stats().pages_evicted_from_retained, 0u);
+
+  // New page with score 2.5 — pushes us over cap, page 4 (score 2.0) should be evicted.
+ census.Observe(MakeStat(/*addr=*/5, /*capacity=*/10, /*used=*/4)); + + EXPECT_EQ(census.pages().size(), 4u); + EXPECT_EQ(census.stats().pages_evicted_from_retained, 1u); + EXPECT_TRUE(census.pages().contains(1)); + EXPECT_TRUE(census.pages().contains(2)); + EXPECT_TRUE(census.pages().contains(3)); + EXPECT_FALSE(census.pages().contains(4)); + EXPECT_TRUE(census.pages().contains(5)); +} + +TEST(PageCensusEvictionTest, StaleHeapEntryDoesNotEvictWrongPage) { + if (!PageCensus::kEnableTopK) { + GTEST_SKIP() << "PageCensus::kEnableTopK is false; eviction path inactive"; + } + CensusStats cstats; + PageCensus census(&cstats, /*max_retained_pages=*/4); + + // Scores: 5.0, 3.33, 2.5, 2.0. Page 4 starts as the lowest. + census.Observe(MakeStat(/*addr=*/1, /*capacity=*/10, /*used=*/2)); + census.Observe(MakeStat(/*addr=*/2, /*capacity=*/10, /*used=*/3)); + census.Observe(MakeStat(/*addr=*/3, /*capacity=*/10, /*used=*/4)); + census.Observe(MakeStat(/*addr=*/4, /*capacity=*/10, /*used=*/5)); + + // Re-observe page 4 with a much higher score (5.0). The old heap entry + // (score 2.0, gen=1) is now stale — page 3 (score 2.5) is genuinely lowest. + census.Observe(MakeStat(/*addr=*/4, /*capacity=*/20, /*used=*/4)); + + ASSERT_EQ(census.pages().size(), 4u); + EXPECT_EQ(census.stats().pages_evicted_from_retained, 0u); + + // New page with score 5.0 — pushes us over cap. Lazy-pop should skip the + // stale page-4 entry and evict page 3 (the genuinely-lowest live page, + // score 2.5). Use a score strictly above page 3's so page 3 is the + // unambiguous next-lowest after the stale skip. + census.Observe(MakeStat(/*addr=*/6, /*capacity=*/10, /*used=*/2)); + + EXPECT_EQ(census.pages().size(), 4u); + EXPECT_EQ(census.stats().pages_evicted_from_retained, 1u); + EXPECT_TRUE(census.pages().contains(1)); + EXPECT_TRUE(census.pages().contains(2)); + EXPECT_FALSE(census.pages().contains(3)); + EXPECT_TRUE(census.pages().contains(4)); + EXPECT_TRUE(census.pages().contains(6)); +} + +TEST(PageCensusEvictionTest, RebuildFiresWhenHeapDoublesCap) { + if (!PageCensus::kEnableTopK) { + GTEST_SKIP() << "PageCensus::kEnableTopK is false; heap rebuild inactive"; + } + CensusStats cstats; + PageCensus census(&cstats, /*max_retained_pages=*/4); + + // Fill to cap. Heap size = 4. + census.Observe(MakeStat(/*addr=*/1, /*capacity=*/10, /*used=*/2)); + census.Observe(MakeStat(/*addr=*/2, /*capacity=*/10, /*used=*/3)); + census.Observe(MakeStat(/*addr=*/3, /*capacity=*/10, /*used=*/4)); + census.Observe(MakeStat(/*addr=*/4, /*capacity=*/10, /*used=*/5)); + + EXPECT_EQ(census.stats().heap_rebuilds, 0u); + + // Re-observe page 1 repeatedly. Each call pushes a new heap entry without + // changing pages_.size(). Heap grows: 5, 6, 7, 8, 9 — rebuild fires once + // we cross 2 * max_retained_pages_ (i.e., > 8). + for (int i = 0; i < 5; ++i) { + census.Observe(MakeStat(/*addr=*/1, /*capacity=*/10, /*used=*/2)); + } + + EXPECT_GE(census.stats().heap_rebuilds, 1u); + EXPECT_EQ(census.pages().size(), 4u); + EXPECT_EQ(census.stats().pages_evicted_from_retained, 0u); +} + +TEST(PageCensusEvictionTest, RejectsNewPageWithScoreBelowWorstRetained) { + if (!PageCensus::kEnableTopK) { + GTEST_SKIP() << "PageCensus::kEnableTopK is false; reject-on-cap inactive"; + } + CensusStats cstats; + PageCensus census(&cstats, /*max_retained_pages=*/4); + + // Fill to cap with scores 5.0, 3.33, 2.5, 2.0. Worst retained is page 4 at 2.0. 
+ census.Observe(MakeStat(/*addr=*/1, /*capacity=*/10, /*used=*/2)); + census.Observe(MakeStat(/*addr=*/2, /*capacity=*/10, /*used=*/3)); + census.Observe(MakeStat(/*addr=*/3, /*capacity=*/10, /*used=*/4)); + census.Observe(MakeStat(/*addr=*/4, /*capacity=*/10, /*used=*/5)); + + ASSERT_EQ(census.pages().size(), 4u); + EXPECT_EQ(census.stats().skipped_low_score, 0u); + + // New page with score 1.42 (cap=10, used=7) — strictly below the worst retained. + // Should be rejected upfront: not inserted, no eviction, no allocations_recorded bump. + const uint64_t recorded_before = census.stats().allocations_recorded; + census.Observe(MakeStat(/*addr=*/5, /*capacity=*/10, /*used=*/7)); + + EXPECT_EQ(census.pages().size(), 4u); + EXPECT_FALSE(census.pages().contains(5)); + EXPECT_EQ(census.stats().skipped_low_score, 1u); + EXPECT_EQ(census.stats().pages_evicted_from_retained, 0u); + EXPECT_EQ(census.stats().allocations_recorded, recorded_before); +} + +namespace { + +// Drives N observations on a single page so that observed_movable_blocks lands at N +// and used_blocks/capacity reflect the last call. +void ObserveTimes(PageCensus& census, int times, uintptr_t addr, uint16_t capacity, uint16_t used) { + for (int i = 0; i < times; ++i) { + census.Observe(MakeStat(addr, capacity, used)); + } +} + +} // namespace + +TEST(TargetPlanTest, AppliesFilterClassification) { + CensusStats cstats; + PageCensus census(&cstats); + + // KEEP: movable == used. + ObserveTimes(census, /*times=*/4, /*addr=*/1, /*capacity=*/10, /*used=*/4); + // kAlreadyEmpty: used == 0. + ObserveTimes(census, /*times=*/1, /*addr=*/2, /*capacity=*/10, /*used=*/0); + // kStaleObservation: movable (5) > used (3). + ObserveTimes(census, /*times=*/5, /*addr=*/3, /*capacity=*/10, /*used=*/3); + // kHasImmovableData: movable (2) < used (5). + ObserveTimes(census, /*times=*/2, /*addr=*/4, /*capacity=*/10, /*used=*/5); + + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + + EXPECT_EQ(plan.size(), 1u); + EXPECT_EQ(plan.stats().targets_kept, 1u); + EXPECT_EQ(plan.stats().filtered_already_empty, 1u); + EXPECT_EQ(plan.stats().filtered_stale, 1u); + EXPECT_EQ(plan.stats().filtered_has_immovable_data, 1u); + EXPECT_EQ(plan.stats().filtered_no_observed_blocks, 0u); + EXPECT_EQ(plan.stats().truncated_by_cap, 0u); + EXPECT_TRUE(plan.Contains(1)); + EXPECT_FALSE(plan.Contains(2)); + EXPECT_FALSE(plan.Contains(3)); + EXPECT_FALSE(plan.Contains(4)); +} + +TEST(TargetPlanTest, SortsByScoreDescending) { + CensusStats cstats; + PageCensus census(&cstats); + + // Three KEEP-eligible pages with distinct scores: 5.0, 2.5, 2.0. + ObserveTimes(census, /*times=*/2, /*addr=*/100, /*capacity=*/10, /*used=*/2); + ObserveTimes(census, /*times=*/4, /*addr=*/200, /*capacity=*/10, /*used=*/4); + ObserveTimes(census, /*times=*/5, /*addr=*/300, /*capacity=*/10, /*used=*/5); + + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + + ASSERT_EQ(plan.size(), 3u); + EXPECT_EQ(plan.targets()[0].page_address, 100u); + EXPECT_EQ(plan.targets()[1].page_address, 200u); + EXPECT_EQ(plan.targets()[2].page_address, 300u); + EXPECT_GT(plan.targets()[0].retention_score_at_census, + plan.targets()[1].retention_score_at_census); + EXPECT_GT(plan.targets()[1].retention_score_at_census, + plan.targets()[2].retention_score_at_census); +} + +TEST(TargetPlanTest, TruncatesToMaxTargets) { + CensusStats cstats; + PageCensus census(&cstats); + + // Four KEEP-eligible pages with descending scores. 
+ ObserveTimes(census, /*times=*/2, /*addr=*/100, /*capacity=*/10, /*used=*/2); // 5.0 + ObserveTimes(census, /*times=*/3, /*addr=*/200, /*capacity=*/10, /*used=*/3); // 3.33 + ObserveTimes(census, /*times=*/4, /*addr=*/300, /*capacity=*/10, /*used=*/4); // 2.5 + ObserveTimes(census, /*times=*/5, /*addr=*/400, /*capacity=*/10, /*used=*/5); // 2.0 + + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census, /*max_targets=*/2); + + EXPECT_EQ(plan.size(), 2u); + EXPECT_EQ(plan.stats().targets_kept, 2u); + EXPECT_EQ(plan.stats().truncated_by_cap, 2u); + EXPECT_TRUE(plan.Contains(100)); + EXPECT_TRUE(plan.Contains(200)); + EXPECT_FALSE(plan.Contains(300)); + EXPECT_FALSE(plan.Contains(400)); +} + +TEST(TargetPlanTest, AddressIndexLookup) { + CensusStats cstats; + PageCensus census(&cstats); + ObserveTimes(census, /*times=*/2, /*addr=*/0x1000, /*capacity=*/10, /*used=*/2); + ObserveTimes(census, /*times=*/4, /*addr=*/0x2000, /*capacity=*/10, /*used=*/4); + + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + + ASSERT_EQ(plan.size(), 2u); + + const TargetPage* found = plan.Find(0x1000); + ASSERT_NE(found, nullptr); + EXPECT_EQ(found->page_address, 0x1000u); + EXPECT_EQ(found->blocks_at_census, 2u); + EXPECT_EQ(found->capacity_blocks, 10u); + + EXPECT_EQ(plan.Find(0xDEAD), nullptr); + EXPECT_FALSE(plan.Contains(0xDEAD)); +} + +TEST(TargetPlanTest, BuildFromIsIdempotent) { + CensusStats cstats1; + PageCensus census1(&cstats1); + ObserveTimes(census1, /*times=*/2, /*addr=*/100, /*capacity=*/10, /*used=*/2); + ObserveTimes(census1, /*times=*/4, /*addr=*/200, /*capacity=*/10, /*used=*/4); + + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census1); + ASSERT_EQ(plan.size(), 2u); + + // Second census with different pages — plan should fully replace its state. + CensusStats cstats2; + PageCensus census2(&cstats2); + ObserveTimes(census2, /*times=*/3, /*addr=*/500, /*capacity=*/10, /*used=*/3); + + plan.BuildFrom(census2); + EXPECT_EQ(plan.size(), 1u); + EXPECT_TRUE(plan.Contains(500)); + EXPECT_FALSE(plan.Contains(100)); + EXPECT_FALSE(plan.Contains(200)); + EXPECT_EQ(plan.stats().targets_kept, 1u); +} + +namespace { + +// Convenience: stat for an eligible page (BELOW_THRESHOLD set, no other dfly flags). 
+mi_page_usage_stats_t EligibleStat(uintptr_t addr) { + return MakeStat(addr, /*capacity=*/10, /*used=*/4); +} + +} // namespace + +TEST(EvacDecideTest, NotATargetWhenPageMissingFromPlan) { + CensusStats cstats; + PageCensus census(&cstats); + ObserveTimes(census, /*times=*/4, /*addr=*/100, /*capacity=*/10, /*used=*/4); + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + + EvacStats stats; + EvacOutcome outcome = EvacDecide(plan, EligibleStat(/*addr=*/0xDEAD), stats); + + EXPECT_EQ(outcome, EvacOutcome::kNotATarget); + EXPECT_EQ(stats.blocks_skipped_not_target, 1u); + EXPECT_EQ(stats.blocks_skipped_target_done, 0u); + EXPECT_EQ(stats.blocks_skipped_revalidation_failed, 0u); + EXPECT_EQ(stats.blocks_move_committed, 0u); +} + +TEST(EvacDecideTest, CommitsMoveAndBumpsCountersOnFirstCall) { + CensusStats cstats; + PageCensus census(&cstats); + ObserveTimes(census, /*times=*/4, /*addr=*/100, /*capacity=*/10, /*used=*/4); + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + ASSERT_EQ(plan.size(), 1u); + + EvacStats stats; + EvacOutcome outcome = EvacDecide(plan, EligibleStat(/*addr=*/100), stats); + + EXPECT_EQ(outcome, EvacOutcome::kCommitMove); + EXPECT_EQ(stats.blocks_move_committed, 1u); + EXPECT_EQ(stats.blocks_skipped_not_target, 0u); + EXPECT_EQ(stats.blocks_skipped_target_done, 0u); + EXPECT_EQ(stats.blocks_skipped_revalidation_failed, 0u); + const TargetPage* target = plan.Find(100); + ASSERT_NE(target, nullptr); + EXPECT_EQ(target->blocks_evacuated, 1u); + EXPECT_FALSE(target->revalidation_failed); +} + +TEST(EvacDecideTest, ReturnsTargetAlreadyDoneOnceCounterReachesCensus) { + CensusStats cstats; + PageCensus census(&cstats); + ObserveTimes(census, /*times=*/2, /*addr=*/100, /*capacity=*/10, /*used=*/2); + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + ASSERT_EQ(plan.Find(100)->blocks_at_census, 2u); + + EvacStats stats; + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kCommitMove); + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kCommitMove); + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kTargetAlreadyDone); + + EXPECT_EQ(stats.blocks_move_committed, 2u); + EXPECT_EQ(stats.blocks_skipped_target_done, 1u); + EXPECT_EQ(plan.Find(100)->blocks_evacuated, 2u); +} + +TEST(EvacDecideTest, MultiTargetCountersAreIndependent) { + CensusStats cstats; + PageCensus census(&cstats); + ObserveTimes(census, /*times=*/3, /*addr=*/100, /*capacity=*/10, /*used=*/3); + ObserveTimes(census, /*times=*/2, /*addr=*/200, /*capacity=*/10, /*used=*/2); + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + + EvacStats stats; + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kCommitMove); + EXPECT_EQ(EvacDecide(plan, EligibleStat(200), stats), EvacOutcome::kCommitMove); + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kCommitMove); + EXPECT_EQ(EvacDecide(plan, EligibleStat(200), stats), EvacOutcome::kCommitMove); + EXPECT_EQ(EvacDecide(plan, EligibleStat(200), stats), EvacOutcome::kTargetAlreadyDone); + + EXPECT_EQ(plan.Find(100)->blocks_evacuated, 2u); + EXPECT_EQ(plan.Find(200)->blocks_evacuated, 2u); + EXPECT_EQ(stats.blocks_move_committed, 4u); + EXPECT_EQ(stats.blocks_skipped_target_done, 1u); +} + +TEST(EvacDecideTest, RevalidationFailsForIneligibleFlags) { + CensusStats cstats; + PageCensus census(&cstats); + ObserveTimes(census, /*times=*/4, /*addr=*/100, /*capacity=*/10, /*used=*/4); + PlanStats pstats; + 
TargetPlan plan(&pstats); + plan.BuildFrom(census); + + // Page is now FULL — no longer a defrag candidate. + EvacStats stats; + mi_page_usage_stats_t bad = + MakeStat(/*addr=*/100, /*capacity=*/10, /*used=*/4, /*flags=*/MI_DFLY_PAGE_FULL); + EXPECT_EQ(EvacDecide(plan, bad, stats), EvacOutcome::kRevalidationFailed); + EXPECT_EQ(stats.blocks_skipped_revalidation_failed, 1u); + EXPECT_TRUE(plan.Find(100)->revalidation_failed); + EXPECT_EQ(plan.Find(100)->blocks_evacuated, 0u); +} + +TEST(EvacDecideTest, RevalidationFailureIsSticky) { + CensusStats cstats; + PageCensus census(&cstats); + ObserveTimes(census, /*times=*/4, /*addr=*/100, /*capacity=*/10, /*used=*/4); + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + + EvacStats stats; + // First call: page is no longer below threshold (flags=0). Sticky flag set. + mi_page_usage_stats_t bad = MakeStat(/*addr=*/100, /*capacity=*/10, /*used=*/9, /*flags=*/0); + EXPECT_EQ(EvacDecide(plan, bad, stats), EvacOutcome::kRevalidationFailed); + + // Subsequent calls — even with eligible flags — still fail via the sticky path. + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kRevalidationFailed); + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kRevalidationFailed); + + EXPECT_EQ(stats.blocks_skipped_revalidation_failed, 3u); + EXPECT_EQ(stats.blocks_move_committed, 0u); + EXPECT_EQ(plan.Find(100)->blocks_evacuated, 0u); +} + +TEST(EvacDecideTest, RevalidationBreakdownAttributesBlocksToOriginatingReason) { + CensusStats cstats; + PageCensus census(&cstats); + ObserveTimes(census, /*times=*/4, /*addr=*/100, /*capacity=*/10, /*used=*/4); + ObserveTimes(census, /*times=*/4, /*addr=*/200, /*capacity=*/10, /*used=*/4); + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + + EvacStats stats; + constexpr uint32_t kBlockSize = 64; + + // Target 100 fails with HEAP_MISMATCH; revisit twice on sticky path. + mi_page_usage_stats_t mismatch = + MakeStat(/*addr=*/100, /*capacity=*/10, /*used=*/4, /*flags=*/MI_DFLY_HEAP_MISMATCH); + EXPECT_EQ(EvacDecide(plan, mismatch, stats), EvacOutcome::kRevalidationFailed); + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kRevalidationFailed); + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kRevalidationFailed); + + // Target 200 fails with PAGE_FULL; revisit once. + mi_page_usage_stats_t full = + MakeStat(/*addr=*/200, /*capacity=*/10, /*used=*/4, /*flags=*/MI_DFLY_PAGE_FULL); + EXPECT_EQ(EvacDecide(plan, full, stats), EvacOutcome::kRevalidationFailed); + EXPECT_EQ(EvacDecide(plan, EligibleStat(200), stats), EvacOutcome::kRevalidationFailed); + + // Target-grain counters: one each. + EXPECT_EQ(stats.targets_revalidation_heap_mismatch, 1u); + EXPECT_EQ(stats.targets_revalidation_full_page, 1u); + EXPECT_EQ(stats.targets_revalidation_active_malloc_page, 0u); + EXPECT_EQ(stats.targets_revalidation_above_threshold, 0u); + + // Block-grain breakdown: 3 blocks attributed to heap_mismatch, 2 to full_page. + EXPECT_EQ(stats.blocks_revalidation_heap_mismatch, 3u); + EXPECT_EQ(stats.blocks_revalidation_full_page, 2u); + EXPECT_EQ(stats.blocks_revalidation_active_malloc_page, 0u); + EXPECT_EQ(stats.blocks_revalidation_above_threshold, 0u); + + // Bytes mirror blocks * block_size. + EXPECT_EQ(stats.bytes_revalidation_heap_mismatch, 3u * kBlockSize); + EXPECT_EQ(stats.bytes_revalidation_full_page, 2u * kBlockSize); + + // Aggregates equal the sum of the breakdown. 
+ EXPECT_EQ(stats.blocks_skipped_revalidation_failed, 5u); + EXPECT_EQ(stats.bytes_skipped_revalidation_failed, 5u * kBlockSize); + EXPECT_EQ(stats.blocks_revalidation_heap_mismatch + stats.blocks_revalidation_full_page, + stats.blocks_skipped_revalidation_failed); +} + +TEST(EvacDecideTest, AllTargetsDoneTrueForEmptyPlan) { + PlanStats pstats; + TargetPlan plan(&pstats); + EXPECT_TRUE(plan.AllTargetsDone()); +} + +TEST(EvacDecideTest, AllTargetsDoneFalseAfterBuild) { + CensusStats cstats; + PageCensus census(&cstats); + ObserveTimes(census, /*times=*/4, /*addr=*/100, /*capacity=*/10, /*used=*/4); + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + ASSERT_EQ(plan.size(), 1u); + EXPECT_FALSE(plan.AllTargetsDone()); +} + +TEST(EvacDecideTest, CompletionFlipsAllTargetsDone) { + CensusStats cstats; + PageCensus census(&cstats); + // Single target with blocks_at_census = 2. + ObserveTimes(census, /*times=*/2, /*addr=*/100, /*capacity=*/10, /*used=*/2); + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + + EvacStats stats; + EXPECT_FALSE(plan.AllTargetsDone()); + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kCommitMove); + EXPECT_FALSE(plan.AllTargetsDone()); // 1 of 2 done, target still pending + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kCommitMove); + EXPECT_TRUE(plan.AllTargetsDone()); // 2 of 2 done — target completed + // Subsequent calls don't double-decrement. + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kTargetAlreadyDone); + EXPECT_TRUE(plan.AllTargetsDone()); +} + +TEST(EvacDecideTest, RevalidationFailureFlipsAllTargetsDone) { + CensusStats cstats; + PageCensus census(&cstats); + ObserveTimes(census, /*times=*/4, /*addr=*/100, /*capacity=*/10, /*used=*/4); + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + + EvacStats stats; + EXPECT_FALSE(plan.AllTargetsDone()); + mi_page_usage_stats_t bad = + MakeStat(/*addr=*/100, /*capacity=*/10, /*used=*/4, /*flags=*/MI_DFLY_PAGE_FULL); + EXPECT_EQ(EvacDecide(plan, bad, stats), EvacOutcome::kRevalidationFailed); + EXPECT_TRUE(plan.AllTargetsDone()); // single target now abandoned + // Subsequent sticky calls don't double-decrement. + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kRevalidationFailed); + EXPECT_TRUE(plan.AllTargetsDone()); +} + +TEST(EvacDecideTest, AllTargetsDoneOnlyWhenEveryTargetSettled) { + CensusStats cstats; + PageCensus census(&cstats); + ObserveTimes(census, /*times=*/2, /*addr=*/100, /*capacity=*/10, /*used=*/2); + ObserveTimes(census, /*times=*/2, /*addr=*/200, /*capacity=*/10, /*used=*/2); + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + ASSERT_EQ(plan.size(), 2u); + + EvacStats stats; + EXPECT_FALSE(plan.AllTargetsDone()); + + // Complete target 100 fully — plan still has work pending on 200. + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kCommitMove); + EXPECT_EQ(EvacDecide(plan, EligibleStat(100), stats), EvacOutcome::kCommitMove); + EXPECT_FALSE(plan.AllTargetsDone()); + + // Mix: revalidation-fail target 200 — plan now fully settled. 
+ mi_page_usage_stats_t bad = + MakeStat(/*addr=*/200, /*capacity=*/10, /*used=*/2, /*flags=*/MI_DFLY_PAGE_FULL); + EXPECT_EQ(EvacDecide(plan, bad, stats), EvacOutcome::kRevalidationFailed); + EXPECT_TRUE(plan.AllTargetsDone()); +} + +TEST(VerifyTest, EmptyPlanGivesZeros) { + PlanStats pstats; + TargetPlan plan(&pstats); + CycleProgress p = RunVerify(plan); + EXPECT_EQ(p.targets_complete, 0u); + EXPECT_EQ(p.targets_partial, 0u); + EXPECT_EQ(p.targets_no_progress, 0u); +} + +TEST(VerifyTest, ClassifiesByBlocksEvacuated) { + CensusStats cstats; + PageCensus census(&cstats); + // Three targets, each with blocks_at_census = N (since movable == used). + ObserveTimes(census, /*times=*/4, /*addr=*/100, /*capacity=*/10, /*used=*/4); // complete + ObserveTimes(census, /*times=*/4, /*addr=*/200, /*capacity=*/10, /*used=*/4); // partial + ObserveTimes(census, /*times=*/4, /*addr=*/300, /*capacity=*/10, /*used=*/4); // no progress + + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + ASSERT_EQ(plan.size(), 3u); + + // Simulate EVACUATE outcomes via direct mutation through FindMut. + plan.FindMut(100)->blocks_evacuated = 4; // == blocks_at_census → complete + plan.FindMut(200)->blocks_evacuated = 2; // 0 < x < blocks_at_census → partial + plan.FindMut(300)->blocks_evacuated = 0; // → no progress + + CycleProgress p = RunVerify(plan); + EXPECT_EQ(p.targets_complete, 1u); + EXPECT_EQ(p.targets_partial, 1u); + EXPECT_EQ(p.targets_no_progress, 1u); +} + +TEST(VerifyTest, OvershootCountsAsComplete) { + // Defensive: blocks_evacuated > blocks_at_census shouldn't happen given the + // EvacDecide guard, but verify the boundary check uses >= not ==. + CensusStats cstats; + PageCensus census(&cstats); + ObserveTimes(census, /*times=*/2, /*addr=*/100, /*capacity=*/10, /*used=*/2); + + PlanStats pstats; + TargetPlan plan(&pstats); + plan.BuildFrom(census); + plan.FindMut(100)->blocks_evacuated = 5; // > blocks_at_census (2) + + CycleProgress p = RunVerify(plan); + EXPECT_EQ(p.targets_complete, 1u); + EXPECT_EQ(p.targets_partial, 0u); + EXPECT_EQ(p.targets_no_progress, 0u); +} + +// ===================================================================== +// Microbenchmarks for the CENSUS / EVACUATE per-object hot path. +// Run with: ./defrag_test --benchmark_filter='BM_.*' +// ===================================================================== +namespace { + +constexpr size_t kBenchObjectCount = 10000; +constexpr size_t kBenchBlockSize = 64; + +void InitBenchEnv() { + static bool initialized = false; + if (!initialized) { + init_zmalloc_threadlocal(mi_heap_get_backing()); + initialized = true; + } +} + +// Holds a batch of allocations; frees them on destruction. 
+struct AllocationBatch {
+  std::vector<void*> pointers;
+  ~AllocationBatch() {
+    for (void* p : pointers) {
+      mi_free(p);
+    }
+  }
+};
+
+AllocationBatch AllocBatch(size_t count, size_t block_size) {
+  AllocationBatch ab;
+  ab.pointers.reserve(count);
+  mi_heap_t* heap = mi_heap_get_default();
+  for (size_t i = 0; i < count; ++i) {
+    ab.pointers.push_back(mi_heap_malloc(heap, block_size));
+  }
+  return ab;
+}
+
+}  // namespace
+
+void BM_PtrPage(benchmark::State& state) {
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+  for (auto _ : state) {
+    for (void* p : ab.pointers) {
+      benchmark::DoNotOptimize(_mi_ptr_page(p));
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_PtrPage)->Arg(kBenchObjectCount);
+
+void BM_ProbeHeap(benchmark::State& state) {
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+  mi_heap_t* heap = mi_heap_get_default();
+  for (auto _ : state) {
+    for (void* p : ab.pointers) {
+      auto stat = mi_heap_page_is_underutilized(heap, p, 0.8f, /*collect_stats=*/true);
+      benchmark::DoNotOptimize(stat);
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_ProbeHeap)->Arg(kBenchObjectCount);
+
+void BM_ProbeNoHeap(benchmark::State& state) {
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+  for (auto _ : state) {
+    for (void* p : ab.pointers) {
+      mi_page_usage_stats_t stat;
+      zmalloc_page_is_underutilized(p, 0.8f, /*collect_stats=*/true, &stat);
+      benchmark::DoNotOptimize(stat);
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_ProbeNoHeap)->Arg(kBenchObjectCount);
+
+void BM_HashFindMiss(benchmark::State& state) {
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+  absl::flat_hash_map<uintptr_t, int> empty;
+  for (auto _ : state) {
+    for (void* p : ab.pointers) {
+      uintptr_t addr = reinterpret_cast<uintptr_t>(_mi_ptr_page(p));
+      benchmark::DoNotOptimize(empty.find(addr));
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_HashFindMiss)->Arg(kBenchObjectCount);
+
+void BM_HashFindHit(benchmark::State& state) {
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+  absl::flat_hash_map<uintptr_t, int> populated;
+  for (void* p : ab.pointers) {
+    populated[reinterpret_cast<uintptr_t>(_mi_ptr_page(p))] = 1;
+  }
+  for (auto _ : state) {
+    for (void* p : ab.pointers) {
+      uintptr_t addr = reinterpret_cast<uintptr_t>(_mi_ptr_page(p));
+      benchmark::DoNotOptimize(populated.find(addr));
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_HashFindHit)->Arg(kBenchObjectCount);
+
+void BM_HashEmplaceFresh(benchmark::State& state) {
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+  for (auto _ : state) {
+    state.PauseTiming();
+    absl::flat_hash_map<uintptr_t, int> m;
+    state.ResumeTiming();
+    for (void* p : ab.pointers) {
+      m.emplace(reinterpret_cast<uintptr_t>(_mi_ptr_page(p)), 1);
+    }
+    benchmark::DoNotOptimize(m);
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_HashEmplaceFresh)->Arg(kBenchObjectCount);
+      auto stat = mi_heap_page_is_underutilized(heap, p, 0.8f, /*collect_stats=*/true);
+      census.Observe(stat, /*bucket_cursor=*/0);
+    }
+    benchmark::DoNotOptimize(census);
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_ProbeAndObserve)->Arg(kBenchObjectCount);
+
+void BM_EvacDecideMiss(benchmark::State& state) {
+  // Empty plan -> every pointer hits the kNotATarget early-bail. Models
+  // the 95% non-target case in EVACUATE.
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+  PlanStats ps;
+  TargetPlan plan(&ps);
+  for (auto _ : state) {
+    EvacStats es{};
+    for (void* p : ab.pointers) {
+      uintptr_t addr = reinterpret_cast<uintptr_t>(_mi_ptr_page(p));
+      TargetPage* target = plan.FindMut(addr);
+      if (target == nullptr) {
+        ++es.blocks_skipped_not_target;
+        continue;
+      }
+      // Unreachable in this microbench.
+      auto stat = mi_heap_page_is_underutilized(mi_heap_get_default(), p, 0.8f, true);
+      EvacDecide(plan, target, stat, es);
+    }
+    benchmark::DoNotOptimize(es);
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_EvacDecideMiss)->Arg(kBenchObjectCount);
+
+void BM_EvacDecideHit(benchmark::State& state) {
+  // Plan contains every page our allocations live on -> every pointer hits
+  // the EvacDecide commit path. Models the 5% on-target case.
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+  mi_heap_t* heap = mi_heap_get_default();
+
+  // Build a synthetic census whose retention scoring doesn't filter our pages.
+  CensusStats cs;
+  PageCensus census(&cs);
+  for (void* p : ab.pointers) {
+    mi_page_usage_stats_t s{};
+    s.page_address = reinterpret_cast<uintptr_t>(_mi_ptr_page(p));
+    s.block_size = kBenchBlockSize;
+    s.capacity = 64;
+    s.used = 4;  // very low used -> high retention score
+    s.flags = MI_DFLY_PAGE_BELOW_THRESHOLD;
+    census.Observe(s, 0);
+  }
+  PlanStats ps;
+  TargetPlan plan(&ps);
+  plan.BuildFrom(census);
+
+  for (auto _ : state) {
+    EvacStats es{};
+    for (void* p : ab.pointers) {
+      uintptr_t addr = reinterpret_cast<uintptr_t>(_mi_ptr_page(p));
+      TargetPage* target = plan.FindMut(addr);
+      if (target == nullptr) {
+        ++es.blocks_skipped_not_target;
+        continue;
+      }
+      auto stat = mi_heap_page_is_underutilized(heap, p, 0.8f, /*collect_stats=*/true);
+      EvacDecide(plan, target, stat, es);
+    }
+    benchmark::DoNotOptimize(es);
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_EvacDecideHit)->Arg(kBenchObjectCount);
+
+// Same shape as BM_EvacDecideMiss but goes through the full Evacuator:
+// bloom precheck rejects every page (empty plan -> bloom is empty -> no
+// hashes computed beyond bloom). Compared against BM_EvacDecideMiss this
+// shows the bloom's contribution on the no-target hot path.
+void BM_EvacDecideMiss_Evacuator(benchmark::State& state) {
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+  PlanStats ps;
+  TargetPlan plan(&ps);
+  EvacStats es;
+  Evacuator visitor(&plan, 0.8f, &es);
+  for (auto _ : state) {
+    for (void* p : ab.pointers) {
+      benchmark::DoNotOptimize(visitor.IsPageForObjectUnderUtilized(p));
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_EvacDecideMiss_Evacuator)->Arg(kBenchObjectCount);
+
+// Same shape as BM_EvacDecideHit but goes through the full Evacuator:
+// bloom hits, FindMut hits, per-page slice cache fills on first object per
+// page and short-circuits mi_heap_page_is_underutilized for siblings.
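+// ("Siblings" are objects whose _mi_ptr_page resolves to the same page; with
+// 64-byte blocks a 64 KiB mimalloc page holds on the order of a thousand
+// blocks, so nearly every probe after the first per page is a cache hit.)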
+// Compared against BM_EvacDecideHit this shows the cache's contribution.
+void BM_EvacDecideHit_Evacuator(benchmark::State& state) {
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+
+  CensusStats cs;
+  PageCensus census(&cs);
+  for (void* p : ab.pointers) {
+    mi_page_usage_stats_t s{};
+    s.page_address = reinterpret_cast<uintptr_t>(_mi_ptr_page(p));
+    s.block_size = kBenchBlockSize;
+    s.capacity = 64;
+    s.used = 4;
+    s.flags = MI_DFLY_PAGE_BELOW_THRESHOLD;
+    census.Observe(s, 0);
+  }
+  PlanStats ps;
+  TargetPlan plan(&ps);
+  plan.BuildFrom(census);
+  EvacStats es;
+  Evacuator visitor(&plan, 0.8f, &es);
+
+  for (auto _ : state) {
+    for (void* p : ab.pointers) {
+      benchmark::DoNotOptimize(visitor.IsPageForObjectUnderUtilized(p));
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_EvacDecideHit_Evacuator)->Arg(kBenchObjectCount);
+
+namespace {
+
+// Non-virtual mirror of Evacuator's hot path. Identical body, but no
+// inheritance, no vtable, and the method is defined inline so the compiler
+// can fold it into the caller. Comparing against BM_EvacDecideHit_Evacuator
+// isolates how much of the per-call overhead is virtual dispatch + worse
+// inlining vs structural cost (member access through `this`).
+class EvacuatorNonVirt {
+ public:
+  EvacuatorNonVirt(TargetPlan* plan, float threshold, EvacStats* evac_stats)
+      : plan_(plan), threshold_(threshold), evac_stats_(evac_stats) {
+  }
+
+  bool IsPageForObjectUnderUtilized(void* object) {
+    const uintptr_t addr = reinterpret_cast<uintptr_t>(_mi_ptr_page(object));
+    TargetPage* target = plan_->FindMut(addr);
+    if (target == nullptr) {
+      ++evac_stats_->blocks_skipped_not_target;
+      return false;
+    }
+    const mi_page_usage_stats_t stat = mi_heap_page_is_underutilized(
+        static_cast<mi_heap_t*>(zmalloc_heap), object, threshold_, /*collect_stats=*/true);
+    return EvacDecide(*plan_, target, stat, *evac_stats_) == EvacOutcome::kCommitMove;
+  }
+
+ private:
+  TargetPlan* plan_;
+  float threshold_;
+  EvacStats* evac_stats_;
+};
+
+}  // namespace
+
+void BM_EvacDecideHit_NonVirt(benchmark::State& state) {
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+
+  CensusStats cs;
+  PageCensus census(&cs);
+  for (void* p : ab.pointers) {
+    mi_page_usage_stats_t s{};
+    s.page_address = reinterpret_cast<uintptr_t>(_mi_ptr_page(p));
+    s.block_size = kBenchBlockSize;
+    s.capacity = 64;
+    s.used = 4;
+    s.flags = MI_DFLY_PAGE_BELOW_THRESHOLD;
+    census.Observe(s, 0);
+  }
+  PlanStats ps;
+  TargetPlan plan(&ps);
+  plan.BuildFrom(census);
+  EvacStats es;
+  EvacuatorNonVirt visitor(&plan, 0.8f, &es);
+
+  for (auto _ : state) {
+    for (void* p : ab.pointers) {
+      benchmark::DoNotOptimize(visitor.IsPageForObjectUnderUtilized(p));
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_EvacDecideHit_NonVirt)->Arg(kBenchObjectCount);
+
+namespace {
+
+// Tagged-dispatch variant: simulates the design alternative where PageUsage
+// has a non-virtual IsPageForObjectUnderUtilized that switches on a kind_
+// enum and forwards to the concrete subclass's non-virtual impl. All bodies
+// are inline so the compiler can see end-to-end. The bench calls through a
+// base-class pointer to mimic how the production walker holds PageUsage*.
+//
+// If this matches NonVirt, tagged dispatch is a viable refactor — no virtual
+// dispatch, no template cascade, just an enum + switch in the base class.
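+//
+// (A CRTP visitor would be the other non-virtual alternative, but it forces a
+// template parameter onto every walker that holds the visitor; the enum +
+// switch keeps the base class a plain, type-erased pointer target.)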
+
+enum class TestVisitorKind : uint8_t { kEvacuator };
+
+class TestEvacuatorTagged;
+
+class TestPageUsageBase {
+ public:
+  // Non-virtual, defined inline (after subclass impls) so the switch can
+  // inline the called method directly.
+  inline bool IsPageForObjectUnderUtilized(void* object);
+
+ protected:
+  TestVisitorKind kind_;
+};
+
+class TestEvacuatorTagged : public TestPageUsageBase {
+ public:
+  TestEvacuatorTagged(TargetPlan* plan, float threshold, EvacStats* evac_stats)
+      : plan_(plan), threshold_(threshold), evac_stats_(evac_stats) {
+    kind_ = TestVisitorKind::kEvacuator;
+  }
+
+  // Non-virtual, inline. Body is identical to Evacuator's production impl.
+  bool IsPageForObjectUnderUtilizedImpl(void* object) {
+    const uintptr_t addr = reinterpret_cast<uintptr_t>(_mi_ptr_page(object));
+    TargetPage* target = plan_->FindMut(addr);
+    if (target == nullptr) {
+      ++evac_stats_->blocks_skipped_not_target;
+      return false;
+    }
+    const mi_page_usage_stats_t stat = mi_heap_page_is_underutilized(
+        static_cast<mi_heap_t*>(zmalloc_heap), object, threshold_, /*collect_stats=*/true);
+    return EvacDecide(*plan_, target, stat, *evac_stats_) == EvacOutcome::kCommitMove;
+  }
+
+ private:
+  TargetPlan* plan_;
+  float threshold_;
+  EvacStats* evac_stats_;
+};
+
+inline bool TestPageUsageBase::IsPageForObjectUnderUtilized(void* object) {
+  switch (kind_) {
+    case TestVisitorKind::kEvacuator:
+      return static_cast<TestEvacuatorTagged*>(this)->IsPageForObjectUnderUtilizedImpl(object);
+  }
+  __builtin_unreachable();
+}
+
+}  // namespace
+
+void BM_EvacDecideHit_TaggedDispatch(benchmark::State& state) {
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+
+  CensusStats cs;
+  PageCensus census(&cs);
+  for (void* p : ab.pointers) {
+    mi_page_usage_stats_t s{};
+    s.page_address = reinterpret_cast<uintptr_t>(_mi_ptr_page(p));
+    s.block_size = kBenchBlockSize;
+    s.capacity = 64;
+    s.used = 4;
+    s.flags = MI_DFLY_PAGE_BELOW_THRESHOLD;
+    census.Observe(s, 0);
+  }
+  PlanStats ps;
+  TargetPlan plan(&ps);
+  plan.BuildFrom(census);
+  EvacStats es;
+  TestEvacuatorTagged visitor(&plan, 0.8f, &es);
+  // Call through base pointer to mimic how the production walker dispatches.
+  TestPageUsageBase* base = &visitor;
+
+  for (auto _ : state) {
+    for (void* p : ab.pointers) {
+      benchmark::DoNotOptimize(base->IsPageForObjectUnderUtilized(p));
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_EvacDecideHit_TaggedDispatch)->Arg(kBenchObjectCount);
+
+// Populated plan + queries that all miss. Models the dominant production
+// case: EVAC walks the prime table over millions of objects while the plan
+// holds a few thousand target pages — most objects are on non-target pages
+// and need fast rejection. Plan is built from synthetic addresses unrelated
+// to the real allocations so every query miss hits the populated-map find()
+// path (raw variant) or the bloom-rejection path (Evacuator variant).
+namespace {
+
+constexpr size_t kBenchSyntheticPlanSize = 4000;
+
+void PopulatePlanWithSyntheticAddrs(PageCensus* census, size_t count) {
+  for (size_t i = 0; i < count; ++i) {
+    mi_page_usage_stats_t s{};
+    // Addresses well above any mimalloc-managed range; 64 KiB stride matches
+    // mimalloc page alignment so the addresses look plausible to the bloom
+    // hasher.
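+    // Base 0x100000000 is 4 GiB and the stride 0x10000 is 64 KiB; the
+    // assumption is that no mimalloc segment in the test process is mapped
+    // that high, so no synthetic address aliases a live mi_page_t.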
+    s.page_address = 0x100000000ULL + i * 0x10000ULL;
+    s.block_size = kBenchBlockSize;
+    s.capacity = 64;
+    s.used = 4;
+    s.flags = MI_DFLY_PAGE_BELOW_THRESHOLD;
+    census->Observe(s, 0);
+  }
+}
+
+// RAII: turn off the skip-bit setter for the duration of this object. The
+// populated-plan benchmarks use synthetic addresses that aren't valid
+// mi_page_t*; SetDefragSkipIfEnabled would dereference them and segfault on
+// BuildFrom and on ~TargetPlan. Restoring on dtor (declare this BEFORE the
+// TargetPlan so the plan destructs first while the flag is still off).
+struct DefragSkipBitOff {
+  bool prev;
+  DefragSkipBitOff() : prev(absl::GetFlag(FLAGS_defrag_use_skip_bit)) {
+    absl::SetFlag(&FLAGS_defrag_use_skip_bit, false);
+  }
+  ~DefragSkipBitOff() {
+    absl::SetFlag(&FLAGS_defrag_use_skip_bit, prev);
+  }
+};
+
+}  // namespace
+
+void BM_EvacDecideMiss_Populated(benchmark::State& state) {
+  DefragSkipBitOff skip_bit_off;  // declared first → destroyed last
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+
+  CensusStats cs;
+  PageCensus census(&cs);
+  PopulatePlanWithSyntheticAddrs(&census, kBenchSyntheticPlanSize);
+  PlanStats ps;
+  TargetPlan plan(&ps);
+  plan.BuildFrom(census);
+
+  for (auto _ : state) {
+    EvacStats es{};
+    for (void* p : ab.pointers) {
+      uintptr_t addr = reinterpret_cast<uintptr_t>(_mi_ptr_page(p));
+      TargetPage* target = plan.FindMut(addr);
+      if (target == nullptr) {
+        ++es.blocks_skipped_not_target;
+        continue;
+      }
+      auto stat = mi_heap_page_is_underutilized(mi_heap_get_default(), p, 0.8f, true);
+      EvacDecide(plan, target, stat, es);
+    }
+    benchmark::DoNotOptimize(es);
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_EvacDecideMiss_Populated)->Arg(kBenchObjectCount);
+
+void BM_EvacDecideMiss_Populated_Evacuator(benchmark::State& state) {
+  DefragSkipBitOff skip_bit_off;  // declared first → destroyed last
+  InitBenchEnv();
+  AllocationBatch ab = AllocBatch(state.range(0), kBenchBlockSize);
+
+  CensusStats cs;
+  PageCensus census(&cs);
+  PopulatePlanWithSyntheticAddrs(&census, kBenchSyntheticPlanSize);
+  PlanStats ps;
+  TargetPlan plan(&ps);
+  plan.BuildFrom(census);
+  EvacStats es;
+  Evacuator visitor(&plan, 0.8f, &es);
+
+  for (auto _ : state) {
+    for (void* p : ab.pointers) {
+      benchmark::DoNotOptimize(visitor.IsPageForObjectUnderUtilized(p));
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * ab.pointers.size());
+}
+BENCHMARK(BM_EvacDecideMiss_Populated_Evacuator)->Arg(kBenchObjectCount);
+
+// =====================================================================
+// Microbenchmarks for the underutil-callback per-free overhead.
+//
+// Each variant allocates a fresh batch of objects (untimed) and frees them
+// (timed); the delta between variants isolates the cost the dragonfly
+// mimalloc patch adds to mi_free_block_local:
+//
+//   BM_Free_CallbackOff     callback unregistered. Only the unconditional
+//                           prev_used load runs; the arith block and
+//                           indirect call are short-circuited by the NULL
+//                           check on _mi_dfly_underutil_cb.
+//   BM_Free_CallbackNoOp    callback registered, body just bumps a counter.
+//                           Adds the per-free arith block (cap_thr,
+//                           prev_x100, cur_x100, two compares) on EVERY
+//                           free, plus an indirect call on edge crossings.
+//   BM_Free_CallbackInsert  callback registered, body inserts into a
+//                           thread_local flat_hash_set. Adds hash work on
+//                           top of NoOp, only on edge crossings.
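+//
+// (An "edge crossing" is the free that takes a page's used/capacity from at
+// or above the threshold to below it; per the callback patch it fires on the
+// downward crossing, not on every free that happens below the threshold.)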
+//
+// Differences:
+//   (NoOp - Off)    = per-free arith cost (paid by every delete)
+//   (Insert - NoOp) = per-edge insert cost amortized over total frees
+//
+// The "edges/iter" counter reports the number of callback invocations per
+// iteration so per-edge cost can be derived.
+// Run with: ./defrag_test --benchmark_filter='BM_Free_.*'
+// =====================================================================
+
+namespace {
+
+thread_local absl::flat_hash_set<uintptr_t> g_bench_underutil_set;
+thread_local size_t g_bench_underutil_count = 0;
+
+void BenchCallbackNoOp(uintptr_t /*addr*/) {
+  ++g_bench_underutil_count;
+}
+
+void BenchCallbackInsert(uintptr_t addr) {
+  ++g_bench_underutil_count;
+  g_bench_underutil_set.insert(addr);
+}
+
+void RunFreeBench(benchmark::State& state, size_t count) {
+  InitBenchEnv();
+  size_t total_edges = 0;
+  for (auto _ : state) {
+    state.PauseTiming();
+    g_bench_underutil_set.clear();
+    g_bench_underutil_count = 0;
+    std::vector<void*> ptrs;
+    ptrs.reserve(count);
+    mi_heap_t* heap = mi_heap_get_default();
+    for (size_t i = 0; i < count; ++i) {
+      ptrs.push_back(mi_heap_malloc(heap, kBenchBlockSize));
+    }
+    state.ResumeTiming();
+    for (void* p : ptrs) {
+      mi_free(p);
+    }
+    state.PauseTiming();
+    total_edges += g_bench_underutil_count;
+    state.ResumeTiming();
+  }
+  state.counters["edges/iter"] =
+      benchmark::Counter(static_cast<double>(total_edges) / state.iterations());
+  state.SetItemsProcessed(state.iterations() * count);
+}
+
+}  // namespace
+
+void BM_Free_CallbackOff(benchmark::State& state) {
+  mi_dfly_set_underutil_callback(nullptr);
+  RunFreeBench(state, state.range(0));
+}
+BENCHMARK(BM_Free_CallbackOff)->Arg(kBenchObjectCount);
+
+void BM_Free_CallbackNoOp(benchmark::State& state) {
+  mi_dfly_set_underutil_threshold_pct(80);
+  mi_dfly_set_underutil_callback(&BenchCallbackNoOp);
+  RunFreeBench(state, state.range(0));
+  mi_dfly_set_underutil_callback(nullptr);
+}
+BENCHMARK(BM_Free_CallbackNoOp)->Arg(kBenchObjectCount);
+
+void BM_Free_CallbackInsert(benchmark::State& state) {
+  mi_dfly_set_underutil_threshold_pct(80);
+  mi_dfly_set_underutil_callback(&BenchCallbackInsert);
+  RunFreeBench(state, state.range(0));
+  mi_dfly_set_underutil_callback(nullptr);
+}
+BENCHMARK(BM_Free_CallbackInsert)->Arg(kBenchObjectCount);
+
+}  // namespace dfly
diff --git a/src/server/dragonfly_test.cc b/src/server/dragonfly_test.cc
index c2e2204f941e..d3cfebf49636 100644
--- a/src/server/dragonfly_test.cc
+++ b/src/server/dragonfly_test.cc
@@ -29,6 +29,9 @@
 ABSL_DECLARE_FLAG(bool, lua_resp2_legacy_float);
 ABSL_DECLARE_FLAG(double, eviction_memory_budget_threshold);
 ABSL_DECLARE_FLAG(std::vector<std::string>, command_alias);
 ABSL_DECLARE_FLAG(bool, latency_tracking);
+ABSL_DECLARE_FLAG(bool, experimental_defrag);
+ABSL_DECLARE_FLAG(uint64_t, defrag_min_plan_reclaimable_bytes);
+ABSL_DECLARE_FLAG(bool, defrag_keys);
 
 namespace dfly {
 
@@ -883,6 +886,103 @@ TEST_F(DefragDflyEngineTest, DefragEventuallyFinishes) {
   });
 }
 
+// Validates that key defragmentation in the phased algorithm doesn't corrupt data.
+// Uses large keys (>18 bytes) to force external heap allocations, fragments them,
+// runs defrag, then verifies all keys are still accessible and that both key and
+// value reallocations occurred.
+TEST_F(DefragDflyEngineTest, KeyDefragIntegrity) {
+  absl::FlagSaver fs;
+  absl::SetFlag(&FLAGS_experimental_defrag, true);
+  absl::SetFlag(&FLAGS_defrag_keys, true);
+  // Disable the minimum-reclaimable guard so small-scale fragmentation triggers EVACUATE.
+ absl::SetFlag(&FLAGS_defrag_min_plan_reclaimable_bytes, 0); + + // Use keys > 18 bytes to force external allocation (not inline in CompactObj). + // Use values > 18 bytes to also force external allocation for values. + constexpr int kNumKeys = 5000; + constexpr int kValueSize = 64; + // Threshold: pages with used/capacity < 0.8 are candidates. + constexpr float kThreshold = 0.8f; + + // Populate with large keys and values. + for (int i = 0; i < kNumKeys; ++i) { + string key = absl::StrFormat("long-key-name-for-defrag-test-%05d", i); + string val(kValueSize, 'A' + (i % 26)); + Run({"SET", key, val}); + } + + // Delete every other key to fragment key and value pages. + for (int i = 0; i < kNumKeys; i += 2) { + string key = absl::StrFormat("long-key-name-for-defrag-test-%05d", i); + Run({"DEL", key}); + } + + // Run phased defrag. Track total key and value reallocations across all cycles. + uint64_t total_key_reallocs = 0; + uint64_t total_val_reallocs = 0; + uint64_t total_bytes_moved = 0; + + shard_set->pool()->AwaitFiberOnAll([&](unsigned, ProactorBase*) { + auto* shard = EngineShard::tlocal(); + if (!shard) + return; + + uint64_t shard_key_reallocs = 0; + uint64_t shard_val_reallocs = 0; + uint64_t shard_bytes_moved = 0; + + for (int i = 0; i < 500; ++i) { + PageUsage page_usage{CollectPageStats::NO, kThreshold, + CycleQuota{CycleQuota::kDefaultDefragQuota}}; + page_usage.SetForceReallocate(true); + auto report = shard->DoDefrag(&page_usage); + shard_key_reallocs += report.cycle_stats.evac_key_reallocations; + shard_val_reallocs += report.cycle_stats.evac_val_reallocations; + shard_bytes_moved += report.cycle_stats.evac_bytes_moved; + } + + // Atomic accumulation into shared counters (single shard in this fixture, but safe). + __atomic_fetch_add(&total_key_reallocs, shard_key_reallocs, __ATOMIC_RELAXED); + __atomic_fetch_add(&total_val_reallocs, shard_val_reallocs, __ATOMIC_RELAXED); + __atomic_fetch_add(&total_bytes_moved, shard_bytes_moved, __ATOMIC_RELAXED); + }); + + LOG(INFO) << "Defrag stats: key_reallocs=" << total_key_reallocs + << " val_reallocs=" << total_val_reallocs << " bytes_moved=" << total_bytes_moved; + + // Verify that both key AND value reallocations happened. + EXPECT_GT(total_key_reallocs, 0u) << "Expected key reallocations during defrag"; + EXPECT_GT(total_val_reallocs, 0u) << "Expected value reallocations during defrag"; + EXPECT_GT(total_bytes_moved, 0u) << "Expected non-zero bytes moved during defrag"; + + // Validate all remaining keys are intact after defrag — no corruption. + for (int i = 1; i < kNumKeys; i += 2) { + string key = absl::StrFormat("long-key-name-for-defrag-test-%05d", i); + string expected_val(kValueSize, 'A' + (i % 26)); + auto resp = Run({"GET", key}); + ASSERT_EQ(resp, expected_val) << "Value corrupted after defrag: " << key; + } + + // Verify deleted keys are still gone (no resurrection from dangling pointers). + for (int i = 0; i < kNumKeys; i += 2) { + string key = absl::StrFormat("long-key-name-for-defrag-test-%05d", i); + auto resp = Run({"EXISTS", key}); + ASSERT_THAT(resp, IntArg(0)) << "Deleted key reappeared after defrag: " << key; + } + + // Write new values to the defragged keys — verifies no dangling pointers. + for (int i = 1; i < kNumKeys; i += 2) { + string key = absl::StrFormat("long-key-name-for-defrag-test-%05d", i); + ASSERT_EQ(Run({"SET", key, "new-value"}), "OK"); + } + + // Final read-back. 
+ for (int i = 1; i < kNumKeys; i += 2) { + string key = absl::StrFormat("long-key-name-for-defrag-test-%05d", i); + ASSERT_EQ(Run({"GET", key}), "new-value"); + } +} + TEST_F(DflyEngineTest, Issue752) { // https://github.com/dragonflydb/dragonfly/issues/752 // local_result_ member was not reset between commands diff --git a/src/server/engine_shard.cc b/src/server/engine_shard.cc index 05b50ba35a85..f20d32c1bb40 100644 --- a/src/server/engine_shard.cc +++ b/src/server/engine_shard.cc @@ -21,6 +21,7 @@ extern "C" { } #include "server/blocking_controller.h" #include "server/db_slice.h" +#include "server/defrag.h" #include "server/engine_shard_set.h" #include "server/journal/journal.h" #include "server/namespaces.h" @@ -47,6 +48,29 @@ ABSL_FLAG(float, mem_defrag_page_utilization_threshold, 0.8, "memory page under utilization threshold. Ratio between used and committed size, below " "this, memory in this page will defragmented"); +ABSL_FLAG(bool, enable_bg_defrag, false, + "If true, run periodic defragmentation as an idle background task. " + "Defaults to false on this branch so tests only trigger defrag via " + "explicit MEMORY DEFRAGMENT calls."); + +ABSL_FLAG(bool, experimental_defrag, true, + "When true, run the phased defragmentation strategy (CENSUS / SELECT_TARGETS / " + "EVACUATE / VERIFY) instead of the legacy single-pass defragmenter. Experimental."); + +ABSL_FLAG(bool, disable_huffman_check, true, + "If true, skip the periodic huffman frequency-table check task that fires once " + "key memory crosses 50 MiB on shard 0. The task is currently informational only " + "(it logs the resulting table) and its multi-second prime-table walk overruns " + "the proactor's 500us idle-task budget on large datasets, generating chatter " + "and stealing single-core cycles from the workload. Default true while the " + "task remains a build-and-log experiment; set false to re-enable."); + +ABSL_FLAG(uint64_t, defrag_per_block_move_cost_bytes, 256, + "Per-block move-cost weight in the page retention score. Higher values push " + "pages with many small entries toward the back of the candidate ranking, " + "favoring large-block pages that reclaim more bytes per bucket walked during " + "EVACUATE. Useful for wide/mixed-size workloads."); + ABSL_FLAG(int32_t, hz, 100, "Base frequency at which the server performs other background tasks. " "Warning: not advised to decrease in production."); @@ -78,6 +102,191 @@ namespace { constexpr uint64_t kCursorDoneState = 0u; +// Runs one slice of the prime-table traversal: advances `*dbid` past invalid +// dbs, walks the current db's prime table dispatching DefragIfNeeded through +// `visitor`, and persists the new cursor position via `*cursor`. Stops when +// the visitor's quota depletes, the cursor wraps, or the global namespace +// pointer goes away. Mutates per-db memory accounting on the slice. +DbSliceResult RunPrimeTableSlice(DbSlice* slice, size_t* dbid, uint64_t* cursor, + PageUsage* visitor) { + // Skip past invalid dbs (e.g., dropped, not yet allocated). 
+  while (!slice->IsDbValid(*dbid) && *dbid + 1 < slice->db_array_size())
+    ++*dbid;
+
+  if (!slice->IsDbValid(*dbid)) {
+    return DbSliceResult{.finished_all_dbs = true};
+  }
+
+  auto* prime_table = slice->GetTables(*dbid);
+  PrimeTable::Cursor cur{*cursor};
+  DbSliceResult result;
+  const DbTable* db_table = slice->GetDBTable(*dbid);
+
+  const size_t dbid_before = *dbid;
+  const uint64_t cursor_before = cur.token();
+  const uint64_t start_ns = absl::GetCurrentTimeNanos();
+  uint64_t traverse_calls = 0;
+
+  bool quota_depleted = false;
+  bool should_stop = false;
+  bool cursor_done = false;
+  bool namespaces_null = false;
+
+  const bool read_only = visitor->IsReadOnly();
+  const bool defrag_keys = visitor->ShouldDefragKeys();
+  do {
+    visitor->SetCurrentBucketCursor(cur.token());
+    cur = prime_table->Traverse(cur, [&](PrimeIterator it) {
+      if (read_only) {
+        if (defrag_keys && it->first.HasAllocated()) {
+          it->first.DefragIfNeeded(visitor);
+        }
+        it->second.DefragIfNeeded(visitor);
+        ++result.attempts;
+        return;
+      }
+      const ssize_t orig_val_size = it->second.MallocUsed();
+      const bool did_val = it->second.DefragIfNeeded(visitor);
+      bool did_key = false;
+      ssize_t orig_key_size = 0;
+      if (defrag_keys && it->first.HasAllocated()) {
+        orig_key_size = it->first.MallocUsed();
+        did_key = it->first.DefragIfNeeded(visitor);
+        if (did_key) {
+          if (const ssize_t delta = it->first.MallocUsed() - orig_key_size; delta != 0) {
+            db_table->stats.AddTypeMemoryUsage(OBJ_KEY, delta);
+          }
+        }
+      }
+      ++result.attempts;
+      if (did_val || did_key) {
+        ++result.reallocations;
+        if (did_val) {
+          result.bytes_moved += static_cast<uint64_t>(orig_val_size);
+          ++result.val_reallocations;
+          if (const ssize_t delta = it->second.MallocUsed() - orig_val_size; delta != 0) {
+            db_table->stats.AddTypeMemoryUsage(it->second.ObjType(), delta);
+          }
+        }
+        if (did_key) {
+          result.bytes_moved += static_cast<uint64_t>(orig_key_size);
+          ++result.key_reallocations;
+        }
+      }
+    });
+    ++traverse_calls;
+
+    quota_depleted = visitor->QuotaDepleted();
+    should_stop = visitor->ShouldStop();
+    cursor_done = !cur;
+    namespaces_null = !namespaces;
+  } while (!quota_depleted && !should_stop && !cursor_done && !namespaces_null);
+
+  const double elapsed_ms = static_cast<double>(absl::GetCurrentTimeNanos() - start_ns) / 1e6;
+  LOG(INFO) << absl::StrFormat(
+      "defrag[Slice] dbid=%zu cursor=%llu->%llu traverses=%llu attempts=%llu "
+      "reallocs=%llu(keys=%llu vals=%llu) bytes_moved=%.2fMiB "
+      "took=%.1fms exit{quota=%d stop=%d cursor_done=%d ns_null=%d}",
+      dbid_before, cursor_before, cur.token(), traverse_calls, result.attempts,
+      result.reallocations, result.key_reallocations, result.val_reallocations,
+      static_cast<double>(result.bytes_moved) / (1024.0 * 1024.0), elapsed_ms, quota_depleted,
+      should_stop, cursor_done, namespaces_null);
+
+  *cursor = cur.token();
+  if (*cursor == kCursorDoneState) {
+    ++*dbid;
+  }
+  return result;
+}
+
+// Hinted variant: visits only the buckets in `hints` for the current dbid,
+// resuming from `*cursor_idx` so quota-bounded calls can drain a large hint
+// set across multiple invocations. Each hint is replayed via a single
+// Traverse(Cursor{hint}, cb) call which the underlying DashTable resolves
+// to that one logical bucket (see core/dash.h). On quota exhaustion we
+// persist the next index back into *cursor_idx; once we reach hints.size()
+// the slice is "finished".
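+//
+// Hypothetical caller sketch (locals are illustrative, not part of this
+// patch): drain one db's hint set across repeated quota-bounded calls:
+//
+//   size_t idx = 0;  // persisted between idle-task invocations
+//   DbSliceResult r;
+//   do {
+//     PageUsage visitor{CollectPageStats::NO, threshold,
+//                       CycleQuota{CycleQuota::kDefaultDefragQuota}};
+//     r = RunPrimeTableHinted(&slice, dbid, hints, &idx, &visitor);
+//   } while (!r.finished_all_dbs);  // yield to the proactor between calls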
+DbSliceResult RunPrimeTableHinted(DbSlice* slice, size_t dbid, const std::vector<uint64_t>& hints,
+                                  size_t* cursor_idx, PageUsage* visitor) {
+  if (!slice->IsDbValid(dbid)) {
+    return DbSliceResult{.finished_all_dbs = true};
+  }
+
+  auto* prime_table = slice->GetTables(dbid);
+  const DbTable* db_table = slice->GetDBTable(dbid);
+
+  DbSliceResult result;
+  const uint64_t start_ns = absl::GetCurrentTimeNanos();
+  bool quota_depleted = false;
+  bool should_stop = false;
+  bool namespaces_null = false;
+  const bool defrag_keys = visitor->ShouldDefragKeys();
+
+  const size_t start_idx = cursor_idx ? *cursor_idx : 0;
+  size_t i = start_idx;
+
+  for (; i < hints.size(); ++i) {
+    const uint64_t h = hints[i];
+    PrimeTable::Cursor cur{h};
+    prime_table->Traverse(cur, [&](PrimeIterator it) {
+      const ssize_t orig_val_size = it->second.MallocUsed();
+      const bool did_val = it->second.DefragIfNeeded(visitor);
+      bool did_key = false;
+      ssize_t orig_key_size = 0;
+      if (defrag_keys && it->first.HasAllocated()) {
+        orig_key_size = it->first.MallocUsed();
+        did_key = it->first.DefragIfNeeded(visitor);
+        if (did_key) {
+          if (const ssize_t delta = it->first.MallocUsed() - orig_key_size; delta != 0) {
+            db_table->stats.AddTypeMemoryUsage(OBJ_KEY, delta);
+          }
+        }
+      }
+      ++result.attempts;
+      if (did_val || did_key) {
+        ++result.reallocations;
+        if (did_val) {
+          result.bytes_moved += static_cast<uint64_t>(orig_val_size);
+          ++result.val_reallocations;
+          if (const ssize_t delta = it->second.MallocUsed() - orig_val_size; delta != 0) {
+            db_table->stats.AddTypeMemoryUsage(it->second.ObjType(), delta);
+          }
+        }
+        if (did_key) {
+          result.bytes_moved += static_cast<uint64_t>(orig_key_size);
+          ++result.key_reallocations;
+        }
+      }
+    });
+
+    quota_depleted = visitor->QuotaDepleted();
+    should_stop = visitor->ShouldStop();
+    namespaces_null = !namespaces;
+    if (quota_depleted || should_stop || namespaces_null) {
+      ++i;  // we finished bucket i; resume from i+1 next call
+      break;
+    }
+  }
+
+  if (cursor_idx) {
+    *cursor_idx = i;
+  }
+
+  const size_t hints_visited = i - start_idx;
+  const double elapsed_ms = static_cast<double>(absl::GetCurrentTimeNanos() - start_ns) / 1e6;
+  LOG(INFO) << absl::StrFormat(
+      "defrag[Hinted] dbid=%zu hints_this_call=%zu pos=%zu/%zu attempts=%llu "
+      "reallocs=%llu(keys=%llu vals=%llu) bytes_moved=%.2fMiB "
+      "took=%.1fms exit{quota=%d stop=%d ns_null=%d}",
+      dbid, hints_visited, i, hints.size(), result.attempts, result.reallocations,
+      result.key_reallocations, result.val_reallocations,
+      static_cast<double>(result.bytes_moved) / (1024.0 * 1024.0), elapsed_ms, quota_depleted,
+      should_stop, namespaces_null);
+
+  result.finished_all_dbs = !quota_depleted && !should_stop && i == hints.size();
+  return result;
+}
+
 bool HasContendedLocks(ShardId shard_id, Transaction* trx, const DbTable* table) {
   auto is_contended = [table](LockFp fp) { return table->trans_locks.Find(fp)->IsContended(); };
@@ -259,17 +468,14 @@ EngineShard::Stats& EngineShard::Stats::operator+=(const Stats& o) {
   return *this;
 }
 
-void EngineShard::DefragTaskState::UpdateScanState(uint64_t cursor_val) {
-  cursor = cursor_val;
-  // Once we're done with a db, jump to the next
-  if (cursor == kCursorDoneState) {
-    dbid++;
-  }
-}
-
-void EngineShard::DefragTaskState::ResetScanState() {
-  dbid = cursor = 0u;
-}
+enum class DefragSkipReason : uint8_t {
+  MemoryTooLow,
+  MemoryBelowThreshold,
+  CheckWithinInterval,
+  NotEnoughFragmentation,
+  CheckInProgress,
+  NotSkipped,
+};
 
 // This function checks 3 things:
 // 1. Don't try memory fragmentation if we don't use "enough" memory (control by
@@ -277,10 +483,12 @@ void EngineShard::DefragTaskState::ResetScanState() {
 // 2. We have memory blocks that can be better utilized (there is a "wasted memory" in them).
 // 3. in case the above is OK, make sure that we have a "gap" between usage and commited memory
 // (control by mem_defrag_waste_threshold flag)
-EngineShard::DefragTaskState::SkipReason EngineShard::DefragTaskState::CheckRequired() {
-  using enum SkipReason;
-  if (cursor > kCursorDoneState) {
-    VLOG(2) << "cursor: " << cursor;
+DefragSkipReason ShouldStartDefrag(DefragTaskState* state) {
+  using enum DefragSkipReason;
+  // Mid-cycle: keep going regardless of memory / interval gates. Legacy uses
+  // cursor > 0; phased uses phase != IDLE. Either signals "in progress".
+  if (state->cursor > kCursorDoneState || state->phase != DefragPhase::IDLE) {
+    VLOG(2) << "cursor: " << state->cursor << " phase: " << static_cast<int>(state->phase);
     return NotSkipped;
   }
 
@@ -302,7 +510,7 @@ EngineShard::DefragTaskState::SkipReason EngineShard::DefragTaskState::CheckRequ
   if (finfo.bin == 0) {
     // did not start the iterative checking yet
     const auto now = time(nullptr);
-    const auto seconds_from_prev_check = now - last_check_time;
+    const auto seconds_from_prev_check = now - state->last_check_time;
    const auto mem_defrag_interval = GetFlag(FLAGS_mem_defrag_check_sec_interval);
 
     if (seconds_from_prev_check < mem_defrag_interval) {
@@ -312,17 +520,17 @@ EngineShard::DefragTaskState::SkipReason EngineShard::DefragTaskState::CheckRequ
     // start checking.
     finfo.committed = finfo.committed_golden = 0;
     finfo.wasted = 0;
-    page_utilization_threshold = GetFlag(FLAGS_mem_defrag_page_utilization_threshold);
+    state->page_utilization_threshold = GetFlag(FLAGS_mem_defrag_page_utilization_threshold);
   }
 
   uint64_t start = absl::GetCurrentTimeNanos();
-  int res = zmalloc_get_allocator_fragmentation_step(page_utilization_threshold, &finfo);
+  int res = zmalloc_get_allocator_fragmentation_step(state->page_utilization_threshold, &finfo);
   uint64_t duration = absl::GetCurrentTimeNanos() - start;
   VLOG(1) << "Reading memory usage took " << duration << " ns on bin " << finfo.bin - 1;
 
   if (res == 0) {
     // finished checking.
-    last_check_time = time(nullptr);
+    state->last_check_time = time(nullptr);
 
     if (finfo.committed != finfo.committed_golden) {
       LOG_FIRST_N(ERROR, 100) << "committed memory computed incorrectly: " << finfo.committed
@@ -339,7 +547,7 @@ EngineShard::DefragTaskState::SkipReason EngineShard::DefragTaskState::CheckRequ
   return CheckInProgress;
 }
 
-std::optional<CollectedPageStats> EngineShard::DoDefrag(PageUsage* page_usage) {
+DefragShardReport EngineShard::DoDefrag(PageUsage* page_usage) {
   // --------------------------------------------------------------------------
   // NOTE: This task is running with exclusive access to the shard.
   // i.e. - Since we are using shared nothing access here, and all access
@@ -350,43 +558,88 @@ std::optional<CollectedPageStats> EngineShard::DoDefrag(PageUsage* page_usage) {
 
   // TODO: enable tiered storage on non-default db slice
   DbSlice& slice = namespaces->GetDefaultNamespace().GetDbSlice(shard_->shard_id());
 
-  // If we moved to an invalid db, skip as long as it's not the last one
-  while (!slice.IsDbValid(defrag_state_.dbid) && defrag_state_.dbid + 1 < slice.db_array_size())
-    defrag_state_.dbid++;
+  const uint64_t start_ns = absl::GetCurrentTimeNanos();
+  const DefragPhase phase_start = defrag_state_.phase;
 
-  // If we found no valid db, we finished traversing and start from scratch next time
-  if (!slice.IsDbValid(defrag_state_.dbid)) {
-    defrag_state_.ResetScanState();
-    return std::nullopt;
-  }
+  // Re-arm page_usage's quota so it counts only DoDefrag's actual work. Without
+  // this, expensive HLL/hdr_histogram allocation in PageUsage's constructor
+  // (in the caller) is charged against the defrag budget.
+  page_usage->ArmQuotaTimer();
 
-  DCHECK(slice.IsDbValid(defrag_state_.dbid));
-  auto* prime_table = slice.GetTables(defrag_state_.dbid);
-  PrimeTable::Cursor cur{defrag_state_.cursor};
-  uint64_t reallocations = 0;
-  uint64_t attempts = 0;
+  if (GetFlag(FLAGS_experimental_defrag)) {
+    LOG(INFO) << absl::StrFormat("defrag[DoDefrag] shard=%u enter phase=%s threshold=%.2f",
+                                 shard_id_, PhaseName(phase_start), page_usage->threshold());
 
-  DbTable* db_table = slice.GetDBTable(defrag_state_.dbid);
-  do {
-    cur = prime_table->Traverse(cur, [&](PrimeIterator it) {
-      // for each value check whether we should move it because it
-      // seats on underutilized page of memory, and if so, do it.
-      const ssize_t original_size = it->second.MallocUsed();
-      const bool did = it->second.DefragIfNeeded(page_usage);
-      attempts++;
-      if (did) {
-        reallocations++;
-        if (const ssize_t delta = it->second.MallocUsed() - original_size; delta != 0) {
-          db_table->stats.AddTypeMemoryUsage(it->second.ObjType(), delta);
-        }
+    // Picked up at IDLE so a new cycle reflects flag changes; mid-cycle changes
+    // are deferred until the next cycle so CENSUS/EVACUATE see consistent state.
+    if (defrag_state_.phase == DefragPhase::IDLE) {
+      defrag_state_.per_block_move_cost_bytes = GetFlag(FLAGS_defrag_per_block_move_cost_bytes);
+    }
+
+    auto walker = [&](PageUsage* visitor, const std::vector<uint64_t>* hints, size_t* hint_cursor) {
+      if (hints != nullptr) {
+        return RunPrimeTableHinted(&slice, defrag_state_.dbid, *hints, hint_cursor, visitor);
       }
-    });
-  } while (!page_usage->QuotaDepleted() && cur && namespaces);
+      return RunPrimeTableSlice(&slice, &defrag_state_.dbid, &defrag_state_.cursor, visitor);
+    };
+    RunPhaseDefrag(&defrag_state_, page_usage->threshold(),
+                   CycleQuota{CycleQuota::kDefaultDefragQuota}, walker);
+
+    page_usage->ExtendQuota(50);
+    shard_search_indices_->Defragment(page_usage);
+
+    stats_.defrag_task_invocation_total++;
+
+    DefragShardReport report;
+    report.summary.phase_start = phase_start;
+    report.summary.phase_end = defrag_state_.phase;
+    report.summary.duration_us = (absl::GetCurrentTimeNanos() - start_ns) / 1000;
+    // TODO(defrag): quota_depleted here reflects page_usage's quota, which arms
+    // when page_usage is constructed (in the caller, before we entered DoDefrag)
+    // and only gates the trailing search-index defrag. RunPhaseDefrag uses its
+    // own internal CycleQuota that's not surfaced. Empirically on an empty DB
+    // we observe quota_depleted=true while cycle_finished=false, which suggests
+    // the phased CycleQuota is exhausted inside individual step walks (150us is
+    // tight) and the cycle never reaches VERIFY in a single invocation. Add
+    // per-phase logging and consider plumbing the phased quota's depletion bit
+    // into the summary so we can tell which quota actually stopped us.
+    report.summary.quota_depleted = page_usage->QuotaDepleted();
+    report.summary.finished_all_dbs = defrag_state_.cycle_stats.cycle_finished;
+    report.cycle_stats = defrag_state_.cycle_stats;
+    report.page_usage_stats = page_usage->CollectedStats();
+    report.work_pending = defrag_state_.phase != DefragPhase::IDLE;
+
+    LOG(INFO) << absl::StrFormat(
+        "defrag[DoDefrag] shard=%u exit phase=%s->%s duration=%lluus quota_depleted=%d "
+        "work_pending=%d cycle_finished=%d",
+        shard_id_, PhaseName(report.summary.phase_start), PhaseName(report.summary.phase_end),
+        report.summary.duration_us, report.summary.quota_depleted, report.work_pending,
+        report.summary.finished_all_dbs);
+    return report;
+  }
+
+  const DbSliceResult slice_result =
+      RunPrimeTableSlice(&slice, &defrag_state_.dbid, &defrag_state_.cursor, page_usage);
+  const uint64_t attempts = slice_result.attempts;
+  uint64_t reallocations = slice_result.reallocations;
+  const bool finished_all_dbs = slice_result.finished_all_dbs;
+
+  DefragShardReport report;
+  report.summary.phase_start = phase_start;  // legacy path: always IDLE
+  report.summary.phase_end = DefragPhase::IDLE;
+
+  if (finished_all_dbs) {
+    defrag_state_.ResetScanState();
+    report.summary.duration_us = (absl::GetCurrentTimeNanos() - start_ns) / 1000;
+    report.summary.quota_depleted = page_usage->QuotaDepleted();
+    report.summary.finished_all_dbs = true;
+    report.work_pending = false;
+    return report;
+  }
+
   const uint64_t used_cycles = page_usage->UsedQuotaCycles();
   const uint64_t usec = base::CycleClock::ToUsec(used_cycles);
 
-  defrag_state_.UpdateScanState(cur.token());
-
   page_usage->ExtendQuota(50);
   const auto [quota_depleted, objects_moved] = shard_search_indices_->Defragment(page_usage);
   reallocations += objects_moved;
@@ -409,7 +662,7 @@
                 slice.shard_id(), used_cycles, usec, cursor_state);
   }
 
-  return page_usage->CollectedStats();
+  report.page_usage_stats = page_usage->CollectedStats();
+  report.summary.duration_us = (absl::GetCurrentTimeNanos() - start_ns) / 1000;
+  report.summary.quota_depleted = page_usage->QuotaDepleted();
+  report.summary.finished_all_dbs = false;
+  report.work_pending = true;  // !finished_all_dbs path
+  return report;
 }
 
 // the memory defragmentation task is as follow:
@@ -421,21 +679,20 @@
 // priority.
 // otherwise lower the task priority so that it would not use the CPU when not required
 uint32_t EngineShard::DefragTask() {
-  using enum DefragTaskState::SkipReason;
+  using enum DefragSkipReason;
   constexpr uint32_t kRunAtLowPriority = 0u;
 
   if (!namespaces) {
     return kRunAtLowPriority;
   }
 
-  if (auto check_result = defrag_state_.CheckRequired(); check_result == NotSkipped) {
+  if (auto check_result = ShouldStartDefrag(&defrag_state_); check_result == NotSkipped) {
    VLOG(2) << shard_id_ << ": need to run defrag memory cursor state: " << defrag_state_.cursor;
     static const float threshold = GetFlag(FLAGS_mem_defrag_page_utilization_threshold);
     // TODO (abhijat): implement move ctor for PageUsage so this object can be moved into the task.
    PageUsage page_usage{CollectPageStats::NO, threshold,
                         CycleQuota{CycleQuota::kDefaultDefragQuota}};
-    if (DoDefrag(&page_usage)) {
-      // we didn't finish the scan
+    if (DoDefrag(&page_usage).work_pending) {
       return ProactorBase::kOnIdleMaxLevel;
     }
   } else {
@@ -474,6 +731,7 @@ EngineShard::EngineShard(util::ProactorBase* pb, mi_heap_t* heap)
       queue2_(kQueueLen / 2, 2, 2),
       shard_id_(pb->GetPoolIndex()),
       mi_resource_(heap) {
+  defrag_state_.shard_id = shard_id_;
   queue_.Start(absl::StrCat("shard_queue_", shard_id()));
   queue2_.Start(absl::StrCat("l2_queue_", shard_id()));
 }
@@ -537,7 +795,9 @@ void EngineShard::StartPeriodicHeartbeatFiber(util::ProactorBase* pb) {
   fiber_heartbeat_periodic_ = fb2::Fiber(fb_opts, [this, period_ms, heartbeat]() mutable {
     RunFPeriodically(heartbeat, period_ms, "heartbeat", &fiber_heartbeat_periodic_done_);
   });
-  defrag_task_id_ = pb->AddOnIdleTask([this]() { return DefragTask(); });
+  if (absl::GetFlag(FLAGS_enable_bg_defrag)) {
+    defrag_task_id_ = pb->AddOnIdleTask([this]() { return DefragTask(); });
+  }
 }
 
@@ -568,6 +828,14 @@ void EngineShard::InitThreadLocal(ProactorBase* pb) {
   SmallString::InitThreadLocal(data_heap);
   InitTLStatelessAllocMR(shard_->memory_resource());
 
+  // Register the mimalloc underutil callback once (process-wide); each shard
+  // thread's tracker storage is thread_local, so the single callback dispatches
+  // naturally to per-shard sets. Threshold is the same value defrag uses to
+  // classify under-utilized pages.
+  defrag_underutil::InitOnce();
+  const float thr = GetFlag(FLAGS_mem_defrag_page_utilization_threshold);
+  defrag_underutil::SetThresholdPct(static_cast<uint8_t>(std::clamp(thr * 100.0f, 0.0f, 100.0f)));
+
   shard_->shard_search_indices_ = std::make_unique<ShardDocIndices>();
 }
 
@@ -775,7 +1043,7 @@ void EngineShard::Heartbeat() {
     stalled_start_ns_ = 0;
 
   thread_local bool check_huffman = (shard_id_ == 0);  // run it only on shard 0.
-  if (check_huffman) {
+  if (check_huffman && !absl::GetFlag(FLAGS_disable_huffman_check)) {
     auto* ptr = db_slice.GetDBTable(0);
     if (ptr) {
       size_t key_usage = ptr->stats.memory_usage_by_type[OBJ_KEY];
diff --git a/src/server/engine_shard.h b/src/server/engine_shard.h
index 513fd8b388e8..08322608ba04 100644
--- a/src/server/engine_shard.h
+++ b/src/server/engine_shard.h
@@ -10,6 +10,7 @@
 #include "core/task_queue.h"
 #include "core/tx_queue.h"
 #include "server/common_types.h"
+#include "server/defrag.h"
 #include "util/sliding_counter.h"
 
 typedef char* sds;
@@ -209,8 +210,7 @@
   void FinalizeMulti(Transaction* tx);
 
   // Scan the shard with the cursor and apply defragmentation for database entries.
-  // Returns collected page stats if defragmentation was performed.
-  std::optional<CollectedPageStats> DoDefrag(PageUsage* page_usage);
+  DefragShardReport DoDefrag(PageUsage* page_usage);
 
   uint64_t GetDefragCursor() const {
     return defrag_state_.cursor;
@@ -220,29 +220,6 @@
   size_t CompactTable(double threshold, DbIndex db_idx);
 
  private:
-  struct DefragTaskState {
-    size_t dbid = 0u;
-    uint64_t cursor = 0u;
-    time_t last_check_time = 0;
-    float page_utilization_threshold = 0.8;
-
-    enum class SkipReason : uint8_t {
-      MemoryTooLow,
-      MemoryBelowThreshold,
-      CheckWithinInterval,
-      NotEnoughFragmentation,
-      CheckInProgress,
-      NotSkipped,
-    };
-
-    // check the current threshold and return a reason if we skip the defragmentation
-    SkipReason CheckRequired();
-
-    void UpdateScanState(uint64_t cursor_val);
-
-    void ResetScanState();
-  };
-
   struct EvictionTaskState {
     void Reset(bool rss_eviction_enabled_flag) {
       rss_eviction_enabled = rss_eviction_enabled_flag;
diff --git a/src/server/memory_cmd.cc b/src/server/memory_cmd.cc
index 8ec9c0c4a6ae..56d0e97744fc 100644
--- a/src/server/memory_cmd.cc
+++ b/src/server/memory_cmd.cc
@@ -282,19 +282,20 @@ void MemoryCmd::Run(CmdArgList args) {
     static const float default_threshold =
         absl::GetFlag(FLAGS_mem_defrag_page_utilization_threshold);
     const float threshold = parser.NextOrDefault(default_threshold);
+    if (parser.HasError()) {
+      return cmd_cntx_->SendError(parser.TakeError().MakeReply());
+    }
 
-    std::vector<CollectedPageStats> results(shard_set->size());
+    std::vector<DefragShardReport> results(shard_set->size());
     shard_set->pool()->AwaitFiberOnAll([threshold, &results](util::ProactorBase*) {
       if (auto* shard = EngineShard::tlocal(); shard) {
         PageUsage page_usage{CollectPageStats::YES, threshold,
                              CycleQuota{CycleQuota::kDefaultDefragQuota}};
-        if (auto shard_res = shard->DoDefrag(&page_usage); shard_res.has_value()) {
-          results[shard->shard_id()] = std::move(shard_res.value());
-        }
+        results[shard->shard_id()] = shard->DoDefrag(&page_usage);
       }
     });
 
-    const CollectedPageStats merged = CollectedPageStats::Merge(std::move(results), threshold);
+    const DefragMergedReport merged = DefragMergedReport::Merge(std::move(results));
     auto* rb = static_cast<RedisReplyBuilder*>(cmd_cntx_->rb());
     return rb->SendVerbatimString(merged.ToString());
   }
diff --git a/tools/defrag_baseline.py b/tools/defrag_baseline.py
new file mode 100755
index 000000000000..d75713bf442b
--- /dev/null
+++ b/tools/defrag_baseline.py
@@ -0,0 +1,604 @@
+#!/usr/bin/env python3
+"""Create an uneven memory-fragmentation baseline for defrag experiments.
+
+This script only creates and deletes keys. It does not run MEMORY DEFRAGMENT and
+does not wait for background defrag counters.
+"""
+
+import argparse
+import asyncio
+import contextlib
+import os
+import random
+from collections import Counter, defaultdict
+from dataclasses import dataclass
+
+import aioredis
+
+
+@dataclass(frozen=True)
+class WideBand:
+    name: str
+    value_size: int
+    byte_share: float
+    profile: str
+
+
+WIDE_BANDS = [
+    WideBand("tiny", 64, 0.02, "0.99:6,0.90:3,0.60:1"),
+    WideBand("small", 128, 0.04, "0.98:4,0.80:3,0.35:2,0.08:1"),
+    WideBand("med1", 256, 0.10, "0.98:3,0.65:2,0.25:3,0.04:2"),
+    WideBand("med2", 512, 0.22, "0.95:3,0.55:3,0.15:3,0.03:1"),
+    WideBand("large1", 1024, 0.28, "0.95:3,0.60:3,0.20:3,0.05:1"),
+    WideBand("large2", 2048, 0.24, "0.90:4,0.45:3,0.12:2,0.03:1"),
+    WideBand("huge", 4096, 0.10, "0.85:5,0.35:3,0.08:2"),
+]
+
+
+def parse_profile(profile: str) -> list[tuple[float, int]]:
+    """Parse "live_ratio:weight,..."
into weighted live-ratio entries.""" + entries = [] + for raw_part in profile.split(","): + part = raw_part.strip() + if not part: + continue + ratio_text, weight_text = part.split(":", 1) + ratio = float(ratio_text) + weight = int(weight_text) + if not 0.0 <= ratio <= 1.0: + raise ValueError(f"live ratio must be in [0, 1]: {ratio}") + if weight <= 0: + raise ValueError(f"profile weight must be positive: {weight}") + entries.append((ratio, weight)) + + if not entries: + raise ValueError("profile must contain at least one live_ratio:weight entry") + return entries + + +def make_ratio_deck(profile: list[tuple[float, int]], rng: random.Random) -> list[float]: + deck = [] + for live_ratio, weight in profile: + deck.extend([live_ratio] * weight) + rng.shuffle(deck) + return deck + + +def key_name(prefix: str, key_id: int) -> str: + return f"{prefix}:{key_id}" + + +def parse_arena_summary(report: str) -> tuple[str | None, list[tuple[int, int, int, int, float]]]: + """Return the machine-wide Total line and top-level bin rows from MEMORY ARENA SUMMARY.""" + lines = report.splitlines() + machine_start = 0 + for index, line in enumerate(lines): + if "Arena statistics for machine" in line: + machine_start = index + + machine_lines = lines[machine_start:] + + total_line = None + rows = [] + for line in machine_lines: + parts = line.replace("%", "").split() + if not parts: + continue + if parts[0] == "Total:" and len(parts) >= 6: + total_line = line + continue + if len(parts) != 6 or not parts[0].isdigit(): + continue + + block_size, _reserved, committed, used, wasted, waste_pct = parts + rows.append((int(wasted), int(block_size), int(committed), int(used), float(waste_pct))) + + rows.sort(reverse=True) + return total_line, rows + + +def print_arena_summary(report: str, top_bins: int) -> None: + total_line, rows = parse_arena_summary(report) + + if total_line: + parts = total_line.replace("%", "").split() + print( + "arena_total " + f"reserved={int(parts[1]):,} " + f"committed={int(parts[2]):,} " + f"used={int(parts[3]):,} " + f"wasted={int(parts[4]):,} " + f"waste_pct={float(parts[5]):.2f}%" + ) + else: + print("arena_total unavailable") + + if top_bins <= 0: + return + + print("top_waste_bins:") + for wasted, block_size, committed, used, waste_pct in rows[:top_bins]: + print( + f" block={block_size} committed={committed:,} used={used:,} " + f"wasted={wasted:,} waste_pct={waste_pct:.2f}%" + ) + + +def format_bytes(size: int) -> str: + value = float(size) + for unit in ("B", "KiB", "MiB", "GiB", "TiB"): + if value < 1024.0 or unit == "TiB": + if unit == "B": + return f"{int(value)}B" + return f"{value:.1f}{unit}" + value /= 1024.0 + + raise AssertionError("unreachable") + + +def format_bytes_pair(size: int) -> str: + return f"{size:,} ({format_bytes(size)})" + + +async def snapshot( + connection: aioredis.Redis, label: str, include_arena: bool, top_bins: int +) -> None: + print(f"\n=== {label} ===") + + info = await connection.execute_command("INFO", "memory") + print(f"used_memory={int(info.get('used_memory', 0)):,}") + print(f"used_memory_rss={int(info.get('used_memory_rss', 0)):,}") + print(f"object_used_memory={int(info.get('object_used_memory', 0)):,}") + print(f"table_used_memory={int(info.get('table_used_memory', 0)):,}") + + if include_arena: + report = await connection.execute_command("MEMORY", "ARENA", "SUMMARY") + print_arena_summary(report, top_bins) + + +async def populate( + connection: aioredis.Redis, keys_count: int, prefix: str, value_size: int +) -> None: + print(f"creating 
{keys_count:,} keys prefix={prefix!r} value_size={value_size:,}") + await connection.execute_command("DEBUG", "POPULATE", keys_count, prefix, value_size) + + +async def flushdb(connection: aioredis.Redis) -> None: + print("flushing current database") + await connection.execute_command("FLUSHDB") + + +async def delete_batches(connection: aioredis.Redis, keys: list[str], batch_size: int) -> int: + deleted = 0 + for start in range(0, len(keys), batch_size): + deleted += await connection.delete(*keys[start : start + batch_size]) + return deleted + + +async def delete_fragmented_chunks( + connection: aioredis.Redis, + *, + rng: random.Random, + prefix: str, + keys_count: int, + value_size: int, + chunk_keys: int, + profile_text: str, + delete_batch: int, + snapshot_every_chunks: int, + snapshot_prefix: str, + include_arena: bool, + top_arena_bins: int, +) -> dict: + profile = parse_profile(profile_text) + deck = make_ratio_deck(profile, rng) + + chunks_by_ratio = Counter() + deleted_by_ratio = defaultdict(int) + live_by_ratio = defaultdict(int) + total_deleted = 0 + chunks = 0 + + for chunk_start in range(0, keys_count, chunk_keys): + chunk_end = min(keys_count, chunk_start + chunk_keys) + chunk_size = chunk_end - chunk_start + + if chunks and chunks % len(deck) == 0: + deck = make_ratio_deck(profile, rng) + + live_ratio = deck[chunks % len(deck)] + live_count = round(chunk_size * live_ratio) + delete_count = chunk_size - live_count + + ids = list(range(chunk_start, chunk_end)) + delete_ids = rng.sample(ids, delete_count) + delete_keys = [key_name(prefix, key_id) for key_id in delete_ids] + + deleted = await delete_batches(connection, delete_keys, delete_batch) + live = chunk_size - deleted + + chunks_by_ratio[live_ratio] += 1 + deleted_by_ratio[live_ratio] += deleted + live_by_ratio[live_ratio] += live + total_deleted += deleted + chunks += 1 + + if snapshot_every_chunks and chunks % snapshot_every_chunks == 0: + await snapshot( + connection, + f"{snapshot_prefix}_after_delete_chunk_{chunks}", + include_arena, + top_arena_bins, + ) + + return { + "chunks": chunks, + "chunk_keys": chunk_keys, + "profile": profile_text, + "created_keys": keys_count, + "deleted_keys": total_deleted, + "live_keys": keys_count - total_deleted, + "value_size": value_size, + "chunks_by_ratio": chunks_by_ratio, + "deleted_by_ratio": deleted_by_ratio, + "live_by_ratio": live_by_ratio, + } + + +def print_fragmentation_summary(summary: dict, *, label: str, seed: int) -> None: + print("\n=== planned_fragmentation ===") + print(f"label={label}") + print(f"chunks={summary['chunks']:,}") + print(f"chunk_keys={summary['chunk_keys']:,}") + print(f"seed={seed}") + print(f"profile={summary['profile']}") + print(f"value_size={summary['value_size']:,}") + print(f"created_keys={summary['created_keys']:,}") + print(f"deleted_keys={summary['deleted_keys']:,}") + print(f"live_keys={summary['live_keys']:,}") + print( + "estimated_created_value_bytes=" + f"{format_bytes_pair(summary['created_keys'] * summary['value_size'])}" + ) + print( + "estimated_deleted_value_bytes=" + f"{format_bytes_pair(summary['deleted_keys'] * summary['value_size'])}" + ) + print( + "estimated_live_value_bytes=" + f"{format_bytes_pair(summary['live_keys'] * summary['value_size'])}" + ) + + print("\nby_live_ratio:") + for ratio in sorted(summary["chunks_by_ratio"].keys(), reverse=True): + print( + f" live_ratio={ratio:.2f} chunks={summary['chunks_by_ratio'][ratio]:,} " + f"live_keys={summary['live_by_ratio'][ratio]:,} " + 
+            f"deleted_keys={summary['deleted_by_ratio'][ratio]:,}"
+        )
+
+
+def print_wide_distribution(
+    summaries: list[tuple[WideBand, dict]], target_value_bytes: int
+) -> None:
+    total_created_keys = sum(summary["created_keys"] for _, summary in summaries)
+    total_live_keys = sum(summary["live_keys"] for _, summary in summaries)
+    total_deleted_keys = sum(summary["deleted_keys"] for _, summary in summaries)
+    total_created_bytes = sum(
+        summary["created_keys"] * summary["value_size"] for _, summary in summaries
+    )
+    total_live_bytes = sum(summary["live_keys"] * summary["value_size"] for _, summary in summaries)
+    total_deleted_bytes = sum(
+        summary["deleted_keys"] * summary["value_size"] for _, summary in summaries
+    )
+
+    print("\n=== wide_object_distribution ===")
+    print(f"target_value_bytes={format_bytes_pair(target_value_bytes)}")
+    print(f"created_value_bytes={format_bytes_pair(total_created_bytes)}")
+    print(f"live_value_bytes={format_bytes_pair(total_live_bytes)}")
+    print(f"deleted_value_bytes={format_bytes_pair(total_deleted_bytes)}")
+    print()
+
+    print(
+        f"{'band':<8} {'size':>6} {'share':>7} {'keys_created':>13} "
+        f"{'keys_live':>12} {'keys_deleted':>13} {'bytes_created':>13} "
+        f"{'bytes_live':>10} {'bytes_deleted':>13} {'live%':>7} {'chunks':>7}"
+    )
+
+    for band, summary in summaries:
+        created_keys = summary["created_keys"]
+        live_keys = summary["live_keys"]
+        deleted_keys = summary["deleted_keys"]
+        value_size = summary["value_size"]
+        created_bytes = created_keys * value_size
+        live_bytes = live_keys * value_size
+        deleted_bytes = deleted_keys * value_size
+        live_ratio = live_keys / created_keys if created_keys else 0.0
+
+        print(
+            f"{band.name:<8} {value_size:>6,} {band.byte_share:>7.1%} "
+            f"{created_keys:>13,} {live_keys:>12,} {deleted_keys:>13,} "
+            f"{format_bytes(created_bytes):>13} {format_bytes(live_bytes):>10} "
+            f"{format_bytes(deleted_bytes):>13} {live_ratio:>7.1%} "
+            f"{summary['chunks']:>7,}"
+        )
+
+    total_live_ratio = total_live_keys / total_created_keys if total_created_keys else 0.0
+    print(
+        f"{'total':<8} {'-':>6} {'100.0%':>7} {total_created_keys:>13,} "
+        f"{total_live_keys:>12,} {total_deleted_keys:>13,} "
+        f"{format_bytes(total_created_bytes):>13} {format_bytes(total_live_bytes):>10} "
+        f"{format_bytes(total_deleted_bytes):>13} {total_live_ratio:>7.1%} "
+        f"{sum(summary['chunks'] for _, summary in summaries):>7,}"
+    )
+
+
+def print_wide_live_ratio_distribution(summaries: list[tuple[WideBand, dict]]) -> None:
+    print("\n=== wide_live_ratio_distribution ===")
+    print(
+        f"{'band':<8} {'ratio':>7} {'chunks':>8} {'keys_total':>12} "
+        f"{'keys_live':>12} {'keys_deleted':>13} {'bytes_deleted':>14}"
+    )
+
+    for band, summary in summaries:
+        value_size = summary["value_size"]
+        for ratio in sorted(summary["chunks_by_ratio"].keys(), reverse=True):
+            live_keys = summary["live_by_ratio"][ratio]
+            deleted_keys = summary["deleted_by_ratio"][ratio]
+            keys = live_keys + deleted_keys
+            deleted_bytes = deleted_keys * value_size
+
+            print(
+                f"{band.name:<8} {ratio:>7.1%} {summary['chunks_by_ratio'][ratio]:>8,} "
+                f"{keys:>12,} {live_keys:>12,} {deleted_keys:>13,} "
+                f"{format_bytes(deleted_bytes):>14}"
+            )
+
+
+def profile_histogram_counts(summary: dict) -> list[int]:
+    # Buckets are live-ratio ranges: [0.80, 1.00], [0.40, 0.80), [0.10, 0.40), [0.00, 0.10).
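+    # Index order matches the histogram columns print_wide_profile_histogram
+    # renders below: bucket 0 holds the healthiest chunks (>= 80% live),
+    # bucket 3 the most hollowed-out (< 10% live).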
+ buckets = [0, 0, 0, 0] + for ratio, chunks in summary["chunks_by_ratio"].items(): + if ratio >= 0.80: + buckets[0] += chunks + elif ratio >= 0.40: + buckets[1] += chunks + elif ratio >= 0.10: + buckets[2] += chunks + else: + buckets[3] += chunks + return buckets + + +def render_histogram_cell(count: int, total: int, width: int = 12) -> str: + if count == 0 or total == 0: + return "-" + + pct = 100.0 * count / total + bars = max(1, round(width * count / total)) + return f"{'#' * bars} {pct:.0f}%" + + +def print_wide_profile_histogram(summaries: list[tuple[WideBand, dict]]) -> None: + print("\n=== wide_profile_histogram ===") + print( + f"{'band':<8} {'size':>6} {'chunks':>8} {'80-100% live':<18} " + f"{'40-80% live':<18} {'10-40% live':<18} {'0-10% live':<18}" + ) + + totals = [0, 0, 0, 0] + total_chunks = 0 + for band, summary in summaries: + counts = profile_histogram_counts(summary) + chunks = summary["chunks"] + totals = [cur + add for cur, add in zip(totals, counts)] + total_chunks += chunks + + print( + f"{band.name:<8} {band.value_size:>6,} {chunks:>8,} " + f"{render_histogram_cell(counts[0], chunks):<18} " + f"{render_histogram_cell(counts[1], chunks):<18} " + f"{render_histogram_cell(counts[2], chunks):<18} " + f"{render_histogram_cell(counts[3], chunks):<18}" + ) + + print( + f"{'total':<8} {'-':>6} {total_chunks:>8,} " + f"{render_histogram_cell(totals[0], total_chunks):<18} " + f"{render_histogram_cell(totals[1], total_chunks):<18} " + f"{render_histogram_cell(totals[2], total_chunks):<18} " + f"{render_histogram_cell(totals[3], total_chunks):<18}" + ) + + +async def create_uniform_fragmentation( + connection: aioredis.Redis, args: argparse.Namespace +) -> None: + rng = random.Random(args.seed) + + await flushdb(connection) + await snapshot(connection, "before_populate", args.arena, args.top_arena_bins) + await populate(connection, args.keys, args.key_name, args.value_size) + await snapshot(connection, "after_populate", args.arena, args.top_arena_bins) + + summary = await delete_fragmented_chunks( + connection, + rng=rng, + prefix=args.key_name, + keys_count=args.keys, + value_size=args.value_size, + chunk_keys=args.chunk_keys, + profile_text=args.profile, + delete_batch=args.delete_batch, + snapshot_every_chunks=args.snapshot_every_chunks, + snapshot_prefix="uniform", + include_arena=args.arena, + top_arena_bins=args.top_arena_bins, + ) + + print_fragmentation_summary(summary, label="uniform", seed=args.seed) + await snapshot(connection, "after_delete", args.arena, args.top_arena_bins) + + +def wide_chunk_keys(value_size: int) -> int: + return max(32, round((256 * 1024) / value_size)) + + +async def create_wide_fragmentation(connection: aioredis.Redis, args: argparse.Namespace) -> None: + rng = random.Random(args.seed) + target_value_bytes = args.keys * args.value_size + + await flushdb(connection) + await snapshot(connection, "before_populate", args.arena, args.top_arena_bins) + + band_specs = [] + for band in WIDE_BANDS: + keys_count = max(1, round((target_value_bytes * band.byte_share) / band.value_size)) + prefix = f"{args.key_name}:{band.name}" + chunk_keys = wide_chunk_keys(band.value_size) + band_specs.append((band, prefix, keys_count, chunk_keys)) + await populate(connection, keys_count, prefix, band.value_size) + + await snapshot(connection, "after_populate", args.arena, args.top_arena_bins) + + summaries = [] + for band, prefix, keys_count, chunk_keys in band_specs: + summary = await delete_fragmented_chunks( + connection, + rng=rng, + prefix=prefix, + 
keys_count=keys_count, + value_size=band.value_size, + chunk_keys=chunk_keys, + profile_text=band.profile, + delete_batch=args.delete_batch, + snapshot_every_chunks=args.snapshot_every_chunks, + snapshot_prefix=band.name, + include_arena=args.arena, + top_arena_bins=args.top_arena_bins, + ) + summaries.append((band, summary)) + + print("\n=== wide_workload ===") + print(f"seed={args.seed}") + print(f"target_value_bytes={target_value_bytes:,}") + print(f"bands={len(WIDE_BANDS)}") + + created_keys = sum(summary["created_keys"] for _, summary in summaries) + deleted_keys = sum(summary["deleted_keys"] for _, summary in summaries) + live_keys = sum(summary["live_keys"] for _, summary in summaries) + deleted_value_bytes = sum( + summary["deleted_keys"] * summary["value_size"] for _, summary in summaries + ) + live_value_bytes = sum(summary["live_keys"] * summary["value_size"] for _, summary in summaries) + + print(f"created_keys={created_keys:,}") + print(f"deleted_keys={deleted_keys:,}") + print(f"live_keys={live_keys:,}") + print(f"estimated_deleted_value_bytes={format_bytes_pair(deleted_value_bytes)}") + print(f"estimated_live_value_bytes={format_bytes_pair(live_value_bytes)}") + + print_wide_distribution(summaries, target_value_bytes) + print_wide_profile_histogram(summaries) + print_wide_live_ratio_distribution(summaries) + + print("\nby_band:") + for band, summary in summaries: + print( + f" band={band.name} value_size={band.value_size:,} " + f"byte_share={band.byte_share:.2f} chunk_keys={summary['chunk_keys']:,} " + f"profile={band.profile} created={summary['created_keys']:,} " + f"deleted={summary['deleted_keys']:,} live={summary['live_keys']:,}" + ) + + for band, summary in summaries: + print_fragmentation_summary(summary, label=f"wide:{band.name}", seed=args.seed) + + await snapshot(connection, "after_delete", args.arena, args.top_arena_bins) + + +async def create_fragmentation(connection: aioredis.Redis, args: argparse.Namespace) -> None: + if args.workload == "wide": + await create_wide_fragmentation(connection, args) + else: + await create_uniform_fragmentation(connection, args) + + +async def main(args: argparse.Namespace) -> None: + pool = aioredis.ConnectionPool( + host=args.server, + port=args.port, + db=0, + decode_responses=True, + max_connections=args.max_connections, + ) + connection = aioredis.Redis(connection_pool=pool) + if args.quiet: + with open(os.devnull, "w") as sink, contextlib.redirect_stdout(sink): + await create_fragmentation(connection, args) + else: + await create_fragmentation(connection, args) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Create an uneven fragmentation baseline.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("-k", "--keys", type=int, default=800_000, help="number of keys to create") + parser.add_argument( + "--mul", + type=float, + default=1.0, + help="scale --keys by this multiplier (e.g. 
--mul 5 -> 5x keys)", + ) + parser.add_argument( + "-v", "--value-size", type=int, default=645, help="value size for DEBUG POPULATE" + ) + parser.add_argument("-n", "--key-name", default="key-for-testing", help="base key name") + parser.add_argument("-s", "--server", default="localhost", help="server host") + parser.add_argument("-p", "--port", type=int, default=6379, help="server port") + parser.add_argument( + "--workload", + choices=["uniform", "wide"], + default="uniform", + help="fragmentation workload preset", + ) + + parser.add_argument("--seed", type=int, default=12345, help="deterministic deletion seed") + parser.add_argument( + "--chunk-keys", + type=int, + default=512, + help="contiguous key-id region size assigned one live ratio", + ) + parser.add_argument("--delete-batch", type=int, default=1000, help="DEL batch size") + parser.add_argument( + "--max-connections", type=int, default=16, help="redis connection pool size" + ) + parser.add_argument( + "--profile", + default="0.95:2,0.80:2,0.60:3,0.30:2,0.10:1", + help="comma-separated live_ratio:weight entries", + ) + parser.add_argument( + "--arena", action="store_true", help="include MEMORY ARENA SUMMARY snapshots" + ) + parser.add_argument( + "--top-arena-bins", + type=int, + default=5, + help="number of machine-wide arena waste bins to print", + ) + parser.add_argument( + "--snapshot-every-chunks", + type=int, + default=0, + help="print snapshots every N deleted chunks; 0 disables intermediate snapshots", + ) + parser.add_argument("--quiet", action="store_true", help="suppress normal progress output") + + args = parser.parse_args() + if args.mul != 1.0: + args.keys = int(args.keys * args.mul) + asyncio.run(main(args)) diff --git a/tools/defrag_compare.py b/tools/defrag_compare.py new file mode 100755 index 000000000000..48b60ae0b820 --- /dev/null +++ b/tools/defrag_compare.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 + +import argparse +import os + + +def field(row, side, name): + v = row.get(side) + if isinstance(v, dict): + return v.get(name) + return None + + +def default_label(path: str, df) -> str: + labels = df.get("label") + if labels is not None: + non_null = labels.dropna() + if not non_null.empty: + return str(non_null.iloc[0]) + return os.path.splitext(os.path.basename(path))[0] + + +def load_run(path: str, pd): + df = pd.read_json(path, lines=True) + label = default_label(path, df) + df["waste_pct"] = df.apply(lambda r: field(r, "after", "waste_pct"), axis=1) + df = df[["cycle", "waste_pct"]].dropna() + return label, df + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("runs", nargs="+", help="defrag driver JSONL files") + parser.add_argument("--out", default="defrag_compare.png") + parser.add_argument("--title", default="Defrag Fragmentation Progress") + parser.add_argument( + "--guide", + type=float, + action="append", + default=[20.0, 10.0, 5.0], + help="horizontal waste percentage guide line; may be repeated", + ) + args = parser.parse_args() + + import matplotlib.pyplot as plt + import pandas as pd + from matplotlib.ticker import MaxNLocator + + fig, ax = plt.subplots(figsize=(16, 7), constrained_layout=True) + + for path in args.runs: + label, df = load_run(path, pd) + if df.empty: + print(f"skipping empty run: {path}") + continue + markevery = max(1, len(df) // 90) + ax.plot( + df["cycle"], + df["waste_pct"], + linewidth=2.5, + marker="o", + markersize=3, + markevery=markevery, + label=label, + ) + + for guide in args.guide: + ax.axhline(guide, color="gray", linewidth=1, 
alpha=0.25) + ax.text( + 0.995, + guide, + f"{guide:g}%", + transform=ax.get_yaxis_transform(), + ha="right", + va="bottom", + color="gray", + fontsize=9, + ) + + ax.set_title(args.title) + ax.set_xlabel("Cycle") + ax.set_ylabel("Waste %") + ax.grid(True, alpha=0.25) + ax.legend() + ax.margins(x=0.01) + ax.xaxis.set_major_locator(MaxNLocator(nbins=18, integer=True)) + + fig.savefig(args.out, dpi=160) + print(f"wrote {args.out}") + + +if __name__ == "__main__": + main() diff --git a/tools/defrag_drive.py b/tools/defrag_drive.py new file mode 100755 index 000000000000..1b70115d55e4 --- /dev/null +++ b/tools/defrag_drive.py @@ -0,0 +1,464 @@ +#!/usr/bin/env python3 +"""Drive MEMORY DEFRAGMENT in a loop, recording fragmentation and per-shard +phase timings to JSONL for plotting. + +Connects to an already-running dragonfly. Each cycle: + 1. Capture stderr-log byte offset + 2. MEMORY ARENA SUMMARY -> before + 3. MEMORY DEFRAGMENT -> blocks until the slice finishes; capture reply + 4. MEMORY ARENA SUMMARY -> after + 5. Read log delta, parse defrag[*] lines, attribute per shard + 6. Emit one JSONL record + 7. Sleep + +After the final summary, the script runs FLUSHALL to clean up the generated +dataset. + +Run dragonfly with: + ./build-dbg/dragonfly --alsologtostderr ... +This writes glog files into /tmp/ with /tmp/dragonfly.INFO as a stable symlink +to the current INFO log; the script reads from that symlink. + +Plot in pandas with: + pd.read_json("run.jsonl", lines=True) + +Use the required label to keep old/new runs separate: + ./tools/defrag_drive.py old + ./tools/defrag_drive.py new +""" + +import argparse +import asyncio +import json +import os +import re +import time + +import redis.asyncio as aioredis + + +ARENA_TOTAL_RE = re.compile(r"Total:\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+([\d.]+)%") + + +def parse_arena_total(report: str) -> dict | None: + """Pull the machine-wide Total: row from MEMORY ARENA SUMMARY.""" + last_total = None + for line in report.splitlines(): + match = ARENA_TOTAL_RE.search(line) + if match: + last_total = match + if not last_total: + return None + reserved, committed, used, wasted, waste_pct = last_total.groups() + return { + "reserved": int(reserved), + "committed": int(committed), + "used": int(used), + "wasted": int(wasted), + "waste_pct": float(waste_pct), + } + + +# defrag[CYCLE_DONE] shard=0 cycle=7 targets_done=12/15 (80.0%) bytes_freed=... +CYCLE_DONE_RE = re.compile( + r"defrag\[CYCLE_DONE\]\s+shard=(\d+)\s+cycle=(\d+)\s+" + r"targets_done=(\d+)/(\d+)\s+\(([\d.]+)%\)\s+" + r"bytes_freed=([\d.]+)(KiB|MiB|GiB|B)/[^\s]+\s+\(([\d.]+)%\)\s+" + r"bytes_moved=([\d.]+)(KiB|MiB|GiB|B)\s+" + r"cycle_took=([\d.]+)ms\s+freed_rate=([\d.]+)MiB/s" +) + +PHASE_RE = re.compile( + r"defrag\[(CENSUS|PLAN|EVACUATE|VERIFY)\]\s+shard=(\d+)\s+cycle=(\d+)\s+" + r".*?\stook=([\d.]+)ms(?:\s+cpu=([\d.]+)ms)?" +) + +DO_DEFRAG_EXIT_RE = re.compile( + r"defrag\[DoDefrag\]\s+shard=(\d+)\s+exit\s+" + r"phase=([A-Z_]+)->([A-Z_]+)\s+" + r"duration=(\d+)us.*?" 
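+ # Illustrative line this pattern is meant to match (constructed from the
+ # regex itself, not copied from a real log; shown wrapped, it is one line):
+ # defrag[DoDefrag] shard=0 exit phase=EVACUATE->VERIFY duration=450us
+ # quota_depleted=0 work_pending=1 cycle_finished=0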
+ r"quota_depleted=(\d+)\s+work_pending=(\d+)\s+cycle_finished=(\d+)" +) + +UNIT = {"B": 1, "KiB": 1024, "MiB": 1024 * 1024, "GiB": 1024 * 1024 * 1024} + + +def record_committed_drop(record: dict) -> int: + before = record.get("before") or {} + after = record.get("after") or {} + if not before or not after: + return 0 + return before.get("committed", 0) - after.get("committed", 0) + + +def record_waste_drop(record: dict) -> float: + before = record.get("before") or {} + after = record.get("after") or {} + before_pct = before.get("waste_pct") + after_pct = after.get("waste_pct") + if before_pct is None or after_pct is None: + return 0.0 + return before_pct - after_pct + + +def record_work_pending(record: dict) -> bool | None: + pending = [s["work_pending"] for s in record.get("shards", []) if "work_pending" in s] + if not pending: + return None + return any(pending) + + +def stall_reason(records: list[dict], args: argparse.Namespace, stall_armed: bool) -> str | None: + if not stall_armed or args.stall_window <= 0 or len(records) < args.stall_window: + return None + if record_work_pending(records[-1]) is not False: + return None + + recent = records[-args.stall_window :] + first_before = recent[0].get("before") or {} + last_after = recent[-1].get("after") or {} + first_waste = first_before.get("waste_pct") + last_waste = last_after.get("waste_pct") + waste_drop = 0.0 + if first_waste is not None and last_waste is not None: + waste_drop = first_waste - last_waste + + committed_drop = sum(record_committed_drop(r) for r in recent) + min_committed_drop = int(args.stall_min_committed_drop_mb * 1024 * 1024) + + if waste_drop < args.stall_min_waste_drop and committed_drop < min_committed_drop: + return ( + f"stalled for {args.stall_window} driver iterations: " + f"waste_drop={waste_drop:.3f}pp " + f"committed_drop={committed_drop:,}B" + ) + + return None + + +def parse_log_delta(text: str) -> dict[int, dict]: + """Extract per-shard summary from a slice of dragonfly stderr.""" + by_shard: dict[int, dict] = {} + + for match in PHASE_RE.finditer(text): + phase, shard_str, cycle_str, took_ms, cpu_ms = match.groups() + shard = int(shard_str) + rec = by_shard.setdefault( + shard, + { + "shard": shard, + "cycle_id": int(cycle_str), + "phase_ms": {}, + "phase_cpu_ms": {}, + }, + ) + rec.setdefault("phase_cpu_ms", {}) + rec["phase_ms"][phase.lower()] = float(took_ms) + if cpu_ms is not None: + rec["phase_cpu_ms"][phase.lower()] = float(cpu_ms) + + for match in CYCLE_DONE_RE.finditer(text): + ( + shard_str, + cycle_str, + done, + total, + _done_pct, + freed_v, + freed_u, + _freed_pct, + moved_v, + moved_u, + cycle_took, + freed_rate, + ) = match.groups() + shard = int(shard_str) + rec = by_shard.setdefault( + shard, {"shard": shard, "cycle_id": int(cycle_str), "phase_ms": {}} + ) + rec.update( + { + "cycle_id": int(cycle_str), + "targets_complete": int(done), + "targets_total": int(total), + "bytes_freed": int(float(freed_v) * UNIT[freed_u]), + "bytes_moved": int(float(moved_v) * UNIT[moved_u]), + "cycle_took_ms": float(cycle_took), + "freed_rate_mibs": float(freed_rate), + } + ) + + for match in DO_DEFRAG_EXIT_RE.finditer(text): + shard_str, phase_start, phase_end, duration_us, quota, pending, finished = match.groups() + shard = int(shard_str) + rec = by_shard.setdefault(shard, {"shard": shard, "phase_ms": {}}) + rec.update( + { + "phase_start": phase_start, + "phase_end": phase_end, + "duration_us": int(duration_us), + "quota_depleted": bool(int(quota)), + "work_pending": bool(int(pending)), + 
"cycle_finished": bool(int(finished)), + } + ) + + return by_shard + + +async def get_arena(client: aioredis.Redis) -> dict | None: + report = await client.execute_command("MEMORY", "ARENA", "SUMMARY") + if isinstance(report, bytes): + report = report.decode() + return parse_arena_total(report) + + +async def run_defragment(client: aioredis.Redis) -> str: + reply = await client.execute_command("MEMORY", "DEFRAGMENT") + if isinstance(reply, bytes): + reply = reply.decode() + return reply + + +async def flushall(client: aioredis.Redis) -> None: + await client.execute_command("FLUSHALL") + + +def read_log_delta(path: str, start_offset: int) -> tuple[str, int]: + """Return text written to `path` since `start_offset`, plus new EOF.""" + with open(path, "rb") as fh: + fh.seek(0, os.SEEK_END) + end = fh.tell() + if end < start_offset: + # log was truncated/rotated under us; restart from 0 + start_offset = 0 + fh.seek(start_offset) + data = fh.read(end - start_offset) + return data.decode(errors="replace"), end + + +def log_size(path: str) -> int: + try: + return os.path.getsize(path) + except FileNotFoundError: + return 0 + + +def print_summary(records: list[dict]) -> None: + if not records: + return + print("\n=== summary ===") + for rec in records: + before = rec.get("before") or {} + after = rec.get("after") or {} + waste_before = before.get("waste_pct") + waste_after = after.get("waste_pct") + waste_str = ( + f"{waste_before:.2f}% -> {waste_after:.2f}%" + if waste_before is not None and waste_after is not None + else "waste n/a" + ) + call_ms = rec.get("call_ms", 0.0) + + shards = rec.get("shards") or [] + committed_drop = ( + (before.get("committed", 0) - after.get("committed", 0)) if before and after else 0 + ) + + # Aggregate per-phase CPU=ms across shards (max), preferring the new + # cpu= field over the wall-clock took= field. "-" for unparsed. 
+ phase_keys = ("census", "plan", "evacuate", "verify") + phase_max: dict[str, float | None] = {k: None for k in phase_keys} + any_cpu = False + any_wall = False + for s in shards: + cpu_map = s.get("phase_cpu_ms") or {} + wall_map = s.get("phase_ms") or {} + for k in phase_keys: + v = cpu_map.get(k, wall_map.get(k)) + if k in cpu_map: + any_cpu = True + elif k in wall_map: + any_wall = True + if v is None: + continue + phase_max[k] = v if phase_max.get(k) is None else max(phase_max[k], v) + + if any_cpu or any_wall: + phase_str = " ".join( + f"{k[:4]}={'-' if phase_max[k] is None else f'{phase_max[k]:.1f}ms'}" + for k in phase_keys + ) + label = "cpu" if any_cpu else "wall" + phases_part = f"phases.{label}[{phase_str}]" + else: + phases_part = "phases[no transitions]" + + print( + f"cycle {rec['cycle']:>3}: waste {waste_str} call={call_ms:.1f}ms " + f"{phases_part} " + f"committed_drop={committed_drop:>+13,}B" + ) + + total_cpu_ms = sum(r.get("call_ms", 0.0) for r in records) + first_ts = records[0].get("ts_ns") + last_ts = records[-1].get("ts_ns") + wall_ms = (last_ts - first_ts) / 1_000_000.0 if first_ts and last_ts else 0.0 + first_waste = (records[0].get("before") or {}).get("waste_pct") + last_waste = (records[-1].get("after") or {}).get("waste_pct") + waste_summary = ( + f"{first_waste:.2f}% -> {last_waste:.2f}%" + if first_waste is not None and last_waste is not None + else "n/a" + ) + print( + f"\ntotals: {len(records)} cycles " + f"defrag_cpu={total_cpu_ms:.1f}ms wall={wall_ms / 1000.0:.2f}s " + f"waste {waste_summary}" + ) + print("(phases.cpu = server CPU per phase; phases.wall = older log without cpu= field)") + + +async def main(args: argparse.Namespace) -> None: + client = aioredis.Redis(host=args.host, port=args.port, db=0) + output_path = args.output or os.path.join("runs", f"{args.label}.jsonl") + + out_dir = os.path.dirname(output_path) + if out_dir: + os.makedirs(out_dir, exist_ok=True) + + records: list[dict] = [] + last_waste: float | None = None + stall_armed = False + stop_reason = f"reached --cycles={args.cycles}" + print(f"writing run: {output_path}") + with open(output_path, "w") as out_fh: + for cycle in range(args.cycles): + log_offset = log_size(args.log_path) + ts_ns = time.time_ns() + + before = await get_arena(client) + call_start_ns = time.monotonic_ns() + reply = await run_defragment(client) + call_ms = (time.monotonic_ns() - call_start_ns) / 1_000_000.0 + after = await get_arena(client) + + # Tiny pause so any tail-end log line lands before we read. 
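+ # 50ms is a heuristic, not a guarantee: if the log flushes later than
+ # this, parse_log_delta simply matches fewer lines and the per-shard
+ # fields are absent from this record rather than wrong.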
+ await asyncio.sleep(0.05) + log_text, _new_offset = read_log_delta(args.log_path, log_offset) + shards = parse_log_delta(log_text) + + record = { + "label": args.label, + "cycle": cycle, + "ts_ns": ts_ns, + "call_ms": call_ms, + "before": before, + "after": after, + "defrag_reply": reply, + "shards": [shards[k] for k in sorted(shards)], + } + out_fh.write(json.dumps(record) + "\n") + out_fh.flush() + records.append(record) + + committed_drop = record_committed_drop(record) + waste_before = (before or {}).get("waste_pct") + waste_after = (after or {}).get("waste_pct") + waste_str = ( + f"{waste_before:.2f}% -> {waste_after:.2f}%" + if waste_before is not None and waste_after is not None + else "n/a" + ) + print(f"cycle={cycle} waste={waste_str} committed_drop={committed_drop:,}B") + + current_waste = (after or {}).get("waste_pct") + last_waste = current_waste if current_waste is not None else last_waste + if ( + args.target_waste is not None + and current_waste is not None + and current_waste <= args.target_waste + ): + stop_reason = ( + f"reached target waste {args.target_waste:.2f}% at " + f"{current_waste:.2f}% (cycle {cycle})" + ) + break + + if committed_drop > 0 or record_waste_drop(record) > 0: + stall_armed = True + + reason = stall_reason(records, args, stall_armed) + if reason is not None: + stop_reason = f"{reason} (cycle {cycle})" + break + + if cycle + 1 < args.cycles: + await asyncio.sleep(args.sleep_ms / 1000.0) + + try: + if ( + stop_reason.startswith("reached --cycles=") + and args.target_waste is not None + and last_waste is not None + ): + stop_reason = ( + f"reached --cycles={args.cycles} (target {args.target_waste:.2f}% " + f"not reached, final {last_waste:.2f}%)" + ) + print(f"\nstopped: {stop_reason}") + print_summary(records) + await flushall(client) + print("\ncleanup: FLUSHALL") + finally: + await client.aclose() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "label", choices=("old", "new"), help="run label used for runs/